summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVitaly Buka <vitalybuka@google.com>2023-12-21 22:27:30 -0800
committerVitaly Buka <vitalybuka@google.com>2023-12-21 22:27:30 -0800
commitfb23e77533bc871212ad624908649a1544f8dc4e (patch)
tree4491d9b3c09e507382436c7ab42f2ac1da555d0b
parent4ddac7a55b15bcfc4e3a867af6677d32b6eec4c5 (diff)
parentf25bcfbb291e3d213eaded5cfa84d3d4e7002052 (diff)
Created using spr 1.3.4 [skip ci]
-rw-r--r--.github/workflows/libcxx-build-and-test.yaml2
-rw-r--r--clang/lib/Sema/SemaStmt.cpp7
-rw-r--r--clang/test/Sema/switch-default.c28
-rw-r--r--clang/test/Sema/switch-default.cpp53
-rw-r--r--compiler-rt/lib/hwasan/hwasan_report.cpp4
-rw-r--r--compiler-rt/test/hwasan/TestCases/stack-uas.c2
-rw-r--r--compiler-rt/test/hwasan/TestCases/strip_path_prefix.c5
-rw-r--r--lld/ELF/InputSection.cpp22
-rw-r--r--lld/test/ELF/debug-dead-reloc-32.s11
-rw-r--r--lld/test/ELF/debug-dead-reloc.s24
-rw-r--r--llvm/docs/llvm-objdump.1209
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp13
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp53
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp5
-rw-r--r--llvm/lib/Target/X86/X86InstrAMX.td16
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td446
-rw-r--r--llvm/lib/Target/X86/X86InstrArithmetic.td16
-rw-r--r--llvm/lib/Target/X86/X86InstrMisc.td28
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td462
-rw-r--r--llvm/lib/Target/X86/X86InstrSystem.td17
-rw-r--r--llvm/lib/Target/X86/X86InstrTBM.td4
-rw-r--r--llvm/lib/Target/X86/X86InstrUtils.td20
-rw-r--r--llvm/lib/Target/X86/X86InstrXOP.td28
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-addrmode.ll85
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll8300
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll131
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll2395
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrssd.txt6
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrssq.txt6
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrussd.txt6
-rw-r--r--llvm/test/MC/Disassembler/X86/apx/wrussq.txt6
-rw-r--r--llvm/test/MC/X86/apx/wrssd-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrssd-intel.s5
-rw-r--r--llvm/test/MC/X86/apx/wrssq-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrssq-intel.s5
-rw-r--r--llvm/test/MC/X86/apx/wrussd-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrussd-intel.s5
-rw-r--r--llvm/test/MC/X86/apx/wrussq-att.s8
-rw-r--r--llvm/test/MC/X86/apx/wrussq-intel.s5
-rwxr-xr-xllvm/utils/chunk-print-before-all.py4
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/VectorOps.td42
-rw-r--r--mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp172
-rw-r--r--mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir30
-rw-r--r--mlir/test/Dialect/Vector/ops.mlir10
-rw-r--r--mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir8
-rw-r--r--mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir67
-rw-r--r--mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir74
-rw-r--r--mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir89
-rw-r--r--mlir/test/Target/SPIRV/physical-storage-buffer.mlir48
50 files changed, 12058 insertions, 949 deletions
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 370cf830a60c..25e8c8c1ef21 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -185,7 +185,7 @@ jobs:
std_modules: 'OFF'
# Use a larger machine for MSAN to avoid timeout and memory allocation issues.
- config: 'generic-msan'
- machine: libcxx-runners-32-set
+ machine: libcxx-runners-8-set
std_modules: 'OFF'
runs-on: ${{ matrix.machine }}
steps:
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 63348d27a8c9..f0b03db69084 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -1271,6 +1271,9 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
bool CaseListIsErroneous = false;
+ // FIXME: We'd better diagnose missing or duplicate default labels even
+ // in the dependent case, because default labels themselves are never
+ // dependent.
for (SwitchCase *SC = SS->getSwitchCaseList(); SC && !HasDependentValue;
SC = SC->getNextSwitchCase()) {
@@ -1327,9 +1330,6 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
}
}
- if (!TheDefaultStmt)
- Diag(SwitchLoc, diag::warn_switch_default);
-
if (!HasDependentValue) {
// If we don't have a default statement, check whether the
// condition is constant.
@@ -1344,6 +1344,7 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch,
assert(!HasConstantCond ||
(ConstantCondValue.getBitWidth() == CondWidth &&
ConstantCondValue.isSigned() == CondIsSigned));
+ Diag(SwitchLoc, diag::warn_switch_default);
}
bool ShouldCheckConstantCond = HasConstantCond;
diff --git a/clang/test/Sema/switch-default.c b/clang/test/Sema/switch-default.c
deleted file mode 100644
index 342a97ee68b1..000000000000
--- a/clang/test/Sema/switch-default.c
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -Wswitch-default %s
-
-int f1(int a) {
- switch (a) { // expected-warning {{'switch' missing 'default' label}}
- case 1: a++; break;
- case 2: a += 2; break;
- }
- return a;
-}
-
-int f2(int a) {
- switch (a) { // no-warning
- default:
- ;
- }
- return a;
-}
-
-// Warn even completely covered Enum cases(GCC compatibility).
-enum E { A, B };
-enum E check_enum(enum E e) {
- switch (e) { // expected-warning {{'switch' missing 'default' label}}
- case A: break;
- case B: break;
- }
- return e;
-}
-
diff --git a/clang/test/Sema/switch-default.cpp b/clang/test/Sema/switch-default.cpp
new file mode 100644
index 000000000000..32d03dae8827
--- /dev/null
+++ b/clang/test/Sema/switch-default.cpp
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wswitch-default %s
+
+int f1(int a) {
+ switch (a) { // expected-warning {{'switch' missing 'default' label}}
+ case 1: a++; break;
+ case 2: a += 2; break;
+ }
+ return a;
+}
+
+int f2(int a) {
+ switch (a) { // no-warning
+ default:
+ ;
+ }
+ return a;
+}
+
+// Warn even for completely covered enum cases (GCC compatibility).
+enum E { A, B };
+enum E check_enum(enum E e) {
+ switch (e) { // expected-warning {{'switch' missing 'default' label}}
+ case A: break;
+ case B: break;
+ }
+ return e;
+}
+
+template<typename Index>
+int t1(Index i)
+{
+ switch (i) { // expected-warning {{'switch' missing 'default' label}}
+ case 0: return 0;
+ case 1: return 1;
+ }
+ return 0;
+}
+
+template<typename Index>
+int t2(Index i)
+{
+ switch (i) { // no-warning
+ case 0: return 0;
+ case 1: return 1;
+ default: return 2;
+ }
+ return 0;
+}
+
+int main() {
+ return t1(1); // expected-note {{in instantiation of function template specialization 't1<int>' requested here}}
+}
+
diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp
index 3d192d149753..5b3a99adfea7 100644
--- a/compiler-rt/lib/hwasan/hwasan_report.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_report.cpp
@@ -228,6 +228,8 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa,
tag_t obj_tag = base_tag ^ local.tag_offset;
if (obj_tag != addr_tag)
continue;
+ // Guess top bits of local variable from the faulting address, because
+ // we only store bits 4-19 of FP (bits 0-3 are guaranteed to be zero).
uptr local_beg = (fp + local.frame_offset) |
(untagged_addr & ~(uptr(kRecordFPModulus) - 1));
uptr local_end = local_beg + local.size;
@@ -777,8 +779,6 @@ void BaseReport::PrintAddressDescription() const {
// Check stack first. If the address is on the stack of a live thread, we
// know it cannot be a heap / global overflow.
for (const auto &sa : allocations.stack) {
- // TODO(fmayer): figure out how to distinguish use-after-return and
- // stack-buffer-overflow.
Printf("%s", d.Error());
Printf("\nCause: stack tag-mismatch\n");
Printf("%s", d.Location());
diff --git a/compiler-rt/test/hwasan/TestCases/stack-uas.c b/compiler-rt/test/hwasan/TestCases/stack-uas.c
index d38eedb87fc2..53a7054c1c43 100644
--- a/compiler-rt/test/hwasan/TestCases/stack-uas.c
+++ b/compiler-rt/test/hwasan/TestCases/stack-uas.c
@@ -69,6 +69,8 @@ int main() {
// CHECK: Cause: stack tag-mismatch
// CHECK: is located in stack of thread
// CHECK: Potentially referenced stack objects:
+ // CHECK: Cause: use-after-scope
+ // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte region
// CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uas.c:
// CHECK: Memory tags around the buggy address
diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c
index 5844749a6d97..5e41d03b683e 100644
--- a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c
+++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c
@@ -1,8 +1,11 @@
-// RUN: %clang_hwasan -O0 %s -o %t && %env_hwasan_opts=strip_path_prefix='"%S/"' not %run %t 2>&1 | FileCheck %s
+// RUN: %clang_hwasan -O0 %s -o %t && %env_hwasan_opts=strip_path_prefix=/TestCases/ not %run %t 2>&1 | FileCheck %s
// Stack histories currently are not recorded on x86.
// XFAIL: target=x86_64{{.*}}
+// FIXME: Android does not see a variable.
+// XFAIL: android
+
#include <assert.h>
#include <sanitizer/hwasan_interface.h>
#include <stdio.h>
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index 81468a20dfb5..5dfb57fda432 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -898,10 +898,16 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
const TargetInfo &target = *elf::target;
const auto emachine = config->emachine;
const bool isDebug = isDebugSection(*this);
- const bool isDebugLocOrRanges =
- isDebug && (name == ".debug_loc" || name == ".debug_ranges");
const bool isDebugLine = isDebug && name == ".debug_line";
std::optional<uint64_t> tombstone;
+ if (isDebug) {
+ if (name == ".debug_loc" || name == ".debug_ranges")
+ tombstone = 1;
+ else if (name == ".debug_names")
+ tombstone = UINT64_MAX; // tombstone value
+ else
+ tombstone = 0;
+ }
for (const auto &patAndValue : llvm::reverse(config->deadRelocInNonAlloc))
if (patAndValue.first.match(this->name)) {
tombstone = patAndValue.second;
@@ -946,8 +952,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
return;
}
- if (tombstone ||
- (isDebug && (type == target.symbolicRel || expr == R_DTPREL))) {
+ if (tombstone && (expr == R_ABS || expr == R_DTPREL)) {
// Resolve relocations in .debug_* referencing (discarded symbols or ICF
// folded section symbols) to a tombstone value. Resolving to addend is
// unsatisfactory because the result address range may collide with a
@@ -978,8 +983,13 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) {
// value. Enable -1 in a future release.
if (!sym.getOutputSection() || (ds && ds->folded && !isDebugLine)) {
// If -z dead-reloc-in-nonalloc= is specified, respect it.
- const uint64_t value = tombstone ? SignExtend64<bits>(*tombstone)
- : (isDebugLocOrRanges ? 1 : 0);
+ uint64_t value = SignExtend64<bits>(*tombstone);
+ // For a 32-bit local TU reference in .debug_names, X86_64::relocate
+ // requires that the unsigned value for R_X86_64_32 is truncated to
+ // 32-bit. Other 64-bit targets don't discern signed/unsigned 32-bit
+ // absolute relocations and do not need this change.
+ if (emachine == EM_X86_64 && type == R_X86_64_32)
+ value = static_cast<uint32_t>(value);
target.relocateNoSym(bufLoc, type, value);
continue;
}
diff --git a/lld/test/ELF/debug-dead-reloc-32.s b/lld/test/ELF/debug-dead-reloc-32.s
index 99335b44f51c..1aa43148689e 100644
--- a/lld/test/ELF/debug-dead-reloc-32.s
+++ b/lld/test/ELF/debug-dead-reloc-32.s
@@ -13,6 +13,8 @@
# CHECK-NEXT: 0000 01000000
# CHECK-NEXT: Contents of section .debug_addr:
# CHECK-NEXT: 0000 00000000
+# CHECK-NEXT: Contents of section .debug_names:
+# CHECK-NEXT: 0000 ffffffff
## -z dead-reloc-in-nonalloc= can override the tombstone value.
# RUN: ld.lld -z dead-reloc-in-nonalloc=.debug_loc=42 -z dead-reloc-in-nonalloc=.debug_addr=0xfffffffffffffffe %t.o -o %t1
@@ -38,3 +40,12 @@
## Resolved to UINT32_C(0), with the addend ignored.
.section .debug_addr
.long .text.1+8
+
+.section .debug_info,"eG",@progbits,5657452045627120676,comdat
+.Ltu_begin0:
+
+.section .debug_names
+## .debug_names may reference a local type unit defined in a COMDAT .debug_info
+## section (-g -gpubnames -fdebug-types-section). If the referenced section is
+## non-prevailing, resolve to UINT32_MAX.
+.long .Ltu_begin0
diff --git a/lld/test/ELF/debug-dead-reloc.s b/lld/test/ELF/debug-dead-reloc.s
index cfa41e00eab0..1a8823737ed5 100644
--- a/lld/test/ELF/debug-dead-reloc.s
+++ b/lld/test/ELF/debug-dead-reloc.s
@@ -21,9 +21,12 @@
# CHECK: Contents of section .debug_addr:
# CHECK-NEXT: 0000 {{.*}}00 00000000 {{.*}}00 00000000
# CHECK-NEXT: 0010 00000000 00000000 {{.*}}00 00000000
+# CHECK: Contents of section .debug_names:
+# CHECK-NEXT: 0000 00000000 00000000 00000000 ffffffff .
+# CHECK-NEXT: 0010 ffffffff ffffffff .
# CHECK: Contents of section .debug_foo:
-# CHECK-NEXT: 0000 00000000 00000000 08000000 00000000
-# CHECK-NEXT: 0010 00000000 00000000 08000000 00000000
+# CHECK-NEXT: 0000 00000000 00000000 00000000 00000000
+# CHECK-NEXT: 0010 00000000 00000000 00000000 00000000
# REL: Relocations [
# REL-NEXT: .rela.text {
@@ -43,6 +46,12 @@
# REL-NEXT: 0x10 R_X86_64_NONE - 0x18
# REL-NEXT: 0x18 R_X86_64_64 group 0x20
# REL-NEXT: }
+# REL-NEXT: .rela.debug_names {
+# REL-NEXT: 0x0 R_X86_64_32 .debug_info 0x0
+# REL-NEXT: 0x4 R_X86_64_64 .debug_info 0x0
+# REL-NEXT: 0xC R_X86_64_NONE - 0x0
+# REL-NEXT: 0x10 R_X86_64_NONE - 0x0
+# REL-NEXT: }
# REL-NEXT: .rela.debug_foo {
# REL-NEXT: 0x0 R_X86_64_NONE - 0x8
# REL-NEXT: 0x8 R_X86_64_NONE - 0x8
@@ -82,6 +91,17 @@ group:
## resolved to the prevailing copy.
.quad group+32
+.section .debug_info,"G",@progbits,5657452045627120676,comdat
+.Ltu_begin0:
+
+.section .debug_names
+## .debug_names may reference a local type unit defined in a COMDAT .debug_info
+## section (-g -gpubnames -fdebug-types-section). If the referenced section is
+## non-prevailing, resolve to UINT32_MAX.
+.long .Ltu_begin0
+## ... or UINT64_MAX for DWARF64.
+.quad .Ltu_begin0
+
.section .debug_foo
.quad .text.1+8
diff --git a/llvm/docs/llvm-objdump.1 b/llvm/docs/llvm-objdump.1
deleted file mode 100644
index 42dcc7367659..000000000000
--- a/llvm/docs/llvm-objdump.1
+++ /dev/null
@@ -1,209 +0,0 @@
-.\" Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-.\" See https://llvm.org/LICENSE.txt for license information.
-.\" SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-.\"
-.Dd December 19, 2018
-.Dt LLVM-OBJDUMP 1
-.Os
-.Sh NAME
-.Nm llvm-objdump
-.Nd LLVM object file dumper
-.Sh SYNOPSIS
-.Nm llvm-objdump
-.Op Ar options
-.Ar objfile ...
-.Sh DESCRIPTION
-.Nm
-prints the contents of object files and final linked images named on the
-command line.
-If no file name is specified,
-.Nm
-will attempt to read from
-.Pa a.out .
-If
-.Pa -
-is used as a file name,
-.Nm
-will process a file on its standard input stream.
-.Nm
-accepts many of the same command line arguments as GNU objdump.
-.Sh OPTIONS
-.Ss General Options
-.Bl -tag -width indent
-.It Fl -aarch64-neon-syntax Ns = Ns Ar value
-Choose style of NEON code to emit from AArch64 backend.
-.Ar value
-may be one of:
-.Bl -tag -width indent
-.It generic
-Generic NEON assembly
-.It apple
-Apple-style NEON assembly
-.El
-.It Fl -arch Ns = Ns Ar value
-Choose architecture(s) from a Mach-O file to dump
-.It Fl -arch-name Ns = Ns ar arch
-Target arch to disassemble for.
-See
-.Fl -version
-for available targets.
-.It Fl -bind
-Display mach-o binding info.
-.It Fl -color
-Use colored syntax highlighting.
-Default autodetect.
-.It Fl -disassemble
-Display assembler mnemonics for machine instructions.
-.It Fl -disassemble-all
-Display assembler mnemonics for the machine instruction in all sections.
-.It Fl -dsym Ns = Ns Ar file
-Use
-.Ar file
-for debug info.
-.It Fl -dwarf Ns = Ns Ar sections
-Dump of dwarf debug sections.
-.Bl -tag -width indent
-.It frames
-.Dv .debug_frame
-.El
-.It Fl -exports-trie
-Display mach-o exported symbols.
-.It Fl -fault-map-section
-Display contents of faultmap section.
-.It Fl -filter-print-funcs Ns = Ns Ar functions
-Only print IR for functions whose name match
-.Ar functions
-for all print-[before|after][-all] options.
-.It Fl -full-leading-addr
-Print full leading address.
-.It Fl g
-Print line information from debug info if available.
-.It Fl h , -headers , -section-headers
-Display summaries of the headers for each section.
-.It Fl -help
-Display available options.
-Use
-.Fl -help-hidden
-for more.
-.It Fl -lazy-bind
-Display mach-o lazy binding info.
-.It Fl -line-numbers
-Display source line numbers with disassembly.
-Implies disassemble object.
-.It Fl -macho
-Use MachO specific object file parser.
-.It Fl -mattr Ns = Ns Ar attribute ...
-Target specific attributes.
-.It Fl -mcpu Ns = Ns Ar CPU
-Target a specific cpu type.
-Use
-.Fl mcpu Ns = Ns help
-for details.
-.It Fl -no-leading-addr
-Print no leading address.
-.It Fl -no-leading-headers
-Print no leading headers.
-.It Fl -no-show-raw-insn
-When disassembling instructions, do not print the instruction bytes.
-.It Fl -offloading
-Display the content of the LLVM offloading section.
-.It Fl -prefix Ns = Ns Ar PREFIX
-When disassembling, add
-.Ar PREFIX
-to absolute paths.
-.It Fl -prefix-strip Ns = Ns Ar LEVEL
-When disassembling, strip out
-.Ar LEVEL
-initial directories from absolute paths. This option has no effect without
-.Fl -prefix Ns = Ns PREFIX .
-.It Fl -print-imm-hex
-Use hex format for immediate values.
-.It Fl -private-header
-Display only the first format specific file header.
-.It Fl -private-headers
-Display format specific file headers.
-.It Fl r
-Display the relocation entries in the file.
-.It Fl -raw-clang-ast
-Dump the raw binary contents of the clang AST section.
-.It Fl -rebase
-Display mach-o rebasing info.
-.It Fl -reverse-iterate
-Reverse iterate.
-.It Fl s
-Display the content of each section.
-.It Fl -section Ns = Ns Ar section
-Operate on the specified sections only.
-With
-.Fl -macho
-dump segment,section.
-.It Fl -source
-Display source inline with disassembly.
-Implies disassemble object.
-.It Fl -start-address Ns = Ns Ar address
-Disassemble beginning at
-.Ar address .
-.It Fl -stop-address Ns = Ns Ar address
-Stop disassembly at
-.Ar address .
-.It Fl t
-Display the symbol table.
-.It Fl -triple Ns = Ns Ar triple
-Target triple to disassemble for.
-See
-.Fl -version
-for available targets.
-.It Fl -unwind-info
-Display unwind information.
-.It Fl -version
-Display the version of this program.
-.It Fl -weak-bind
-Display mach-o weak binding info.
-.It Fl -x86-asm-syntax Ns = Ns Ar syntax
-Choose style of code to emit from X86 backend.
-.Bl -tag -width indent
-.It att
-Emit AT&T-style assembly.
-.It intel
-Emit Intel-style assembly.
-.El
-.El
-.Ss Mach-O Options
-There are a number of options specific to the Mach-O format.
-These are used in combination with the
-.Fl -macho
-option.
-.Bl -tag -width indent
-.It Fl -archive-headers
-Print archive headers for Mach-O archives.
-.It Fl -archive-member-offsets
-Print the offset to each archive member for Mach-O archives.
-Requires
-.Fl -macho
-and
-.Fl -archive-headers .
-.It Fl -data-in-code
-Print the data in code table for Mach-O objects.
-.It Fl -dis-symname Ns = Ns Ar symbol
-Disassemble just
-.Ar symbol 's
-instructions.
-.It Fl -dylib-id
-Print the shared library's id for the dylib Mach-O file.
-.It Fl -dylibs-used
-Print the shared libraries used for linked Mach-O files.
-.It Fl -indirect-symbols
-Print indirect symbol table for Mach-O objects.
-.It Fl -info-plist
-Print the info plist section as strings for Mach-O objects.
-.It Fl -link-opt-hints
-Print the linker optimization hints for Mach-O objects.
-.It Fl -no-symbolic-operands
-do not symbolic operands when disassembling.
-.It Fl -non-verbose
-Print the info for Mach-O objects in non-verbose or numeric form.
-.It Fl -objc-meta-data
-Print the Objective-C runtime meta data for Mach-O files.
-.It Fl -universal-headers
-Print Mach-O universal headers.
-.El
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 0917d0e4eb3e..a483b8028fda 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5352,6 +5352,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::FEXP:
case ISD::FEXP2:
case ISD::FEXP10:
+ case ISD::FCANONICALIZE:
Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
Results.push_back(
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 6d85e1fb5fbf..bc9678c13971 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4094,20 +4094,7 @@ AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode");
- case AArch64::LDRBroX:
case AArch64::LDRBBroX:
- case AArch64::LDRSBXroX:
- case AArch64::LDRSBWroX:
- case AArch64::LDRHroX:
- case AArch64::LDRHHroX:
- case AArch64::LDRSHXroX:
- case AArch64::LDRSHWroX:
- case AArch64::LDRWroX:
- case AArch64::LDRSroX:
- case AArch64::LDRSWroX:
- case AArch64::LDRDroX:
- case AArch64::LDRXroX:
- case AArch64::LDRQroX:
return MI.getOperand(4);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index aa7a4bc23536..b435b3ce03e7 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -180,7 +180,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Scan the instruction list to find a register assigned with a const
// value that can be combined with the current instruction (a load or store)
- // using base addressing with writeback. Scan backwards.
+ // using base addressing with writeback. Scan forwards.
MachineBasicBlock::iterator
findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
unsigned &Offset);
@@ -221,7 +221,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
- // Find and merge a index ldr/st instruction into a base ld/st instruction.
+ // Find and merge an index ldr/st instruction into a base ld/st instruction.
bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
@@ -511,34 +511,8 @@ static unsigned getBaseAddressOpcode(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no base address equivalent!");
- case AArch64::LDRBroX:
- return AArch64::LDRBui;
case AArch64::LDRBBroX:
return AArch64::LDRBBui;
- case AArch64::LDRSBXroX:
- return AArch64::LDRSBXui;
- case AArch64::LDRSBWroX:
- return AArch64::LDRSBWui;
- case AArch64::LDRHroX:
- return AArch64::LDRHui;
- case AArch64::LDRHHroX:
- return AArch64::LDRHHui;
- case AArch64::LDRSHXroX:
- return AArch64::LDRSHXui;
- case AArch64::LDRSHWroX:
- return AArch64::LDRSHWui;
- case AArch64::LDRWroX:
- return AArch64::LDRWui;
- case AArch64::LDRSroX:
- return AArch64::LDRSui;
- case AArch64::LDRSWroX:
- return AArch64::LDRSWui;
- case AArch64::LDRDroX:
- return AArch64::LDRDui;
- case AArch64::LDRXroX:
- return AArch64::LDRXui;
- case AArch64::LDRQroX:
- return AArch64::LDRQui;
}
}
@@ -790,31 +764,10 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
default:
return false;
// Scaled instructions.
- // TODO: Add more index address stores.
- case AArch64::LDRBroX:
+ // TODO: Add more index address loads/stores.
case AArch64::LDRBBroX:
- case AArch64::LDRSBXroX:
- case AArch64::LDRSBWroX:
Scale = 1;
return true;
- case AArch64::LDRHroX:
- case AArch64::LDRHHroX:
- case AArch64::LDRSHXroX:
- case AArch64::LDRSHWroX:
- Scale = 2;
- return true;
- case AArch64::LDRWroX:
- case AArch64::LDRSroX:
- case AArch64::LDRSWroX:
- Scale = 4;
- return true;
- case AArch64::LDRDroX:
- case AArch64::LDRXroX:
- Scale = 8;
- return true;
- case AArch64::LDRQroX:
- Scale = 16;
- return true;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4f4bc45e49b4..fc119aa61d01 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,10 +540,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::f16, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
-
- setOperationAction(
- {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
- MVT::f16, Promote);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
// F16 - VOP2 Actions.
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 2dbb3e5ee316..71e6a44c9d8e 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -91,19 +91,19 @@ let Predicates = [HasAMXINT8, In64BitMode] in {
def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8XD;
+ VEX, VVVV, T8XD;
def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8XS;
+ VEX, VVVV, T8XS;
def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8PD;
+ VEX, VVVV, T8PD;
def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
- VEX_4V, T8PS;
+ VEX, VVVV, T8PS;
}
// Pseduo instruction for RA.
@@ -163,7 +163,7 @@ let Predicates = [HasAMXBF16, In64BitMode] in {
def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- []>, VEX_4V, T8XS;
+ []>, VEX, VVVV, T8XS;
// Pseduo instruction for RA.
let isPseudo = true, Constraints = "$src4 = $dst" in
@@ -193,7 +193,7 @@ let Predicates = [HasAMXFP16, In64BitMode] in {
def TDPFP16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tdpfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, VEX_4V, T8XD;
+ []>, VEX, VVVV, T8XD;
}
// Pseduo instruction for RA.
@@ -222,11 +222,11 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in {
def TCMMIMFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, T8PD, VEX_4V;
+ []>, T8PD, VEX, VVVV;
def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst),
(ins TILE:$src1, TILE:$src2, TILE:$src3),
"tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}",
- []>, VEX_4V, WIG, T8PS;
+ []>, VEX, VVVV, WIG, T8PS;
} // Constraints = "$src1 = $dst"
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index e1fe2b680b96..86619dfd07bc 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -378,7 +378,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
(iPTR imm))>,
- AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
+ AVX512AIi8Base, EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in
defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
@@ -389,7 +389,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT (From.LdFrag addr:$src2)),
- (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
+ (iPTR imm))>, AVX512AIi8Base, EVEX, VVVV,
EVEX_CD8<From.EltSize, From.CD8TupleForm>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -647,14 +647,14 @@ def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>,
- EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
+ EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
timm:$src3))]>,
- EVEX_4V, EVEX_CD8<32, CD8VT1>,
+ EVEX, VVVV, EVEX_CD8<32, CD8VT1>,
Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
@@ -1593,7 +1593,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, _.RC:$src3)), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched]>;
let mayLoad = 1 in
defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst),
@@ -1601,7 +1601,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
(_.VT (_.LdFrag addr:$src3)))), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1616,7 +1616,7 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src2,
IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
- AVX5128IBase, EVEX_4V, EVEX_B,
+ AVX5128IBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1715,14 +1715,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched]>;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
(_.LdFrag addr:$src3))), 1>,
- EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
@@ -1735,7 +1735,7 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>,
- AVX5128IBase, EVEX_4V, EVEX_B,
+ AVX5128IBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -1800,35 +1800,35 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ []>, EVEX, VVVV, EVEX_K, Sched<[sched]>;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_KZ, Sched<[sched]>;
+ []>, EVEX, VVVV, EVEX_KZ, Sched<[sched]>;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ []>, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
+ []>, EVEX, VVVV, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
+ []>, EVEX, VVVV, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1841,7 +1841,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
- EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
@@ -1849,7 +1849,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src1, ${src2}", _.BroadcastStr, "}"), []>,
- EVEX_4V, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
@@ -1857,7 +1857,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>,
- EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1921,7 +1921,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"$cc, $src2, $src1", "$src1, $src2, $cc",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ timm:$cc)>, EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -1931,7 +1931,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
(OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
timm:$cc),
(OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
- timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ timm:$cc)>, EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
@@ -1944,7 +1944,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
timm:$cc),
(OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
timm:$cc)>,
- EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>;
let isCodeGenOnly = 1 in {
let isCommutable = 1 in
@@ -1955,7 +1955,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
timm:$cc))]>,
- EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
@@ -1964,7 +1964,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
timm:$cc))]>,
- EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -1991,24 +1991,24 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr,
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, Sched<[sched]>;
+ []>, EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1, hasSideEffects = 0 in
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCommutable = IsCommutable, hasSideEffects = 0 in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, Sched<[sched]>;
+ []>, EVEX, VVVV, EVEX_K, Sched<[sched]>;
let mayLoad = 1, hasSideEffects = 0 in
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX, VVVV, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr,
@@ -2020,14 +2020,14 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- []>, EVEX_4V, EVEX_K, EVEX_B,
+ []>, EVEX, VVVV, EVEX_K, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -2113,7 +2113,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
[(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
cond)))]>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix,
@@ -2123,7 +2123,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)),
cond)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
@@ -2135,7 +2135,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
(_.KVT (Frag_su:$cc (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
cond))))]>,
- EVEX_4V, EVEX_K, Sched<[sched]>;
+ EVEX, VVVV, EVEX_K, Sched<[sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
u8imm:$cc),
@@ -2148,7 +2148,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.VT (_.LdFrag addr:$src2)),
cond))))]>,
- EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def : Pat<(_.KVT (Frag:$cc (_.LdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
@@ -2177,7 +2177,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.BroadcastLdFrag addr:$src2),
cond)))]>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, u8imm:$cc),
@@ -2189,7 +2189,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag,
(_.VT _.RC:$src1),
(_.BroadcastLdFrag addr:$src2),
cond))))]>,
- EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def : Pat<(_.KVT (Frag:$cc (_.BroadcastLdFrag addr:$src2),
(_.VT _.RC:$src1), cond)),
@@ -2405,11 +2405,11 @@ multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _,
}
defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>,
- AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ AVX512PDIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>,
- AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ AVX512PSIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VCMPPH : avx512_vcmp<SchedWriteFCmp, avx512vl_f16_info, HasFP16>,
- AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA;
+ AVX512PSIi8Base, EVEX, VVVV, EVEX_CD8<16, CD8VF>, TA;
// Patterns to select fp compares with load as first operand.
let Predicates = [HasAVX512] in {
@@ -2812,13 +2812,13 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, bit IsCommutable,
Predicate prdW = HasAVX512> {
defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- sched, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+ sched, HasDQI, IsCommutable>, VEX, VVVV, VEX_L, PD;
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- sched, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+ sched, prdW, IsCommutable>, VEX, VVVV, VEX_L, PS;
defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, REX_W, PD;
+ sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, PD;
defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, REX_W, PS;
+ sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, PS;
}
// TODO - do we need a X86SchedWriteWidths::KMASK type?
@@ -2869,7 +2869,7 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst),
(ins Src.KRC:$src1, Src.KRC:$src2),
"kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V, VEX_L, Sched<[sched]>;
+ VEX, VVVV, VEX_L, Sched<[sched]>;
def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)),
(!cast<Instruction>(NAME#rr) Src.KRC:$src2, Src.KRC:$src1)>;
@@ -3897,7 +3897,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
- _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>;
+ _.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>;
let Predicates = [prd] in {
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -3906,7 +3906,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
_.ImmAllZerosV)))],
- _.ExeDomain>, EVEX_4V, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>;
+ _.ExeDomain>, EVEX, VVVV, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -3915,7 +3915,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT _.RC:$src0))))],
- _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
+ _.ExeDomain>, EVEX, VVVV, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
@@ -4286,7 +4286,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSHZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovsh\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, T_MAP5XS, EVEX_4V, VEX_LIG,
+ []>, T_MAP5XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
@@ -4295,20 +4295,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src1, VR128X:$src2),
"vmovsh\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- []>, T_MAP5XS, EVEX_K, EVEX_4V, VEX_LIG,
+ []>, T_MAP5XS, EVEX_K, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSHZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f16x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
"vmovsh\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- []>, EVEX_KZ, T_MAP5XS, EVEX_4V, VEX_LIG,
+ []>, EVEX_KZ, T_MAP5XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
}
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, EVEX_4V, VEX_LIG,
+ []>, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
@@ -4317,20 +4317,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src1, VR128X:$src2),
"vmovss\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ []>, EVEX_K, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
"vmovss\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- []>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ []>, EVEX_KZ, XS, EVEX, VVVV, VEX_LIG,
Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XD, EVEX_4V, VEX_LIG, REX_W,
+ []>, XD, EVEX, VVVV, VEX_LIG, REX_W,
Sched<[SchedWriteFShuffle.XMM]>;
let Constraints = "$src0 = $dst" in
@@ -4339,7 +4339,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src1, VR128X:$src2),
"vmovsd\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ []>, EVEX_K, XD, EVEX, VVVV, VEX_LIG,
REX_W, Sched<[SchedWriteFShuffle.XMM]>;
def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
@@ -4347,7 +4347,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
VR128X:$src2),
"vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- []>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ []>, EVEX_KZ, XD, EVEX, VVVV, VEX_LIG,
REX_W, Sched<[SchedWriteFShuffle.XMM]>;
}
@@ -4665,14 +4665,14 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V,
+ IsCommutable, IsCommutable>, AVX512BIBase, EVEX, VVVV,
Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>,
- AVX512BIBase, EVEX_4V,
+ AVX512BIBase, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4686,7 +4686,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(_.BroadcastLdFrag addr:$src2)))>,
- AVX512BIBase, EVEX_4V, EVEX_B,
+ AVX512BIBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4796,13 +4796,13 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
IsCommutable>,
- AVX512BIBase, EVEX_4V, Sched<[sched]>;
+ AVX512BIBase, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(_Src.LdFrag addr:$src2)))>,
- AVX512BIBase, EVEX_4V,
+ AVX512BIBase, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
@@ -4812,7 +4812,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
"$src1, ${src2}"#_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>,
- AVX512BIBase, EVEX_4V, EVEX_B,
+ AVX512BIBase, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4884,7 +4884,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"#_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>,
- EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -4899,13 +4899,13 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
IsCommutable, IsCommutable>,
- EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>;
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(_Src.LdFrag addr:$src2)))>,
- EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_CD8<_Src.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5445,18 +5445,18 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator
sched.PS.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode,
sched.PS.Scl>,
- XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
sched.PD.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode,
sched.PD.Scl>,
- XD, REX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasFP16] in
defm SHZ : avx512_fp_scalar<opc, OpcodeStr#"sh", f16x_info, OpNode,
VecNode, sched.PH.Scl, IsCommutable>,
avx512_fp_scalar_round<opc, OpcodeStr#"sh", f16x_info, RndNode,
sched.PH.Scl>,
- T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>;
+ T_MAP5XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>;
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -5465,16 +5465,16 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
VecNode, SaeNode, sched.PS.Scl, IsCommutable,
NAME#"SS">,
- XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
VecNode, SaeNode, sched.PD.Scl, IsCommutable,
NAME#"SD">,
- XD, REX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+ XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasFP16] in {
defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode,
VecNode, SaeNode, sched.PH.Scl, IsCommutable,
NAME#"SH">,
- T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>,
+ T_MAP5XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>,
NotEVEX2VEXConvertible;
}
}
@@ -5516,29 +5516,29 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
SchedWriteFCmp.Scl, "VMINCSS">, XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
SchedWriteFCmp.Scl, "VMINCSD">, XD,
- REX_W, EVEX_4V, VEX_LIG,
+ REX_W, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
SchedWriteFCmp.Scl, "VMAXCSS">, XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
SchedWriteFCmp.Scl, "VMAXCSD">, XD,
- REX_W, EVEX_4V, VEX_LIG,
+ REX_W, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc,
SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
NotEVEX2VEXConvertible;
defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc,
SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC,
NotEVEX2VEXConvertible;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -5556,21 +5556,21 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), ClobberConstraint,
- IsCommutable, IsKCommutable, IsKCommutable>, EVEX_4V, Sched<[sched]>;
+ IsCommutable, IsKCommutable, IsKCommutable>, EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
(MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2)),
- ClobberConstraint>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ ClobberConstraint>, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#suffix,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
(MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
- ClobberConstraint>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ ClobberConstraint>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
}
@@ -5586,7 +5586,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc))),
0, 0, 0, vselect_mask, ClobberConstraint>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
@@ -5597,7 +5597,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
- EVEX_4V, EVEX_B, Sched<[sched]>;
+ EVEX, VVVV, EVEX_B, Sched<[sched]>;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -5734,18 +5734,18 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5773,7 +5773,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr
EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>;
defm SHZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f16x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"sh", f16x_info, X86scalefsRnd, sched.Scl>,
- EVEX_4V, T_MAP6PD, EVEX_CD8<16, CD8VT1>;
+ EVEX, VVVV, T_MAP6PD, EVEX_CD8<16, CD8VT1>;
}
defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>,
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>,
@@ -5784,11 +5784,11 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr
defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD;
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD;
defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info,
X86scalefsRnd, sched.Scl>,
- EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8PD;
+ EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8PD;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
@@ -5825,13 +5825,13 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(null_frag), (null_frag), 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(null_frag), (null_frag)>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5844,7 +5844,7 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(null_frag), (null_frag)>,
- EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX_B, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5944,13 +5944,13 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>,
- AVX512BIBase, EVEX_4V, Sched<[sched]>;
+ AVX512BIBase, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>,
AVX512BIBase,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6035,22 +6035,22 @@ multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli,
SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,
SchedWriteVecShiftImm>,
avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai,
SchedWriteVecShiftImm, 1>,
avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,
- SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V;
+ SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;
defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl,
SchedWriteVecShift>;
@@ -6097,13 +6097,13 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>,
- AVX5128IBase, EVEX_4V, Sched<[sched]>;
+ AVX5128IBase, EVEX, VVVV, Sched<[sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(_.VT (_.LdFrag addr:$src2))))>,
- AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX5128IBase, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6116,7 +6116,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>,
- AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX5128IBase, EVEX_B, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6374,14 +6374,14 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(Ctrl.VT Ctrl.RC:$src2)))>,
- T8PD, EVEX_4V, Sched<[sched]>;
+ T8PD, EVEX, VVVV, Sched<[sched]>;
defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (Ctrl.LdFrag addr:$src2))))>,
- T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ T8PD, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
@@ -6390,7 +6390,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>,
- T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ T8PD, EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6469,13 +6469,13 @@ def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>,
- Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX, VVVV;
let isCommutable = 1 in
def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
"vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))]>,
- Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+ Sched<[SchedWriteFShuffle.XMM]>, EVEX, VVVV;
//===----------------------------------------------------------------------===//
// VMOVHPS/PD VMOVLPS Instructions
@@ -6494,7 +6494,7 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr,
(OpNode _.RC:$src1,
(_.VT (bitconvert
(v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))]>,
- Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>, EVEX_4V;
+ Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>, EVEX, VVVV;
}
// No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in
@@ -6565,14 +6565,14 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -6583,7 +6583,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))),
(MaskOpNode _.RC:$src2,
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
@@ -6598,7 +6598,7 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -6660,14 +6660,14 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -6679,7 +6679,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1)),
(_.VT (MaskOpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1)), 1, 0>, EVEX_4V, EVEX_B,
+ _.RC:$src1)), 1, 0>, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
@@ -6695,7 +6695,7 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
- 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -6756,7 +6756,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(null_frag),
(_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns this helps tablegen's duplicate pattern detection.
@@ -6765,7 +6765,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
// Pattern is 312 order so that the load is in a different place from the
@@ -6778,7 +6778,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator
_.RC:$src1, _.RC:$src2)),
(_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2)), 1, 0>,
- EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
+ EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
@@ -6793,7 +6793,7 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(null_frag),
(_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
- 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
+ 1, 1>, EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -6851,33 +6851,33 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- EVEX_4V, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
+ EVEX, VVVV, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
let mayLoad = 1 in
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- EVEX_4V, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold,
+ EVEX, VVVV, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold,
SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>;
let isCodeGenOnly = 1, isCommutable = 1 in {
def r : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX_4V, SIMD_EXC;
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX, VVVV, SIMD_EXC;
def m : AVX512<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold,
- SchedWriteFMA.Scl.ReadAfterFold]>, EVEX_4V, SIMD_EXC;
+ SchedWriteFMA.Scl.ReadAfterFold]>, EVEX, VVVV, SIMD_EXC;
let Uses = [MXCSR] in
def rb : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -6885,7 +6885,7 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
!strconcat(OpcodeStr,
"\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"),
!if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC,
- Sched<[SchedWriteFMA.Scl]>, EVEX_4V;
+ Sched<[SchedWriteFMA.Scl]>, EVEX, VVVV;
}// isCodeGenOnly = 1
}// Constraints = "$src1 = $dst"
}
@@ -7189,13 +7189,13 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- T8PD, EVEX_4V, Sched<[sched]>;
+ T8PD, EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
- T8PD, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ T8PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -7205,7 +7205,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
_.RC:$src1)>,
- T8PD, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
+ T8PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
}
@@ -7247,19 +7247,19 @@ let ExeDomain = DstVT.ExeDomain, Uses = _Uses,
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ EVEX, VVVV, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2),
!strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>,
- EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ EVEX, VVVV, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, x86memop:$src2),
@@ -7267,7 +7267,7 @@ let ExeDomain = DstVT.ExeDomain, Uses = _Uses,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
(ld_frag addr:$src2)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst,
@@ -7287,7 +7287,7 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
(i32 timm:$rc)))]>,
- EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
+ EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}",
(!cast<Instruction>(NAME#"rrb_Int") DstVT.RC:$dst,
DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), 0, "att">;
@@ -7646,25 +7646,25 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2)))>,
- EVEX_4V, VEX_LIG, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.ScalarIntMemFrags addr:$src2)))>,
- EVEX_4V, VEX_LIG,
+ EVEX, VVVV, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX_4V, VEX_LIG, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -7678,7 +7678,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeSAE (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2)))>,
- EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>;
}
// Scalar Conversion with rounding control (RC)
@@ -7691,7 +7691,7 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>,
- EVEX_4V, VEX_LIG, Sched<[sched]>,
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_trunc<bits<8> opc, string OpcodeStr,
@@ -9129,12 +9129,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
- EVEX_4V, VEX_LIG, Sched<[sched]>;
+ EVEX, VVVV, VEX_LIG, Sched<[sched]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG,
+ (_.ScalarIntMemFrags addr:$src2))>, EVEX, VVVV, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -9250,16 +9250,16 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX_4V;
+ sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX, VVVV;
defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE,
- sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD, EVEX_4V;
+ sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD, EVEX, VVVV;
}
multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
let Predicates = [HasFP16] in
defm SHZ : avx512_fp28_s<opc, OpcodeStr#"sh", f16x_info, OpNode, OpNodeSAE, sched>,
- EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX_4V;
+ EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX, VVVV;
}
let Predicates = [HasERI] in {
@@ -9501,11 +9501,11 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
defm SHZ : avx512_sqrt_scalar<opc, OpcodeStr#"sh", sched.PH.Scl, f16x_info, NAME#"SH", HasFP16>,
- EVEX_CD8<16, CD8VT1>, EVEX_4V, T_MAP5XS;
+ EVEX_CD8<16, CD8VT1>, EVEX, VVVV, T_MAP5XS;
defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
- EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
+ EVEX_CD8<32, CD8VT1>, EVEX, VVVV, XS;
defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
- EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, REX_W;
+ EVEX_CD8<64, CD8VT1>, EVEX, VVVV, XD, REX_W;
}
defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>,
@@ -9569,17 +9569,17 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
let Predicates = [HasFP16] in
defm VRNDSCALESHZ : avx512_rndscale_scalar<0x0A, "vrndscalesh",
SchedWriteFRnd.Scl, f16x_info>,
- AVX512PSIi8Base, TA, EVEX_4V,
+ AVX512PSIi8Base, TA, EVEX, VVVV,
EVEX_CD8<16, CD8VT1>;
defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless",
SchedWriteFRnd.Scl, f32x_info>,
- AVX512AIi8Base, EVEX_4V, VEX_LIG,
+ AVX512AIi8Base, EVEX, VVVV, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd",
SchedWriteFRnd.Scl, f64x_info>,
- REX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG,
+ REX_W, AVX512AIi8Base, EVEX, VVVV, VEX_LIG,
EVEX_CD8<64, CD8VT1>;
multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
@@ -10773,13 +10773,13 @@ multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> {
let Predicates = [Pred] in {
defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.ZMM, DestInfo.info512,
- SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
+ SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX, VVVV;
}
let Predicates = [Pred, HasVLX] in {
defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.XMM, DestInfo.info128,
- SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
+ SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX, VVVV;
defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.YMM, DestInfo.info256,
- SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
+ SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX, VVVV;
}
}
@@ -10835,38 +10835,38 @@ defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
0x50, X86VRange, X86VRangeSAE,
SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VREDUCESH: avx512_common_fp_sae_scalar_imm<"vreducesh", f16x_info,
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasFP16>,
- AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info,
0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasFP16>,
- AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>;
+ AVX512PSIi8Base, TA, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>;
multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
@@ -10920,13 +10920,13 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched
}
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256,
- avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256,
- avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256,
- avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256,
- avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
multiclass avx512_valign<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
@@ -10962,15 +10962,15 @@ multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
defm Z : avx512_valign<0x03, OpcodeStr, sched.ZMM, _.info512>,
- AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>,
- AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_V128;
// We can't really override the 256-bit version so change it back to unset.
let EVEX2VEXOverride = ? in
defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>,
- AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ AVX512AIi8Base, EVEX, VVVV, EVEX_V256;
}
}
@@ -11427,7 +11427,7 @@ multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
+ EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -11437,7 +11437,7 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
- (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V,
+ (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX, VVVV,
Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>;
@@ -11452,7 +11452,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
- EVEX_4V, TAPD, Sched<[WriteVecInsert]>;
+ EVEX, VVVV, TAPD, Sched<[WriteVecInsert]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
_.ScalarLdFrag, imm>, TAPD;
@@ -11501,7 +11501,7 @@ multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_FP>{
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp,
SchedWriteFShuffle>,
EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
- AVX512AIi8Base, EVEX_4V;
+ AVX512AIi8Base, EVEX, VVVV;
}
defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, PS;
@@ -11543,10 +11543,10 @@ multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
}
defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
SchedWriteShuffle, HasBWI>,
- AVX512PDIi8Base, EVEX_4V, WIG;
+ AVX512PDIi8Base, EVEX, VVVV, WIG;
defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
SchedWriteShuffle, HasBWI>,
- AVX512PDIi8Base, EVEX_4V, WIG;
+ AVX512PDIi8Base, EVEX, VVVV, WIG;
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
string OpcodeStr, X86FoldableSchedWrite sched,
@@ -11584,7 +11584,7 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
}
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
- SchedWritePSADBW, HasBWI>, EVEX_4V, WIG;
+ SchedWritePSADBW, HasBWI>, EVEX, VVVV, WIG;
// Transforms to swizzle an immediate to enable better matching when
// memory operand isn't in the right place.
@@ -11659,7 +11659,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
(i8 timm:$src4)), 1, 1>,
- AVX512AIi8Base, EVEX_4V, Sched<[sched]>;
+ AVX512AIi8Base, EVEX, VVVV, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -11667,7 +11667,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
(i8 timm:$src4)), 1, 0>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
@@ -11677,7 +11677,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.VT (_.BroadcastLdFrag addr:$src3)),
(i8 timm:$src4)), 1, 0>, EVEX_B,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}// Constraints = "$src1 = $dst"
@@ -12002,23 +12002,23 @@ multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched,
let Predicates = [HasAVX512] in
defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM,
_Vec.info512, _Tbl.info512>, AVX512AIi8Base,
- EVEX_4V, EVEX_V512;
+ EVEX, VVVV, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM,
_Vec.info128, _Tbl.info128>, AVX512AIi8Base,
- EVEX_4V, EVEX_V128;
+ EVEX, VVVV, EVEX_V128;
defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM,
_Vec.info256, _Tbl.info256>, AVX512AIi8Base,
- EVEX_4V, EVEX_V256;
+ EVEX, VVVV, EVEX_V256;
}
}
defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f32x_info, v4i32x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>;
defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm",
SchedWriteFAdd.Scl, f64x_info, v2i64x_info>,
- AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W;
+ AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W;
defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info,
@@ -12165,17 +12165,17 @@ multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> {
defm Z128 : AESI_binop_rm_int<Op, OpStr,
!cast<Intrinsic>(IntPrefix),
loadv2i64, 0, VR128X, i128mem>,
- EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, WIG;
+ EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V128, WIG;
defm Z256 : AESI_binop_rm_int<Op, OpStr,
!cast<Intrinsic>(IntPrefix#"_256"),
loadv4i64, 0, VR256X, i256mem>,
- EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, WIG;
+ EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V256, WIG;
}
let Predicates = [HasAVX512, HasVAES] in
defm Z : AESI_binop_rm_int<Op, OpStr,
!cast<Intrinsic>(IntPrefix#"_512"),
loadv8i64, 0, VR512, i512mem>,
- EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, WIG;
+ EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V512, WIG;
}
defm VAESENC : avx512_vaes<0xDC, "vaesenc", "int_x86_aesni_aesenc">;
@@ -12189,14 +12189,14 @@ defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast">
let Predicates = [HasAVX512, HasVPCLMULQDQ] in
defm VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>,
- EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, WIG;
+ EVEX, VVVV, EVEX_V512, EVEX_CD8<64, CD8VF>, WIG;
let Predicates = [HasVLX, HasVPCLMULQDQ] in {
defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>,
- EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, WIG;
+ EVEX, VVVV, EVEX_V128, EVEX_CD8<64, CD8VF>, WIG;
defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64,
- int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256,
+ int_x86_pclmulqdq_256>, EVEX, VVVV, EVEX_V256,
EVEX_CD8<64, CD8VF>, WIG;
}
@@ -12217,13 +12217,13 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
- T8PD, EVEX_4V, Sched<[sched]>;
+ T8PD, EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.LdFrag addr:$src3))))>,
- T8PD, EVEX_4V,
+ T8PD, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -12239,7 +12239,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
- T8PD, EVEX_4V, EVEX_B,
+ T8PD, EVEX, VVVV, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12284,9 +12284,9 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
REX_W, EVEX_CD8<16, CD8VF>;
defm D : avx512_common_3Op_imm8<Prefix#"d", avx512vl_i32_info, dqOp,
- OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;
defm Q : avx512_common_3Op_imm8<Prefix#"q", avx512vl_i64_info, dqOp, OpNode,
- sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W;
+ sched, HasVBMI2>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;
}
// Concat & Shift
@@ -12321,13 +12321,13 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
(VTI.VT (OpNode VTI.RC:$src1,
VTI.RC:$src2, VTI.RC:$src3)),
IsCommutable, IsCommutable>,
- EVEX_4V, T8PD, Sched<[sched]>;
+ EVEX, VVVV, T8PD, Sched<[sched]>;
defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
(VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.LdFrag addr:$src3))))>,
- EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
+ EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8PD,
Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -12336,7 +12336,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
"$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
- EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
+ EVEX, VVVV, EVEX_CD8<32, CD8VF>, EVEX_B,
T8PD, Sched<[sched.Folded, sched.ReadAfterFold,
sched.ReadAfterFold]>;
}
@@ -12406,7 +12406,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
(X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
(VTI.VT VTI.RC:$src2)),
(X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
- (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
+ (VTI.VT VTI.RC:$src2))>, EVEX, VVVV, T8PD,
Sched<[sched]>;
defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
(ins VTI.RC:$src1, VTI.MemOp:$src2),
@@ -12416,7 +12416,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
(VTI.VT (VTI.LdFrag addr:$src2))),
(X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
(VTI.VT (VTI.LdFrag addr:$src2)))>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
+ EVEX, VVVV, EVEX_CD8<8, CD8VF>, T8PD,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12483,10 +12483,10 @@ multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
X86GF2P8affineinvqb, SchedWriteVecIMul>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
+ EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
X86GF2P8affineqb, SchedWriteVecIMul>,
- EVEX_4V, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
+ EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base;
//===----------------------------------------------------------------------===//
@@ -12498,25 +12498,25 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle,
defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"v4fmaddps", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"v4fnmaddps", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fmaddss", "$src3, $src2", "$src2, $src3",
- []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info,
(outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3),
"v4fnmaddss", "$src3, $src2", "$src2, $src3",
- []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>,
+ []>, VEX_LIG, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VF>,
Sched<[SchedWriteFMA.Scl.Folded]>;
}
@@ -12529,13 +12529,13 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt,
defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"vp4dpwssd", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"vp4dpwssds", "$src3, $src2", "$src2, $src3",
- []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>,
+ []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>,
Sched<[SchedWriteFMA.ZMM.Folded]>;
}
@@ -12558,7 +12558,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT _.RC:$src2)))]>,
- EVEX_4V, T8XD, Sched<[sched]>;
+ EVEX, VVVV, T8XD, Sched<[sched]>;
def rm : I<0x68, MRMSrcMem,
(outs _.KRPC:$dst),
@@ -12567,7 +12567,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
- EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, T8XD, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : I<0x68, MRMSrcMem,
@@ -12577,7 +12577,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf
", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>,
- EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ EVEX, VVVV, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -12744,13 +12744,13 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins src_v.RC:$src2, src_v.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, src_v.RC:$src2, src_v.RC:$src3))>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins src_v.RC:$src2, src_v.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
- (src_v.LdFrag addr:$src3)))>, EVEX_4V,
+ (src_v.LdFrag addr:$src3)))>, EVEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -12760,7 +12760,7 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat("$src2, ${src3}", _.BroadcastStr),
(_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
(src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
- EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX_B, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // Constraints = "$src1 = $dst"
@@ -13390,17 +13390,17 @@ let Constraints = "@earlyclobber $dst, $src1 = $dst" in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX_4V;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX, VVVV;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX_4V;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX, VVVV;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr),
- (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX_4V;
+ (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX, VVVV;
}
} // Constraints = "@earlyclobber $dst, $src1 = $dst"
@@ -13411,7 +13411,7 @@ multiclass avx512_cfmaop_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
(_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc)))>,
- EVEX_4V, EVEX_B, EVEX_RC;
+ EVEX, VVVV, EVEX_B, EVEX_RC;
}
@@ -13504,12 +13504,12 @@ multiclass avx512_cfmbinop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNod
let Uses = [MXCSR] in {
defm VFMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd, 1>,
- T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+ T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV;
defm VFCMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd, 0>,
- T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V;
+ T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV;
defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd, 1>,
- T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+ T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV;
defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>,
- T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V;
+ T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV;
}
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 87feb7dc3b4e..6f4b69c9b5c9 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -1095,23 +1095,23 @@ let Predicates = [HasBMI, NoEGPR] in {
def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
- VEX_4V, Sched<[sched]>;
+ VEX, VVVV, Sched<[sched]>;
def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
(X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
- VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ VEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
let Predicates = [HasBMI, HasEGPR, In64BitMode] in {
def rr_EVEX : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>,
- EVEX_4V, Sched<[sched]>;
+ EVEX, VVVV, Sched<[sched]>;
def rm_EVEX : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
(X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>,
- EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -1141,12 +1141,12 @@ let hasSideEffects = 0 in {
let Predicates = [HasBMI2, NoEGPR] in {
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>;
+ []>, T8XD, VEX, VVVV, Sched<[WriteIMulH, sched]>;
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, VEX_4V,
+ []>, T8XD, VEX, VVVV,
Sched<[WriteIMulHLd, sched.Folded,
// Memory operand.
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -1165,11 +1165,11 @@ let Predicates = [HasBMI2, NoEGPR] in {
let Predicates = [HasBMI2, HasEGPR, In64BitMode] in
def rr#_EVEX : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, EVEX_4V, Sched<[WriteIMulH, sched]>;
+ []>, T8XD, EVEX, VVVV, Sched<[WriteIMulH, sched]>;
let Predicates = [HasBMI2, HasEGPR, In64BitMode], mayLoad = 1 in
def rm#_EVEX : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
- []>, T8XD, EVEX_4V,
+ []>, T8XD, EVEX, VVVV,
Sched<[WriteIMulHLd, sched.Folded,
// Memory operand.
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 3006969b76d6..a6bed74b5bef 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -165,10 +165,10 @@ def POPP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "popp\t$reg", []>,
REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>;
def POP2: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins),
"pop2\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4PS;
def POP2P: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins),
"pop2p\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS, REX_W;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4PS, REX_W;
} // mayLoad, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in
@@ -186,10 +186,10 @@ def PUSHP64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "pushp\t$reg", []>,
REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>;
def PUSH2: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2),
"push2\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4PS;
def PUSH2P: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2),
"push2p\t{$reg2, $reg1|$reg1, $reg2}",
- []>, EVEX_4V, EVEX_B, T_MAP4PS, REX_W;
+ []>, EVEX, VVVV, EVEX_B, T_MAP4PS, REX_W;
} // mayStore, SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in {
def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>,
@@ -1218,11 +1218,11 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
let hasSideEffects = 0 in {
def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[sched]>;
+ T8PS, VEX, VVVV, Sched<[sched]>;
let mayLoad = 1 in
def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>,
- T8PS, VEX_4V, Sched<[sched.Folded]>;
+ T8PS, VEX, VVVV, Sched<[sched.Folded]>;
}
}
@@ -1371,11 +1371,11 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
- VEX_4V, Sched<[WriteALU]>;
+ VEX, VVVV, Sched<[WriteALU]>;
def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
- VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+ VEX, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
}
let Predicates = [HasBMI2, NoEGPR] in {
@@ -1419,12 +1419,12 @@ multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>,
- XOP_4V, XOPA;
+ XOP, VVVV, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>,
- XOP_4V, XOPA;
+ XOP, VVVV, XOPA;
}
let Defs = [EFLAGS] in {
@@ -1435,12 +1435,12 @@ let Defs = [EFLAGS] in {
multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA;
+ [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP, VVVV, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
[(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>,
- XOP_4V, XOPA;
+ XOP, VVVV, XOPA;
}
defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>;
@@ -1670,14 +1670,14 @@ def CMPCCXADDmr32 : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst),
"cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
[(set GR32:$dst, (X86cmpccxadd addr:$dstsrc2,
GR32:$dstsrc1, GR32:$src3, timm:$cond))]>,
- VEX_4V, T8PD, Sched<[WriteXCHG]>;
+ VEX, VVVV, T8PD, Sched<[WriteXCHG]>;
def CMPCCXADDmr64 : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst),
(ins GR64:$dstsrc1, i64mem:$dstsrc2, GR64:$src3, ccode:$cond),
"cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}",
[(set GR64:$dst, (X86cmpccxadd addr:$dstsrc2,
GR64:$dstsrc1, GR64:$src3, timm:$cond))]>,
- VEX_4V, REX_W, T8PD, Sched<[WriteXCHG]>;
+ VEX, VVVV, REX_W, T8PD, Sched<[WriteXCHG]>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 2e1560a9f7dc..d91c7740aae3 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -215,7 +215,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
let Predicates = [UseAVX, OptForSize] in
defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
- VEX_4V, VEX_LIG, WIG;
+ VEX, VVVV, VEX_LIG, WIG;
def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -683,7 +683,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
let Predicates = [UseAVX] in
defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Constraints = "$src1 = $dst" in
defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
@@ -823,14 +823,14 @@ let Predicates = [UseAVX] in {
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
- VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
+ VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
let isCommutable = 1 in
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
- VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
+ VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
}
let Constraints = "$src1 = $dst" in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
@@ -941,16 +941,16 @@ defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
// where appropriate to do so.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
- WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ WriteCvtI2SS, SSEPackedSingle>, XS, VEX, VVVV,
VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
- WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ WriteCvtI2SS, SSEPackedSingle>, XS, VEX, VVVV,
REX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
- WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ WriteCvtI2SD, SSEPackedDouble>, XD, VEX, VVVV,
VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
- WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ WriteCvtI2SD, SSEPackedDouble>, XD, VEX, VVVV,
REX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1
@@ -1090,16 +1090,16 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
- XS, VEX_4V, VEX_LIG, SIMD_EXC;
+ XS, VEX, VVVV, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
- XS, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
+ XS, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
- XD, VEX_4V, VEX_LIG;
+ XD, VEX, VVVV, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
- XD, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
+ XD, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
@@ -1289,13 +1289,13 @@ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V, VEX_LIG, WIG,
+ VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- XD, VEX_4V, VEX_LIG, WIG,
+ XD, VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}
@@ -1321,14 +1321,14 @@ def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
- XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
+ XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
- XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
+ XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
@@ -1353,13 +1353,13 @@ let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- XS, VEX_4V, VEX_LIG, WIG,
+ XS, VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- XS, VEX_4V, VEX_LIG, WIG,
+ XS, VEX, VVVV, VEX_LIG, WIG,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0
@@ -1386,13 +1386,13 @@ let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_LIG, WIG,
+ []>, XS, VEX, VVVV, VEX_LIG, WIG,
Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, VEX_LIG, WIG, Requires<[HasAVX]>,
+ []>, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
@@ -1860,12 +1860,12 @@ let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
- XS, VEX_4V, VEX_LIG, WIG;
+ XS, VEX, VVVV, VEX_LIG, WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
- XD, VEX_4V, VEX_LIG, WIG;
+ XD, VEX, VVVV, VEX_LIG, WIG;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
@@ -1979,16 +1979,16 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, WIG;
+ SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX, VVVV, WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, WIG;
+ SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX, VVVV, WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, WIG;
+ SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX, VVVV, VEX_L, WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, WIG;
+ SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
@@ -2076,19 +2076,19 @@ let Predicates = [HasAVX, NoVLX] in {
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
- PS, VEX_4V, WIG;
+ PS, VEX, VVVV, WIG;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
- PS, VEX_4V, VEX_L, WIG;
+ PS, VEX, VVVV, VEX_L, WIG;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
- PD, VEX_4V, WIG;
+ PD, VEX, VVVV, WIG;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
- PD, VEX_4V, VEX_L, WIG;
+ PD, VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2126,29 +2126,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX, VVVV, WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX, VVVV, WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX, VVVV, WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, WIG;
+ SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX, VVVV, WIG;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX, VVVV, VEX_L, WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX, VVVV, VEX_L, WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX, VVVV, VEX_L, WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
+ SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX, VVVV, VEX_L, WIG;
}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
@@ -2276,7 +2276,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
VR128, load, i128mem, sched.XMM,
- IsCommutable, 0>, VEX_4V, WIG;
+ IsCommutable, 0>, VEX, VVVV, WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2285,7 +2285,7 @@ let Constraints = "$src1 = $dst" in
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
OpVT256, VR256, load, i256mem, sched.YMM,
- IsCommutable, 0>, VEX_4V, VEX_L, WIG;
+ IsCommutable, 0>, VEX, VVVV, VEX_L, WIG;
}
// These are ordered here for pattern ordering requirements with the fp versions
@@ -2312,19 +2312,19 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
- [], [], 0>, PS, VEX_4V, VEX_L, WIG;
+ [], [], 0>, PS, VEX, VVVV, VEX_L, WIG;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
- [], [], 0>, PD, VEX_4V, VEX_L, WIG;
+ [], [], 0>, PD, VEX, VVVV, VEX_L, WIG;
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
- [], [], 0>, PS, VEX_4V, WIG;
+ [], [], 0>, PS, VEX, VVVV, WIG;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
- [], [], 0>, PD, VEX_4V, WIG;
+ [], [], 0>, PD, VEX, VVVV, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -2636,17 +2636,17 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
- SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, WIG;
+ SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX, VVVV, WIG;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64,
- SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, WIG;
+ SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX, VVVV, WIG;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, VR256, v8f32, f256mem, loadv8f32,
- SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, WIG;
+ SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX, VVVV, VEX_L, WIG;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, VR256, v4f64, f256mem, loadv4f64,
- SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, WIG;
+ SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -2665,10 +2665,10 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperat
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
- XS, VEX_4V, VEX_LIG, WIG;
+ XS, VEX, VVVV, VEX_LIG, WIG;
defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
- XD, VEX_4V, VEX_LIG, WIG;
+ XD, VEX, VVVV, VEX_LIG, WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
@@ -2687,10 +2687,10 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, WIG;
+ SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX, VVVV, VEX_LIG, WIG;
defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, WIG;
+ SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX, VVVV, VEX_LIG, WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
@@ -3020,7 +3020,7 @@ multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
AVXTarget>,
- XS, VEX_4V, VEX_LIG, WIG;
+ XS, VEX, VVVV, VEX_LIG, WIG;
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -3029,7 +3029,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNod
ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
- XS, VEX_4V, VEX_LIG, WIG;
+ XS, VEX, VVVV, VEX_LIG, WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -3038,7 +3038,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNod
sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
- XD, VEX_4V, VEX_LIG, WIG;
+ XD, VEX, VVVV, VEX_LIG, WIG;
}
// Square root.
@@ -3537,12 +3537,12 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
load, i128mem, SchedWriteVecIMul.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
VR256, load, i256mem, SchedWriteVecIMul.YMM,
- 0>, VEX_4V, VEX_L, WIG;
+ 0>, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
memop, i128mem, SchedWriteVecIMul.XMM>;
@@ -3550,11 +3550,11 @@ defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
load, i128mem, SchedWritePSADBW.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
load, i256mem, SchedWritePSADBW.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
memop, i128mem, SchedWritePSADBW.XMM>;
@@ -3604,11 +3604,11 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
- DstVT128, SrcVT, load, 0>, VEX_4V, WIG;
+ DstVT128, SrcVT, load, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
- DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
+ DstVT256, SrcVT, load, 0>, VEX, VVVV, VEX_L,
WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
@@ -3631,11 +3631,11 @@ multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR128, v16i8, sched.XMM, 0>, VEX_4V, WIG;
+ VR128, v16i8, sched.XMM, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
VR256, v32i8, sched.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
sched.XMM>;
@@ -3821,33 +3821,33 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -3892,61 +3892,61 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
i128mem, SchedWriteShuffle.XMM, load, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
i256mem, SchedWriteShuffle.YMM, load, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -4014,7 +4014,7 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
// Insert
let Predicates = [HasAVX, NoBWI] in
-defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, WIG;
+defm VPINSRW : sse2_pinsrw<0>, PD, VEX, VVVV, WIG;
let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;
@@ -4563,18 +4563,18 @@ let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
- XD, VEX_4V, WIG;
+ XD, VEX, VVVV, WIG;
defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
- XD, VEX_4V, VEX_L, WIG;
+ XD, VEX, VVVV, VEX_L, WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
- PD, VEX_4V, WIG;
+ PD, VEX, VVVV, WIG;
defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
- PD, VEX_4V, VEX_L, WIG;
+ PD, VEX, VVVV, VEX_L, WIG;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
@@ -4635,23 +4635,23 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG;
+ X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG;
+ X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG;
+ X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG;
+ X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
- X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG;
+ X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
- X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG;
+ X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG;
+ X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG;
+ X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -4806,45 +4806,45 @@ let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
VR128, load, i128mem,
- SchedWriteVarShuffle.XMM, 0>, VEX_4V, WIG;
+ SchedWriteVarShuffle.XMM, 0>, VEX, VVVV, WIG;
defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, load, i128mem,
- SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, load, i128mem,
- SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
+ SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
load, i128mem,
- SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
int_x86_ssse3_psign_b_128,
- SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
+ SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
int_x86_ssse3_psign_w_128,
- SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
+ SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
- SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
+ SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
- SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
+ SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
}
}
@@ -4852,42 +4852,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
VR256, load, i256mem,
- SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteVarShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
v32i8, VR256, load, i256mem,
- SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
VR256, load, i256mem,
- SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
VR256, load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
VR256, load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
load, i256mem,
- SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
- SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
- SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
- SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw,
- SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw,
- SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -4956,10 +4956,10 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
- SchedWriteShuffle.XMM, 0>, VEX_4V, WIG;
+ SchedWriteShuffle.XMM, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
- SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
+ SchedWriteShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
SchedWriteShuffle.XMM>;
@@ -5367,7 +5367,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoBWI] in {
- defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, WIG;
+ defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX, VVVV, WIG;
def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
(VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
GR8:$src2, sub_8bit), timm:$src3)>;
@@ -5398,7 +5398,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoDQI] in
- defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+ defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX, VVVV;
let Constraints = "$src1 = $dst" in
defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
@@ -5424,7 +5424,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX, NoDQI] in
- defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, REX_W;
+ defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX, VVVV, REX_W;
let Constraints = "$src1 = $dst" in
defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
@@ -5459,7 +5459,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Constraints = "$src1 = $dst" in
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
@@ -5638,9 +5638,9 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [UseAVX] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales, 0>,
- VEX_4V, VEX_LIG, WIG, SIMD_EXC;
+ VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
- VEX_4V, VEX_LIG, WIG, SIMD_EXC;
+ VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
}
let Predicates = [UseAVX] in {
@@ -5842,65 +5842,65 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX, NoVLX] in {
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
load, i128mem, SchedWriteVecIMul.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
load, i256mem, SchedWriteVecIMul.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -5927,20 +5927,20 @@ let Constraints = "$src1 = $dst" in {
let Predicates = [HasAVX, NoVLX] in
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
load, i128mem, SchedWritePMULLD.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX] in
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
load, i256mem, SchedWritePMULLD.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Predicates = [HasAVX2] in
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in {
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
@@ -6088,22 +6088,22 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, load, i128mem, 0,
- SchedWriteMPSAD.XMM>, VEX_4V, WIG;
+ SchedWriteMPSAD.XMM>, VEX, VVVV, WIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, load, f128mem, 0,
- SchedWriteDPPS.XMM>, VEX_4V, WIG;
+ SchedWriteDPPS.XMM>, VEX, VVVV, WIG;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
VR128, load, f128mem, 0,
- SchedWriteDPPD.XMM>, VEX_4V, WIG;
+ SchedWriteDPPD.XMM>, VEX, VVVV, WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, load, i256mem, 0,
- SchedWriteDPPS.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteDPPS.YMM>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -6111,7 +6111,7 @@ let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
VR256, load, i256mem, 0,
- SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, WIG;
+ SchedWriteMPSAD.YMM>, VEX, VVVV, VEX_L, WIG;
}
}
@@ -6170,30 +6170,30 @@ let Predicates = [HasAVX] in {
defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, load, f128mem, 0, SSEPackedSingle,
SchedWriteFBlend.XMM, BlendCommuteImm4>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, load, f256mem, 0, SSEPackedSingle,
SchedWriteFBlend.YMM, BlendCommuteImm8>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, load, f128mem, 0, SSEPackedDouble,
SchedWriteFBlend.XMM, BlendCommuteImm2>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, load, f256mem, 0, SSEPackedDouble,
SchedWriteFBlend.YMM, BlendCommuteImm4>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, load, i128mem, 0, SSEPackedInt,
SchedWriteBlend.XMM, BlendCommuteImm8>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
}
let Predicates = [HasAVX2] in {
defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, load, i256mem, 0, SSEPackedInt,
SchedWriteBlend.YMM, BlendCommuteImm8>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
}
// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
@@ -6290,7 +6290,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
- SSEPackedInt>, TAPD, VEX_4V,
+ SSEPackedInt>, TAPD, VEX, VVVV,
Sched<[sched]>;
def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
@@ -6299,7 +6299,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpNode RC:$src3, (mem_frag addr:$src2),
- RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
+ RC:$src1))], SSEPackedInt>, TAPD, VEX, VVVV,
Sched<[sched.Folded, sched.ReadAfterFold,
// x86memop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -6564,12 +6564,12 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX] in
defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
load, i128mem, SchedWriteVecALU.XMM, 0>,
- VEX_4V, WIG;
+ VEX, VVVV, WIG;
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
load, i256mem, SchedWriteVecALU.YMM, 0>,
- VEX_4V, VEX_L, WIG;
+ VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -6832,28 +6832,28 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
- int_x86_aesni_aesenc, load>, VEX_4V, WIG;
+ int_x86_aesni_aesenc, load>, VEX, VVVV, WIG;
defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
- int_x86_aesni_aesenclast, load>, VEX_4V, WIG;
+ int_x86_aesni_aesenclast, load>, VEX, VVVV, WIG;
defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
- int_x86_aesni_aesdec, load>, VEX_4V, WIG;
+ int_x86_aesni_aesdec, load>, VEX, VVVV, WIG;
defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
- int_x86_aesni_aesdeclast, load>, VEX_4V, WIG;
+ int_x86_aesni_aesdeclast, load>, VEX, VVVV, WIG;
}
let Predicates = [NoVLX, HasVAES] in {
defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
int_x86_aesni_aesenc_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
int_x86_aesni_aesenclast_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
int_x86_aesni_aesdec_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
int_x86_aesni_aesdeclast_256, load, 0, VR256,
- i256mem>, VEX_4V, VEX_L, WIG;
+ i256mem>, VEX, VVVV, VEX_L, WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6994,11 +6994,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
- int_x86_pclmulqdq>, VEX_4V, WIG;
+ int_x86_pclmulqdq>, VEX, VVVV, WIG;
let Predicates = [NoVLX, HasVPCLMULQDQ] in
defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
- int_x86_pclmulqdq_256>, VEX_4V, VEX_L, WIG;
+ int_x86_pclmulqdq_256>, VEX, VVVV, VEX_L, WIG;
multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
X86MemOperand MemOp, string Hi, string Lo> {
@@ -7169,11 +7169,11 @@ let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
+ VEX, VVVV, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
+ VEX, VVVV, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
// Immediate transform to help with commuting.
@@ -7212,12 +7212,12 @@ let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f128mem:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}
// To create a 256-bit all ones value, we should produce VCMPTRUEPS
@@ -7315,22 +7315,22 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[schedX.RM]>;
+ VEX, VVVV, Sched<[schedX.RM]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[schedY.RM]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[schedX.MR]>;
+ VEX, VVVV, Sched<[schedX.MR]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[schedY.MR]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}
let ExeDomain = SSEPackedSingle in
@@ -7361,14 +7361,14 @@ multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
VR128:$src2, VR128:$src3)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
(loadv4i32 addr:$src3))))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -7378,14 +7378,14 @@ multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
VR256:$src2, VR256:$src3)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
(loadv8i32 addr:$src3))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
SchedWriteVecIMul.YMM.ReadAfterFold,
SchedWriteVecIMul.YMM.ReadAfterFold]>;
}
@@ -7424,13 +7424,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
+ [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX, VVVV,
Sched<[varsched]>;
def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop_i:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
- (i_vt (load addr:$src2)))))]>, VEX_4V,
+ (i_vt (load addr:$src2)))))]>, VEX, VVVV,
Sched<[varsched.Folded, sched.ReadAfterFold]>;
def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
@@ -7558,14 +7558,14 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
- Sched<[sched]>, VEX_4V;
+ Sched<[sched]>, VEX, VVVV;
def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, VEX, VVVV;
// Pattern to commute if load is in first source.
def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
@@ -7815,7 +7815,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
- Sched<[Sched]>, VEX_4V, VEX_L;
+ Sched<[Sched]>, VEX, VVVV, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, memOp:$src2),
!strconcat(OpcodeStr,
@@ -7823,7 +7823,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr,
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1,
(load addr:$src2))))]>,
- Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
+ Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VVVV, VEX_L;
}
}
@@ -7866,11 +7866,11 @@ let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+ Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
- Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+ Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
let Predicates = [HasAVX2] in {
defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
@@ -7888,12 +7888,12 @@ let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, u8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i128mem:$src2, u8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
+ []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}
let Predicates = [HasAVX2, NoVLX] in {
@@ -7939,22 +7939,22 @@ multiclass avx2_pmovmask<string OpcodeStr,
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[schedX.RM]>;
+ VEX, VVVV, Sched<[schedX.RM]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[schedY.RM]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[schedX.MR]>;
+ VEX, VVVV, Sched<[schedX.MR]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[schedY.MR]>;
+ VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -8012,28 +8012,28 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
- VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM]>;
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1,
(vt128 (load addr:$src2)))))]>,
- VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
+ VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM.Folded,
SchedWriteVarVecShift.XMM.ReadAfterFold]>;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
(vt256 (load addr:$src2)))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}
@@ -8146,10 +8146,10 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
load, i128mem, SchedWriteVecIMul.XMM>,
- VEX_4V, REX_W;
+ VEX, VVVV, REX_W;
defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
load, i256mem, SchedWriteVecIMul.YMM>,
- VEX_4V, VEX_L, REX_W;
+ VEX, VVVV, VEX_L, REX_W;
}
}
@@ -8160,9 +8160,9 @@ defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
- i128mem, SchedWriteVecALU.XMM>, VEX_4V;
+ i128mem, SchedWriteVecALU.XMM>, VEX, VVVV;
defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
- i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L;
+ i256mem, SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
@@ -8183,28 +8183,28 @@ multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
VR128:$src3, VR128:$src1)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
}
def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
(loadv2i64 addr:$src3), VR128:$src1)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
let isCommutable = 1 in {
def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
VR256:$src3, VR256:$src1)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}
def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
(loadv4i64 addr:$src3), VR256:$src1)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}
defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, REX_W, ExplicitVEXPrefix;
@@ -8222,13 +8222,13 @@ multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
- VEX_4V, Sched<[Sched]>;
+ VEX, VVVV, Sched<[Sched]>;
def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, X86memop:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
(MemOpFrag addr:$src3))))]>,
- VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
+ VEX, VVVV, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}
let Predicates = [HasAVXVNNIINT8] in {
@@ -8349,7 +8349,7 @@ def VSHA512RNDS2rr : I<0xcb, MRMSrcReg, (outs VR256:$dst),
"vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR256:$dst,
(int_x86_vsha512rnds2 VR256:$src1, VR256:$src2, VR128:$src3))]>,
- VEX_L, VEX_4V, T8XD, Sched<[WriteVecIMul]>;
+ VEX_L, VEX, VVVV, T8XD, Sched<[WriteVecIMul]>;
}
// FIXME: Is there a better scheduler class for SM3 than WriteVecIMul?
@@ -8361,14 +8361,14 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
[(set VR128:$dst,
(!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
VR128:$src2, VR128:$src3))]>,
- Sched<[WriteVecIMul]>, VEX_4V;
+ Sched<[WriteVecIMul]>, VEX, VVVV;
def rm : I<0xda, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst,
(!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
VR128:$src2, (loadv4i32 addr:$src3)))]>,
- Sched<[WriteVecIMul]>, VEX_4V;
+ Sched<[WriteVecIMul]>, VEX, VVVV;
}
multiclass VSM3RNDS2_Base {
@@ -8391,7 +8391,7 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8PS;
defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8PD;
-defm VSM3RNDS2 : VSM3RNDS2_Base, VEX_4V, TAPD;
+defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TAPD;
// FIXME: Is there a better scheduler class for SM4 than WriteVecIMul?
let Predicates = [HasSM4] in {
@@ -8412,10 +8412,10 @@ let Predicates = [HasSM4] in {
}
}
-defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX_4V;
-defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX_4V;
-defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX_4V;
-defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX_4V;
+defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX, VVVV;
+defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX, VVVV;
+defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX, VVVV;
+defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX, VVVV;
let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
@@ -8426,7 +8426,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR128:$dst,
(v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
VR128:$src1, VR128:$src2, VR128:$src3)))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
def rm : I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
@@ -8434,7 +8434,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR128:$dst,
(v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
- VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
+ VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
let isCommutable = IsCommutable in
def Yrr : I<opc, MRMSrcReg, (outs VR256:$dst),
@@ -8443,7 +8443,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR256:$dst,
(v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
def Yrm : I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
@@ -8451,7 +8451,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
[(set VR256:$dst,
(v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
- VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
+ VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}
defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS;
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index 51972c63bb2c..25db96b31be7 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -520,6 +520,7 @@ let SchedRW = [WriteSystem] in {
} // Defs SSP
} // Uses SSP
+let Predicates = [NoEGPR] in {
def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"wrssd\t{$src, $dst|$dst, $src}",
[(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS;
@@ -532,6 +533,22 @@ let SchedRW = [WriteSystem] in {
def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"wrussq\t{$src, $dst|$dst, $src}",
[(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD;
+}
+
+let Predicates = [HasEGPR, In64BitMode] in {
+ def WRSSD_EVEX : I<0x66, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrssd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PS;
+ def WRSSQ_EVEX : RI<0x66, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrssq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PS;
+ def WRUSSD_EVEX : I<0x65, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrussd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PD;
+ def WRUSSQ_EVEX : RI<0x65, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrussq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PD;
+}
let Defs = [SSP] in {
let Uses = [SSP] in {
diff --git a/llvm/lib/Target/X86/X86InstrTBM.td b/llvm/lib/Target/X86/X86InstrTBM.td
index ed514038a12e..09200f0c1a9f 100644
--- a/llvm/lib/Target/X86/X86InstrTBM.td
+++ b/llvm/lib/Target/X86/X86InstrTBM.td
@@ -46,11 +46,11 @@ multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
let hasSideEffects = 0 in {
def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
- XOP_4V, XOP9, Sched<[Sched]>;
+ XOP, VVVV, XOP9, Sched<[Sched]>;
let mayLoad = 1 in
def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>,
- XOP_4V, XOP9, Sched<[Sched.Folded]>;
+ XOP, VVVV, XOP9, Sched<[Sched.Folded]>;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td
index a94efd2b1a05..dd59a641dfaa 100644
--- a/llvm/lib/Target/X86/X86InstrUtils.td
+++ b/llvm/lib/Target/X86/X86InstrUtils.td
@@ -66,11 +66,10 @@ class VEX { Encoding OpEnc = EncVEX; }
class WIG { bit IgnoresW = 1; }
// Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX.
class VEX_W1X { bit hasREX_W = 1; bit EVEX_W1_VEX_W0 = 1; }
-class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
+class VVVV { bit hasVEX_4V = 1; }
class EVEX { Encoding OpEnc = EncEVEX; }
-class EVEX_4V : EVEX { bit hasVEX_4V = 1; }
class EVEX_K { bit hasEVEX_K = 1; }
class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
class EVEX_B { bit hasEVEX_B = 1; }
@@ -88,7 +87,6 @@ class EVEX_CD8<int esize, CD8VForm form> {
}
class NoCD8 { bits<7> CD8_Scale = 0; }
class XOP { Encoding OpEnc = EncXOP; }
-class XOP_4V : XOP { bit hasVEX_4V = 1; }
class EVEX2VEXOverride<string VEXInstrName> {
string EVEX2VEXOverride = VEXInstrName;
}
@@ -860,7 +858,7 @@ class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: I<o, F, outs, ins, asm, pattern>, T8PD,
- EVEX_4V, Requires<[HasAVX512]>;
+ EVEX, VVVV, Requires<[HasAVX512]>;
class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
@@ -889,29 +887,29 @@ class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: I<o, F, outs, ins, asm, pattern>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
+ VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: I<o, F, outs, ins, asm, pattern>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
+ VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: I<o, F, outs, ins, asm, pattern>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
+ VEX, VVVV, FMASC, Requires<[HasFMA, NoAVX512]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
- VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>;
+ VEX, VVVV, FMASC, Requires<[HasFMA4, NoVLX]>;
class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
- VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
+ VEX, VVVV, FMASC, Requires<[HasFMA4, NoAVX512]>;
class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD,
- VEX_4V, FMASC, Requires<[HasFMA4]>;
+ VEX, VVVV, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
@@ -934,7 +932,7 @@ class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm,
class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern>
: Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD,
- VEX_4V, Requires<[HasXOP]>;
+ VEX, VVVV, Requires<[HasXOP]>;
// X86-64 Instruction templates...
//
diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td
index a62bb2e855c9..1504d77bfb86 100644
--- a/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/llvm/lib/Target/X86/X86InstrXOP.td
@@ -105,7 +105,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)))))]>,
- XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -119,7 +119,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>,
- XOP_4V, REX_W, Sched<[sched]>;
+ XOP, VVVV, REX_W, Sched<[sched]>;
}
let ExeDomain = SSEPackedInt in {
@@ -173,7 +173,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP, VVVV,
Sched<[sched]>;
def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
@@ -181,7 +181,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (load addr:$src2),
- VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ VR128:$src3))]>, XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
let ExeDomain = SSEPackedInt in {
@@ -252,7 +252,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
timm:$cc)))]>,
- XOP_4V, Sched<[sched]>;
+ XOP, VVVV, Sched<[sched]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$cc),
!strconcat("vpcom", Suffix,
@@ -261,7 +261,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (load addr:$src2)),
timm:$cc)))]>,
- XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
def : Pat<(OpNode (load addr:$src2),
@@ -288,7 +288,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[sched]>;
+ XOP, VVVV, Sched<[sched]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
@@ -296,7 +296,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 (load addr:$src3)))))]>,
- XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -304,7 +304,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)),
(vt128 VR128:$src3))))]>,
- XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
// 128mem:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
@@ -316,7 +316,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, REX_W, Sched<[sched]>;
+ []>, XOP, VVVV, REX_W, Sched<[sched]>;
}
let ExeDomain = SSEPackedInt in {
@@ -333,7 +333,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
- (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
+ (X86andnp RC:$src3, RC:$src2))))]>, XOP, VVVV,
Sched<[sched]>;
// FIXME: We can't write a pattern for this in tablegen.
let hasSideEffects = 0, mayLoad = 1 in
@@ -342,14 +342,14 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>,
- XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
+ XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, (load addr:$src2)))))]>,
- XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
+ XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold,
// x86memop:$src2
ReadDefault, ReadDefault, ReadDefault, ReadDefault,
ReadDefault,
@@ -361,7 +361,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, REX_W, Sched<[sched]>;
+ []>, XOP, VVVV, REX_W, Sched<[sched]>;
}
let ExeDomain = SSEPackedInt in {
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index bfef61abd8c1..2181eaaee7db 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -239,8 +239,9 @@ define i32 @LdOffset_i8_zext32(ptr %a) {
define i32 @LdOffset_i8_sext32(ptr %a) {
; CHECK-LABEL: LdOffset_i8_sext32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
-; CHECK-NEXT: ldrsb w0, [x8, #3704]
+; CHECK-NEXT: mov w8, #56952 // =0xde78
+; CHECK-NEXT: movk w8, #15, lsl #16
+; CHECK-NEXT: ldrsb w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
@@ -265,8 +266,9 @@ define i64 @LdOffset_i8_zext64(ptr %a) {
define i64 @LdOffset_i8_sext64(ptr %a) {
; CHECK-LABEL: LdOffset_i8_sext64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
-; CHECK-NEXT: ldrsb x0, [x8, #3704]
+; CHECK-NEXT: mov w8, #56952 // =0xde78
+; CHECK-NEXT: movk w8, #15, lsl #16
+; CHECK-NEXT: ldrsb x0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
@@ -278,8 +280,9 @@ define i64 @LdOffset_i8_sext64(ptr %a) {
define i16 @LdOffset_i16(ptr %a) {
; CHECK-LABEL: LdOffset_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
-; CHECK-NEXT: ldrh w0, [x8, #7408]
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrh w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
%val = load i16, ptr %arrayidx, align 2
@@ -290,8 +293,9 @@ define i16 @LdOffset_i16(ptr %a) {
define i32 @LdOffset_i16_zext32(ptr %a) {
; CHECK-LABEL: LdOffset_i16_zext32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
-; CHECK-NEXT: ldrh w0, [x8, #7408]
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrh w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
%val = load i16, ptr %arrayidx, align 2
@@ -303,8 +307,9 @@ define i32 @LdOffset_i16_zext32(ptr %a) {
define i32 @LdOffset_i16_sext32(ptr %a) {
; CHECK-LABEL: LdOffset_i16_sext32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
-; CHECK-NEXT: ldrsh w0, [x8, #7408]
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrsh w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
%val = load i16, ptr %arrayidx, align 2
@@ -316,8 +321,9 @@ define i32 @LdOffset_i16_sext32(ptr %a) {
define i64 @LdOffset_i16_zext64(ptr %a) {
; CHECK-LABEL: LdOffset_i16_zext64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
-; CHECK-NEXT: ldrh w0, [x8, #7408]
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrh w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
%val = load i16, ptr %arrayidx, align 2
@@ -329,8 +335,9 @@ define i64 @LdOffset_i16_zext64(ptr %a) {
define i64 @LdOffset_i16_sext64(ptr %a) {
; CHECK-LABEL: LdOffset_i16_sext64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
-; CHECK-NEXT: ldrsh x0, [x8, #7408]
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrsh x0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
%val = load i16, ptr %arrayidx, align 2
@@ -342,8 +349,9 @@ define i64 @LdOffset_i16_sext64(ptr %a) {
define i32 @LdOffset_i32(ptr %a) {
; CHECK-LABEL: LdOffset_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152
-; CHECK-NEXT: ldr w0, [x8, #14816]
+; CHECK-NEXT: mov w8, #31200 // =0x79e0
+; CHECK-NEXT: movk w8, #63, lsl #16
+; CHECK-NEXT: ldr w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
%val = load i32, ptr %arrayidx, align 4
@@ -354,8 +362,9 @@ define i32 @LdOffset_i32(ptr %a) {
define i64 @LdOffset_i32_zext64(ptr %a) {
; CHECK-LABEL: LdOffset_i32_zext64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152
-; CHECK-NEXT: ldr w0, [x8, #14816]
+; CHECK-NEXT: mov w8, #31200 // =0x79e0
+; CHECK-NEXT: movk w8, #63, lsl #16
+; CHECK-NEXT: ldr w0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
%val = load i32, ptr %arrayidx, align 2
@@ -367,8 +376,9 @@ define i64 @LdOffset_i32_zext64(ptr %a) {
define i64 @LdOffset_i32_sext64(ptr %a) {
; CHECK-LABEL: LdOffset_i32_sext64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152
-; CHECK-NEXT: ldrsw x0, [x8, #14816]
+; CHECK-NEXT: mov w8, #31200 // =0x79e0
+; CHECK-NEXT: movk w8, #63, lsl #16
+; CHECK-NEXT: ldrsw x0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
%val = load i32, ptr %arrayidx, align 2
@@ -380,8 +390,9 @@ define i64 @LdOffset_i32_sext64(ptr %a) {
define i64 @LdOffset_i64(ptr %a) {
; CHECK-LABEL: LdOffset_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304
-; CHECK-NEXT: ldr x0, [x8, #29632]
+; CHECK-NEXT: mov w8, #62400 // =0xf3c0
+; CHECK-NEXT: movk w8, #126, lsl #16
+; CHECK-NEXT: ldr x0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
%val = load i64, ptr %arrayidx, align 4
@@ -392,8 +403,9 @@ define i64 @LdOffset_i64(ptr %a) {
define <2 x i32> @LdOffset_v2i32(ptr %a) {
; CHECK-LABEL: LdOffset_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304
-; CHECK-NEXT: ldr d0, [x8, #29632]
+; CHECK-NEXT: mov w8, #62400 // =0xf3c0
+; CHECK-NEXT: movk w8, #126, lsl #16
+; CHECK-NEXT: ldr d0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds <2 x i32>, ptr %a, i64 1039992
%val = load <2 x i32>, ptr %arrayidx, align 4
@@ -404,8 +416,9 @@ define <2 x i32> @LdOffset_v2i32(ptr %a) {
define <2 x i64> @LdOffset_v2i64(ptr %a) {
; CHECK-LABEL: LdOffset_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #4048, lsl #12 // =16580608
-; CHECK-NEXT: ldr q0, [x8, #59264]
+; CHECK-NEXT: mov w8, #59264 // =0xe780
+; CHECK-NEXT: movk w8, #253, lsl #16
+; CHECK-NEXT: ldr q0, [x0, x8]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds <2 x i64>, ptr %a, i64 1039992
%val = load <2 x i64>, ptr %arrayidx, align 4
@@ -416,8 +429,9 @@ define <2 x i64> @LdOffset_v2i64(ptr %a) {
define double @LdOffset_i8_f64(ptr %a) {
; CHECK-LABEL: LdOffset_i8_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
-; CHECK-NEXT: ldrsb w8, [x8, #3704]
+; CHECK-NEXT: mov w8, #56952 // =0xde78
+; CHECK-NEXT: movk w8, #15, lsl #16
+; CHECK-NEXT: ldrsb w8, [x0, x8]
; CHECK-NEXT: scvtf d0, w8
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
@@ -430,8 +444,9 @@ define double @LdOffset_i8_f64(ptr %a) {
define double @LdOffset_i16_f64(ptr %a) {
; CHECK-LABEL: LdOffset_i16_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
-; CHECK-NEXT: ldrsh w8, [x8, #7408]
+; CHECK-NEXT: mov w8, #48368 // =0xbcf0
+; CHECK-NEXT: movk w8, #31, lsl #16
+; CHECK-NEXT: ldrsh w8, [x0, x8]
; CHECK-NEXT: scvtf d0, w8
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
@@ -444,8 +459,9 @@ define double @LdOffset_i16_f64(ptr %a) {
define double @LdOffset_i32_f64(ptr %a) {
; CHECK-LABEL: LdOffset_i32_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152
-; CHECK-NEXT: ldr s0, [x8, #14816]
+; CHECK-NEXT: mov w8, #31200 // =0x79e0
+; CHECK-NEXT: movk w8, #63, lsl #16
+; CHECK-NEXT: ldr s0, [x0, x8]
; CHECK-NEXT: ucvtf d0, d0
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
@@ -458,8 +474,9 @@ define double @LdOffset_i32_f64(ptr %a) {
define double @LdOffset_i64_f64(ptr %a) {
; CHECK-LABEL: LdOffset_i64_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304
-; CHECK-NEXT: ldr d0, [x8, #29632]
+; CHECK-NEXT: mov w8, #62400 // =0xf3c0
+; CHECK-NEXT: movk w8, #126, lsl #16
+; CHECK-NEXT: ldr d0, [x0, x8]
; CHECK-NEXT: scvtf d0, d0
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 79b9f8caea94..85a24a063aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -72,6 +72,1831 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
ret void
}
+define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <2 x bfloat>, ptr addrspace(1) %ptr
+ ret <2 x bfloat> %load
+}
+
+define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <3 x bfloat>, ptr addrspace(1) %ptr
+ ret <3 x bfloat> %load
+}
+
+define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <4 x bfloat>, ptr addrspace(1) %ptr
+ ret <4 x bfloat> %load
+}
+
+define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v6bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v6bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v6bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v6bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v6bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v6bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <6 x bfloat>, ptr addrspace(1) %ptr
+ ret <6 x bfloat> %load
+}
+
+define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <8 x bfloat>, ptr addrspace(1) %ptr
+ ret <8 x bfloat> %load
+}
+
+define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[0:3], v[4:5], off
+; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <16 x bfloat>, ptr addrspace(1) %ptr
+ ret <16 x bfloat> %load
+}
+
+define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12
+; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14
+; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20
+; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22
+; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v12, v0
+; GFX8-NEXT: v_mov_b32_e32 v13, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v12
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v13, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v12
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[12:13]
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v12
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v17, v1
+; GFX9-NEXT: v_mov_b32_e32 v16, v0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v17, v1
+; GFX10-NEXT: v_mov_b32_e32 v16, v0
+; GFX10-NEXT: s_clause 0x3
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[16:17], off
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_load_b128 v[0:3], v[12:13], off
+; GFX11-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16
+; GFX11-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32
+; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <32 x bfloat>, ptr addrspace(1) %ptr
+ ret <32 x bfloat> %load
+}
+
+define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_load_global_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0
+; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0
+; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0
+; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0
+; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_load_global_v64bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0
+; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0
+; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0
+; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0
+; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0
+; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0
+; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0
+; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0
+; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0
+; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen
+; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0
+; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0
+; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen
+; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_load_global_v64bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v28, v0
+; GFX8-NEXT: v_mov_b32_e32 v29, v1
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v28
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v29, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v28
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v29, vcc
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28
+; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28
+; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x50
+; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28
+; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x60
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x70
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29]
+; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
+; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
+; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
+; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
+; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25]
+; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_load_global_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v29, v1
+; GFX9-NEXT: v_mov_b32_e32 v28, v0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[28:29], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32
+; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48
+; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64
+; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80
+; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_load_global_v64bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v33, v1
+; GFX10-NEXT: v_mov_b32_e32 v32, v0
+; GFX10-NEXT: s_clause 0x7
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[32:33], off
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v[32:33], off offset:16
+; GFX10-NEXT: global_load_dwordx4 v[8:11], v[32:33], off offset:32
+; GFX10-NEXT: global_load_dwordx4 v[12:15], v[32:33], off offset:48
+; GFX10-NEXT: global_load_dwordx4 v[16:19], v[32:33], off offset:64
+; GFX10-NEXT: global_load_dwordx4 v[20:23], v[32:33], off offset:80
+; GFX10-NEXT: global_load_dwordx4 v[24:27], v[32:33], off offset:96
+; GFX10-NEXT: global_load_dwordx4 v[28:31], v[32:33], off offset:112
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_load_global_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_load_b128 v[0:3], v[28:29], off
+; GFX11-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16
+; GFX11-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32
+; GFX11-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48
+; GFX11-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64
+; GFX11-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80
+; GFX11-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96
+; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %load = load <64 x bfloat>, ptr addrspace(1) %ptr
+ ret <64 x bfloat> %load
+}
+
+define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dword v[1:2], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v[1:2], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v[1:2], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4
+; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4
+; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dword v[2:3], v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_store_short v[2:3], v1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX9-NEXT: global_store_dword v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4
+; GFX10-NEXT: global_store_dword v[2:3], v0, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
+; GFX11-NEXT: global_store_b32 v[2:3], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <3 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <4 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v10, v4, 16
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <8 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v18, v4, 16
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v19, v12, 16
+; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
+; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: v_alignbit_b32 v14, v0, v14, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v13, v0, v12, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_alignbit_b32 v12, v0, v10, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v11, v0, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16
+; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <16 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v0, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v1, v12, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GCN-NEXT: v_alignbit_b32 v11, v0, v10, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GCN-NEXT: v_alignbit_b32 v10, v0, v8, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GCN-NEXT: v_alignbit_b32 v9, v0, v22, 16
+; GCN-NEXT: v_alignbit_b32 v8, v1, v20, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GCN-NEXT: v_alignbit_b32 v7, v0, v18, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GCN-NEXT: v_alignbit_b32 v6, v0, v16, 16
+; GCN-NEXT: v_alignbit_b32 v16, v1, v28, 16
+; GCN-NEXT: v_alignbit_b32 v15, v14, v26, 16
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GCN-NEXT: v_alignbit_b32 v14, v0, v24, 16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: v_alignbit_b32 v17, v17, v30, 16
+; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v13, v0, v14, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX7-NEXT: v_alignbit_b32 v10, v0, v8, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v9, v0, v22, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v19
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GFX7-NEXT: v_alignbit_b32 v7, v0, v18, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32
+; GFX7-NEXT: v_alignbit_b32 v8, v1, v20, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GFX7-NEXT: v_alignbit_b32 v6, v0, v16, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX7-NEXT: v_alignbit_b32 v16, v1, v28, 16
+; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16
+; GFX7-NEXT: v_alignbit_b32 v14, v0, v24, 16
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_alignbit_b32 v17, v17, v30, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48
+; GFX11-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32
+; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16
+; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <32 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) {
+; GCN-LABEL: v_store_global_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_alignbit_b32 v22, v23, v22, 16
+; GCN-NEXT: v_alignbit_b32 v21, v21, v20, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_alignbit_b32 v20, v19, v18, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_alignbit_b32 v19, v19, v16, 16
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:32
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:16
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:112
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:96
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v4, v12, v4, 16
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v8, v13, v28, 16
+; GCN-NEXT: v_alignbit_b32 v7, v14, v26, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_alignbit_b32 v6, v15, v24, 16
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64
+; GCN-NEXT: s_waitcnt vmcnt(9)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_alignbit_b32 v9, v9, v30, 16
+; GCN-NEXT: v_alignbit_b32 v13, v10, v11, 16
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_alignbit_b32 v12, v12, v16, 16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_alignbit_b32 v11, v11, v0, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v10
+; GCN-NEXT: v_alignbit_b32 v10, v1, v14, 16
+; GCN-NEXT: v_alignbit_b32 v22, v15, v16, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_alignbit_b32 v21, v20, v0, 16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v19
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_alignbit_b32 v20, v15, v0, 16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v19, v0, v14, 16
+; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:80
+; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:64
+; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[17:18], s[4:7], 0 addr64 offset:48
+; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[17:18], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_store_global_v64bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16
+; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16
+; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GFX7-NEXT: v_alignbit_b32 v13, v0, v14, 16
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11
+; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16
+; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
+; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32
+; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_alignbit_b32 v10, v9, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v8, v21, v20, 16
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: v_alignbit_b32 v9, v23, v22, 16
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_alignbit_b32 v23, v6, v7, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v22, v15, v31, 16
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v32
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_alignbit_b32 v21, v20, v33, 16
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v34
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v19
+; GFX7-NEXT: v_alignbit_b32 v7, v6, v18, 16
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_alignbit_b32 v20, v32, v14, 16
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v17
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: buffer_store_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:112
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92
+; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84
+; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80
+; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v29
+; GFX7-NEXT: v_alignbit_b32 v6, v6, v16, 16
+; GFX7-NEXT: v_alignbit_b32 v16, v15, v28, 16
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v35
+; GFX7-NEXT: v_alignbit_b32 v17, v14, v30, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27
+; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v25
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_alignbit_b32 v14, v14, v24, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v31
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_alignbit_b32 v21, v19, v32, 16
+; GFX7-NEXT: s_waitcnt vmcnt(13)
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(12)
+; GFX7-NEXT: v_alignbit_b32 v20, v19, v34, 16
+; GFX7-NEXT: s_waitcnt vmcnt(11)
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v36
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_alignbit_b32 v19, v19, v37, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v25, v22, v23, 16
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v38
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_alignbit_b32 v24, v24, v18, 16
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v39
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_alignbit_b32 v23, v18, v48, 16
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v28
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v22, v22, v29, 16
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:96
+; GFX7-NEXT: s_waitcnt vmcnt(10)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v26
+; GFX7-NEXT: s_waitcnt vmcnt(9)
+; GFX7-NEXT: v_alignbit_b32 v18, v22, v18, 16
+; GFX7-NEXT: s_waitcnt vmcnt(8)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v27
+; GFX7-NEXT: s_waitcnt vmcnt(7)
+; GFX7-NEXT: v_alignbit_b32 v25, v22, v30, 16
+; GFX7-NEXT: s_waitcnt vmcnt(6)
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v31
+; GFX7-NEXT: s_waitcnt vmcnt(5)
+; GFX7-NEXT: v_alignbit_b32 v24, v23, v32, 16
+; GFX7-NEXT: s_waitcnt vmcnt(4)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v33
+; GFX7-NEXT: s_waitcnt vmcnt(3)
+; GFX7-NEXT: v_alignbit_b32 v23, v22, v28, 16
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v29
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_alignbit_b32 v22, v22, v34, 16
+; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:80
+; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:64
+; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32
+; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16
+; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_store_global_v64bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX8-NEXT: s_movk_i32 s4, 0x70
+; GFX8-NEXT: s_movk_i32 s5, 0x50
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc
+; GFX8-NEXT: s_movk_i32 s4, 0x60
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31]
+; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3]
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32
+; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v32
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32
+; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc
+; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32
+; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
+; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
+; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_store_global_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_store_global_v64bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_store_global_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112
+; GFX11-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96
+; GFX11-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80
+; GFX11-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64
+; GFX11-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48
+; GFX11-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32
+; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
+; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <64 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
+}
+
+define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
+; GCN-LABEL: test_store_fpimm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: test_store_fpimm:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_store_fpimm:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX8-NEXT: flat_store_short v[0:1], v4
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX8-NEXT: flat_store_short v[2:3], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_store_fpimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX9-NEXT: global_store_short v[0:1], v4, off
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX9-NEXT: global_store_short v[2:3], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_store_fpimm:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX10-NEXT: global_store_short v[0:1], v4, off
+; GFX10-NEXT: global_store_short v[2:3], v5, off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_store_fpimm:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228
+; GFX11-NEXT: global_store_b16 v[0:1], v4, off
+; GFX11-NEXT: global_store_b16 v[2:3], v5, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store bfloat 1.0, ptr addrspace(1) %ptr0
+ store bfloat 42.0, ptr addrspace(1) %ptr1
+ ret void
+}
+
define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store_f32_to_bf16:
; GCN: ; %bb.0:
@@ -8750,6 +10575,112 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
ret <32 x bfloat> %op
}
+define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16_fpimm_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %add = fadd bfloat %arg0, 1.0
+ ret bfloat %add
+}
+
+define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
+; GCN-LABEL: v_fadd_bf16_fpimm_1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fadd_bf16_fpimm_1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fadd_bf16_fpimm_1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fadd_bf16_fpimm_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fadd_bf16_fpimm_1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fadd_bf16_fpimm_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %add = fadd bfloat %arg0, 42.0
+ ret bfloat %add
+}
+
define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fsub_bf16:
; GCN: ; %bb.0:
@@ -9504,6 +11435,1509 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
ret <4 x bfloat> %op
}
+define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v15
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v14
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v13
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v12
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v11
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v10
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v9
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT: v_mul_f32_e32 v9, v10, v9
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT: v_mul_f32_e32 v10, v11, v10
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT: v_mul_f32_e32 v11, v12, v11
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v0
+; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_mul_f32_e32 v9, v11, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mul_f32_e32 v10, v13, v12
+; GFX10-NEXT: v_mul_f32_e32 v11, v15, v14
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_mul_f32_e32 v9, v10, v9
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX11-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX11-NEXT: v_dual_mul_f32 v10, v12, v11 :: v_dual_mul_f32 v11, v14, v13
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <8 x bfloat> %a, %b
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v14, v14, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v13, v13, v29
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v12, v12, v28
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v27
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v26
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v25
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v24
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v23
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v19
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v18
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v17
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v15, v15, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v15, v15, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT: v_mul_f32_e32 v17, v18, v17
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT: v_mul_f32_e32 v18, v19, v18
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_mul_f32_e32 v19, v20, v19
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT: v_mul_f32_e32 v20, v21, v20
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT: v_mul_f32_e32 v21, v22, v21
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT: v_mul_f32_e32 v22, v23, v22
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v23, v24, v23
+; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v2
+; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_mul_f32_e32 v18, v20, v19
+; GFX10-NEXT: v_mul_f32_e32 v19, v22, v21
+; GFX10-NEXT: v_mul_f32_e32 v20, v24, v23
+; GFX10-NEXT: v_mul_f32_e32 v21, v26, v25
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_mul_f32_e32 v22, v23, v22
+; GFX10-NEXT: v_mul_f32_e32 v23, v25, v24
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
+; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_mul_f32 v5, v5, v13
+; GFX11-NEXT: v_dual_mul_f32 v17, v18, v17 :: v_dual_mul_f32 v18, v20, v19
+; GFX11-NEXT: v_mul_f32_e32 v19, v22, v21
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15
+; GFX11-NEXT: v_mul_f32_e32 v21, v26, v25
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0
+; GFX11-NEXT: v_mul_f32_e32 v20, v24, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_mul_f32 v3, v3, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v9 :: v_dual_mul_f32 v22, v23, v22
+; GFX11-NEXT: v_mul_f32_e32 v23, v25, v24
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <16 x bfloat> %a, %b
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_fmul_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v31, v32, v31
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v30, v30, v32
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v29, v29, v33
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v28, v28, v32
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v27, v27, v33
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_mul_f32_e32 v26, v26, v32
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v25, v25, v33
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_mul_f32_e32 v24, v24, v32
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v23, v23, v33
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_mul_f32_e32 v22, v22, v32
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v21, v21, v33
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_mul_f32_e32 v20, v20, v32
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v19, v19, v33
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_mul_f32_e32 v18, v18, v32
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v17, v17, v33
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_mul_f32_e32 v16, v16, v32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v15, v15, v33
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_mul_f32_e32 v14, v14, v32
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v13, v13, v33
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_mul_f32_e32 v12, v12, v32
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v11, v11, v33
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_mul_f32_e32 v10, v10, v32
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v9, v9, v33
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v8, v8, v32
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v7, v7, v33
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_mul_f32_e32 v6, v6, v32
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v5, v5, v33
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_mul_f32_e32 v4, v4, v32
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v3, v3, v33
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_mul_f32_e32 v2, v2, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v1, v1, v33
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v0, v0, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fmul_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v31, v32, v31
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fmul_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31
+; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v30, v32, v30
+; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v29, v32, v29
+; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v28, v32, v28
+; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v27, v32, v27
+; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v26, v32, v26
+; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32
+; GFX8-NEXT: v_mul_f32_e32 v25, v32, v25
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT: v_perm_b32 v10, v10, v27, s4
+; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_mul_f32_e32 v32, v32, v33
+; GFX8-NEXT: v_mul_f32_e32 v15, v15, v24
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24
+; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22
+; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21
+; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT: v_perm_b32 v15, v15, v32, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmul_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v58, 0xffff0000, v17
+; GFX9-NEXT: v_and_b32_e32 v59, 0xffff0000, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v21
+; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v44, 0xffff0000, v20
+; GFX9-NEXT: v_and_b32_e32 v45, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v46, 0xffff0000, v19
+; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v18
+; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v2
+; GFX9-NEXT: v_mul_f32_e32 v38, v39, v38
+; GFX9-NEXT: v_mul_f32_e32 v39, v49, v48
+; GFX9-NEXT: v_mul_f32_e32 v48, v51, v50
+; GFX9-NEXT: v_mul_f32_e32 v51, v41, v40
+; GFX9-NEXT: v_mul_f32_e32 v40, v59, v58
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_mul_f32_e32 v49, v53, v52
+; GFX9-NEXT: v_mul_f32_e32 v50, v55, v54
+; GFX9-NEXT: v_mul_f32_e32 v52, v43, v42
+; GFX9-NEXT: v_mul_f32_e32 v53, v45, v44
+; GFX9-NEXT: v_mul_f32_e32 v54, v47, v46
+; GFX9-NEXT: v_mul_f32_e32 v55, v57, v56
+; GFX9-NEXT: v_perm_b32 v1, v1, v40, s4
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v29
+; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16
+; GFX9-NEXT: v_mul_f32_e32 v34, v35, v34
+; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v0
+; GFX9-NEXT: v_mul_f32_e32 v36, v37, v36
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_mul_f32_e32 v33, v35, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX9-NEXT: v_perm_b32 v0, v0, v33, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v55, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v54, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v53, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v52, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v51, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v50, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v49, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v48, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v39, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v38, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v36, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v34, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v32, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v31
+; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX9-NEXT: v_mul_f32_e32 v35, v37, v35
+; GFX9-NEXT: v_mul_f32_e32 v15, v15, v31
+; GFX9-NEXT: v_perm_b32 v15, v15, v35, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fmul_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT: v_mul_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT: v_mul_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT: v_mul_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_mul_f32_e32 v67, v68, v67
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_mul_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_mul_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_mul_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_mul_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_mul_f32_e32 v50, v68, v66
+; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT: v_mul_f32_e32 v16, v32, v16
+; GFX10-NEXT: v_mul_f32_e32 v15, v15, v17
+; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fmul_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_and_b32_e32 v82, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v80, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_mul_f32_e32 v9, v9, v25
+; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v15
+; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_dual_mul_f32 v33, v34, v33 :: v_dual_mul_f32 v34, v36, v35
+; GFX11-NEXT: v_dual_mul_f32 v35, v38, v37 :: v_dual_mul_f32 v12, v12, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v5, v5, v21
+; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v13, v13, v29
+; GFX11-NEXT: v_dual_mul_f32 v15, v15, v17 :: v_dual_mul_f32 v14, v14, v30
+; GFX11-NEXT: v_mul_f32_e32 v36, v48, v39
+; GFX11-NEXT: v_dual_mul_f32 v48, v64, v55 :: v_dual_mul_f32 v37, v50, v49
+; GFX11-NEXT: v_mul_f32_e32 v50, v68, v67
+; GFX11-NEXT: v_dual_mul_f32 v38, v52, v51 :: v_dual_mul_f32 v51, v70, v69
+; GFX11-NEXT: v_dual_mul_f32 v52, v80, v71 :: v_dual_mul_f32 v39, v54, v53
+; GFX11-NEXT: v_dual_mul_f32 v53, v82, v81 :: v_dual_mul_f32 v54, v84, v83
+; GFX11-NEXT: v_mul_f32_e32 v55, v86, v85
+; GFX11-NEXT: v_mul_f32_e32 v49, v66, v65
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = fmul <32 x bfloat> %a, %b
+ ret <32 x bfloat> %op
+}
+
define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fdiv_bf16:
; GCN: ; %bb.0:
@@ -9716,7 +13150,8 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
- ret i32 %zext
+ %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readlane
}
define bfloat @v_fneg_bf16(bfloat %a) {
@@ -9943,6 +13378,11 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
declare bfloat @llvm.minnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_minnum_bf16:
@@ -10120,8 +13560,2440 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
ret <2 x bfloat> %op
}
+define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v2, v2, v5
+; GCN-NEXT: v_min_f32_e32 v1, v1, v4
+; GCN-NEXT: v_min_f32_e32 v0, v0, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_min_f32 v0, v0, v2
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v3, v3, v7
+; GCN-NEXT: v_min_f32_e32 v2, v2, v6
+; GCN-NEXT: v_min_f32_e32 v1, v1, v5
+; GCN-NEXT: v_min_f32_e32 v0, v0, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_min_f32_e32 v3, v5, v4
+; GFX10-NEXT: v_min_f32_e32 v4, v7, v6
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_min_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_min_f32 v1, v1, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v4, v6, v5
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v7, v7, v15
+; GCN-NEXT: v_min_f32_e32 v6, v6, v14
+; GCN-NEXT: v_min_f32_e32 v5, v5, v13
+; GCN-NEXT: v_min_f32_e32 v4, v4, v12
+; GCN-NEXT: v_min_f32_e32 v3, v3, v11
+; GCN-NEXT: v_min_f32_e32 v2, v2, v10
+; GCN-NEXT: v_min_f32_e32 v1, v1, v9
+; GCN-NEXT: v_min_f32_e32 v0, v0, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_min_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_min_f32_e32 v9, v10, v9
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_min_f32_e32 v10, v11, v10
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT: v_min_f32_e32 v11, v12, v11
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v11, v11
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_min_f32_e32 v9, v9, v10
+; GFX10-NEXT: v_max_f32_e32 v10, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v11, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v12, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v13, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f32_e32 v10, v11, v10
+; GFX10-NEXT: v_min_f32_e32 v11, v13, v12
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_min_f32 v8, v9, v8
+; GFX11-NEXT: v_min_f32_e32 v9, v11, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_max_f32_e32 v10, v12, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v10, v11, v10
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v11, v13, v12
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v3, v3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
+; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_min_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_min_f32_e32 v14, v14, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_min_f32_e32 v13, v13, v29
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_min_f32_e32 v12, v12, v28
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_min_f32_e32 v11, v11, v27
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_min_f32_e32 v10, v10, v26
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_min_f32_e32 v9, v9, v25
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_min_f32_e32 v8, v8, v24
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_min_f32_e32 v7, v7, v23
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_min_f32_e32 v6, v6, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_min_f32_e32 v5, v5, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_min_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_min_f32_e32 v3, v3, v19
+; GCN-NEXT: v_min_f32_e32 v2, v2, v18
+; GCN-NEXT: v_min_f32_e32 v1, v1, v17
+; GCN-NEXT: v_min_f32_e32 v0, v0, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_min_f32_e32 v15, v15, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_min_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_min_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_min_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_min_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v9, v17, v9
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_min_f32_e32 v17, v18, v17
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_min_f32_e32 v18, v19, v18
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_min_f32_e32 v19, v20, v19
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_min_f32_e32 v20, v21, v20
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_min_f32_e32 v21, v22, v21
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_min_f32_e32 v22, v23, v22
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v23, v24, v23
+; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v19, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v21, v22, v22
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_min_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_min_f32_e32 v19, v21, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v21, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v22, v24, v24
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_min_f32_e32 v20, v21, v20
+; GFX10-NEXT: v_min_f32_e32 v21, v23, v22
+; GFX10-NEXT: v_min_f32_e32 v22, v25, v24
+; GFX10-NEXT: v_min_f32_e32 v23, v27, v26
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
+; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
+; GFX11-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
+; GFX11-NEXT: v_dual_min_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
+; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_min_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
+; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
+; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_dual_min_f32 v21, v23, v22 :: v_dual_min_f32 v22, v25, v24
+; GFX11-NEXT: v_dual_min_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_dual_min_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_min_f32 v5, v5, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v3, v3, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v10
+; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_min_f32 v1, v1, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_minnum_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_min_f32_e32 v31, v32, v31
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_min_f32_e32 v30, v30, v32
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_min_f32_e32 v29, v29, v32
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_min_f32_e32 v28, v28, v32
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_min_f32_e32 v27, v27, v32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_min_f32_e32 v26, v26, v32
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_min_f32_e32 v25, v25, v32
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_min_f32_e32 v24, v24, v32
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_min_f32_e32 v23, v23, v32
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_min_f32_e32 v22, v22, v32
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_min_f32_e32 v21, v21, v32
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_min_f32_e32 v20, v20, v32
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_min_f32_e32 v19, v19, v32
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_min_f32_e32 v18, v18, v32
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_min_f32_e32 v17, v17, v32
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_min_f32_e32 v16, v16, v32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_min_f32_e32 v15, v15, v32
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_min_f32_e32 v14, v14, v32
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_min_f32_e32 v13, v13, v32
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_min_f32_e32 v12, v12, v32
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_min_f32_e32 v11, v11, v32
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_min_f32_e32 v10, v10, v32
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_min_f32_e32 v9, v9, v32
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_min_f32_e32 v8, v8, v32
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_min_f32_e32 v7, v7, v32
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_min_f32_e32 v6, v6, v32
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_min_f32_e32 v5, v5, v32
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_min_f32_e32 v4, v4, v32
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_min_f32_e32 v3, v3, v32
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_min_f32_e32 v2, v2, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_min_f32_e32 v1, v1, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_min_f32_e32 v0, v0, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_minnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v31, v32, v31
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_minnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_min_f32_e32 v31, v32, v31
+; GFX8-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_min_f32_e32 v30, v32, v30
+; GFX8-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_min_f32_e32 v29, v32, v29
+; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_min_f32_e32 v28, v32, v28
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_min_f32_e32 v27, v27, v33
+; GFX8-NEXT: v_min_f32_e32 v15, v15, v32
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_min_f32_e32 v32, v33, v32
+; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_min_f32_e32 v26, v33, v26
+; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_min_f32_e32 v25, v33, v25
+; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_min_f32_e32 v24, v33, v24
+; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_min_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_min_f32_e32 v22, v33, v22
+; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_min_f32_e32 v21, v33, v21
+; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_min_f32_e32 v20, v33, v20
+; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_min_f32_e32 v19, v33, v19
+; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_min_f32_e32 v18, v33, v18
+; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_min_f32_e32 v17, v33, v17
+; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4
+; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_minnum_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT: v_max_f32_e32 v31, v31, v31
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX9-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX9-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX9-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX9-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
+; GFX9-NEXT: v_min_f32_e32 v32, v34, v33
+; GFX9-NEXT: v_min_f32_e32 v33, v37, v36
+; GFX9-NEXT: v_min_f32_e32 v37, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX9-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v50, v43, v43
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_min_f32_e32 v34, v39, v38
+; GFX9-NEXT: v_min_f32_e32 v38, v53, v52
+; GFX9-NEXT: v_min_f32_e32 v50, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT: v_min_f32_e32 v51, v52, v51
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_min_f32_e32 v39, v55, v54
+; GFX9-NEXT: v_min_f32_e32 v52, v53, v52
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT: v_min_f32_e32 v53, v54, v53
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15
+; GFX9-NEXT: v_min_f32_e32 v36, v49, v48
+; GFX9-NEXT: v_min_f32_e32 v48, v41, v40
+; GFX9-NEXT: v_min_f32_e32 v54, v55, v54
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v42, v42, v42
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_min_f32_e32 v55, v40, v55
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_min_f32_e32 v49, v42, v49
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_min_f32_e32 v15, v15, v35
+; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v34, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_minnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v67, v67, v67
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT: v_min_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT: v_min_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT: v_min_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_min_f32_e32 v67, v68, v67
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
+; GFX10-NEXT: v_min_f32_e32 v32, v34, v32
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT: v_min_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT: v_min_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT: v_min_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT: v_min_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT: v_min_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_min_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_min_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_min_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_min_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_min_f32_e32 v50, v68, v66
+; GFX10-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_min_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_min_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_min_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_min_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_min_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_min_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_min_f32_e32 v16, v33, v16
+; GFX10-NEXT: v_min_f32_e32 v15, v15, v17
+; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_minnum_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
+; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
+; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
+; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
+; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_dual_min_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
+; GFX11-NEXT: v_dual_min_f32 v34, v36, v35 :: v_dual_min_f32 v35, v38, v37
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
+; GFX11-NEXT: v_dual_min_f32 v36, v48, v39 :: v_dual_min_f32 v37, v50, v49
+; GFX11-NEXT: v_min_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
+; GFX11-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
+; GFX11-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX11-NEXT: v_max_f32_e32 v80, v80, v80
+; GFX11-NEXT: v_max_f32_e32 v82, v82, v82
+; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
+; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
+; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: v_dual_min_f32 v38, v52, v51 :: v_dual_min_f32 v53, v82, v81
+; GFX11-NEXT: v_dual_min_f32 v48, v64, v55 :: v_dual_min_f32 v55, v86, v85
+; GFX11-NEXT: v_dual_min_f32 v49, v66, v65 :: v_dual_min_f32 v50, v68, v67
+; GFX11-NEXT: v_min_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_dual_min_f32 v51, v70, v69 :: v_dual_min_f32 v52, v80, v71
+; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_min_f32 v54, v84, v83
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v14, v14, v30
+; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_min_f32 v12, v12, v28
+; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_min_f32 v8, v8, v24
+; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_min_f32 v4, v4, v20
+; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_min_f32 v2, v2, v18
+; GFX11-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v15, v15, v17
+; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_min_f32_e32 v16, v32, v16
+; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+ ret <32 x bfloat> %op
+}
+
+
declare bfloat @llvm.maxnum.bf16(bfloat, bfloat)
declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>)
+declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>)
+declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>)
+declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>)
+declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>)
+declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>)
define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_maxnum_bf16:
@@ -10299,6 +16171,2432 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
ret <2 x bfloat> %op
}
+define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v2, v2, v5
+; GCN-NEXT: v_max_f32_e32 v1, v1, v4
+; GCN-NEXT: v_max_f32_e32 v0, v0, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v3bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v3bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v3bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_max_f32 v0, v0, v2
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b)
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v3, v3, v7
+; GCN-NEXT: v_max_f32_e32 v2, v2, v6
+; GCN-NEXT: v_max_f32_e32 v1, v1, v5
+; GCN-NEXT: v_max_f32_e32 v0, v0, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v4bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v4bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v4bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_max_f32_e32 v3, v5, v4
+; GFX10-NEXT: v_max_f32_e32 v4, v7, v6
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_max_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v4, v6, v5
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
+ ret <4 x bfloat> %op
+}
+
+define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v7, v7, v15
+; GCN-NEXT: v_max_f32_e32 v6, v6, v14
+; GCN-NEXT: v_max_f32_e32 v5, v5, v13
+; GCN-NEXT: v_max_f32_e32 v4, v4, v12
+; GCN-NEXT: v_max_f32_e32 v3, v3, v11
+; GCN-NEXT: v_max_f32_e32 v2, v2, v10
+; GCN-NEXT: v_max_f32_e32 v1, v1, v9
+; GCN-NEXT: v_max_f32_e32 v0, v0, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_max_f32_e32 v7, v9, v7
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v6, v9, v6
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v5, v9, v5
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v8bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v9, v10, v9
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_max_f32_e32 v10, v11, v10
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX9-NEXT: v_max_f32_e32 v11, v12, v11
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
+; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
+; GFX10-NEXT: v_max_f32_e32 v9, v11, v11
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v10
+; GFX10-NEXT: v_max_f32_e32 v10, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v11, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v12, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v13, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v10, v11, v10
+; GFX10-NEXT: v_max_f32_e32 v11, v13, v12
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
+; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v8bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v9, v8
+; GFX11-NEXT: v_max_f32_e32 v9, v11, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_max_f32_e32 v10, v12, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v10, v11, v10
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3
+; GFX11-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v11, v13, v12
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v3, v3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9
+; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
+ ret <8 x bfloat> %op
+}
+
+define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_max_f32_e32 v14, v14, v30
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_max_f32_e32 v13, v13, v29
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_max_f32_e32 v12, v12, v28
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_max_f32_e32 v11, v11, v27
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_max_f32_e32 v10, v10, v26
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_max_f32_e32 v9, v9, v25
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_max_f32_e32 v8, v8, v24
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_max_f32_e32 v7, v7, v23
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_max_f32_e32 v6, v6, v22
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_max_f32_e32 v5, v5, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_max_f32_e32 v4, v4, v20
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_max_f32_e32 v3, v3, v19
+; GCN-NEXT: v_max_f32_e32 v2, v2, v18
+; GCN-NEXT: v_max_f32_e32 v1, v1, v17
+; GCN-NEXT: v_max_f32_e32 v0, v0, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_max_f32_e32 v15, v15, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v16bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_max_f32_e32 v15, v17, v15
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v14, v17, v14
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_max_f32_e32 v13, v17, v13
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_max_f32_e32 v12, v17, v12
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_max_f32_e32 v11, v17, v11
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v10, v17, v10
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v9, v17, v9
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v17, v18, v17
+; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v18, v19, v18
+; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v19, v20, v19
+; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v20, v21, v20
+; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v21, v22, v21
+; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v22, v23, v22
+; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v23, v24, v23
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1
+; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0
+; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v19, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v21, v22, v22
+; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_max_f32_e32 v18, v19, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_max_f32_e32 v19, v21, v20
+; GFX10-NEXT: v_max_f32_e32 v20, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v21, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v22, v24, v24
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v20, v21, v20
+; GFX10-NEXT: v_max_f32_e32 v21, v23, v22
+; GFX10-NEXT: v_max_f32_e32 v22, v25, v24
+; GFX10-NEXT: v_max_f32_e32 v23, v27, v26
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
+; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14
+; GFX11-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8
+; GFX11-NEXT: v_dual_max_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21
+; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_dual_max_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23
+; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15
+; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_dual_max_f32 v21, v23, v22 :: v_dual_max_f32 v22, v25, v24
+; GFX11-NEXT: v_dual_max_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12
+; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v5, v5, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v3, v3, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v10
+; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_max_f32 v1, v1, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706
+; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b)
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
+; GCN-LABEL: v_maxnum_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_max_f32_e32 v31, v32, v31
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_max_f32_e32 v30, v30, v32
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_max_f32_e32 v29, v29, v32
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_max_f32_e32 v28, v28, v32
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_max_f32_e32 v27, v27, v32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_max_f32_e32 v26, v26, v32
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_max_f32_e32 v25, v25, v32
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_max_f32_e32 v24, v24, v32
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_max_f32_e32 v23, v23, v32
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_max_f32_e32 v22, v22, v32
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_max_f32_e32 v21, v21, v32
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_max_f32_e32 v20, v20, v32
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_max_f32_e32 v19, v19, v32
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_max_f32_e32 v18, v18, v32
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_max_f32_e32 v17, v17, v32
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_max_f32_e32 v16, v16, v32
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_max_f32_e32 v15, v15, v32
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_max_f32_e32 v14, v14, v32
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_max_f32_e32 v13, v13, v32
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_max_f32_e32 v12, v12, v32
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_max_f32_e32 v11, v11, v32
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_max_f32_e32 v10, v10, v32
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_max_f32_e32 v9, v9, v32
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_max_f32_e32 v8, v8, v32
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_max_f32_e32 v7, v7, v32
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_max_f32_e32 v6, v6, v32
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_max_f32_e32 v5, v5, v32
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_max_f32_e32 v4, v4, v32
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_max_f32_e32 v3, v3, v32
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_max_f32_e32 v2, v2, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_max_f32_e32 v1, v1, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_max_f32_e32 v0, v0, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_maxnum_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_maxnum_v32bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX8-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX8-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX8-NEXT: v_max_f32_e32 v30, v32, v30
+; GFX8-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX8-NEXT: v_max_f32_e32 v29, v32, v29
+; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11
+; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_max_f32_e32 v28, v32, v28
+; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX8-NEXT: s_mov_b32 s4, 0x3020706
+; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4
+; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4
+; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4
+; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_max_f32_e32 v27, v27, v33
+; GFX8-NEXT: v_max_f32_e32 v15, v15, v32
+; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX8-NEXT: v_max_f32_e32 v32, v33, v32
+; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX8-NEXT: v_max_f32_e32 v26, v33, v26
+; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX8-NEXT: v_max_f32_e32 v25, v33, v25
+; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX8-NEXT: v_max_f32_e32 v24, v33, v24
+; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX8-NEXT: v_max_f32_e32 v23, v33, v23
+; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX8-NEXT: v_max_f32_e32 v22, v33, v22
+; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX8-NEXT: v_max_f32_e32 v21, v33, v21
+; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX8-NEXT: v_max_f32_e32 v20, v33, v20
+; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: v_max_f32_e32 v19, v33, v19
+; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX8-NEXT: v_max_f32_e32 v18, v33, v18
+; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX8-NEXT: v_max_f32_e32 v17, v33, v17
+; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4
+; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4
+; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4
+; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4
+; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4
+; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4
+; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4
+; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4
+; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4
+; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4
+; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4
+; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_maxnum_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32
+; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14
+; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29
+; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13
+; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12
+; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9
+; GFX9-NEXT: v_max_f32_e32 v31, v31, v31
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX9-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX9-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX9-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX9-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27
+; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21
+; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
+; GFX9-NEXT: v_max_f32_e32 v32, v34, v33
+; GFX9-NEXT: v_max_f32_e32 v33, v37, v36
+; GFX9-NEXT: v_max_f32_e32 v37, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5
+; GFX9-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX9-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v50, v43, v43
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_max_f32_e32 v34, v39, v38
+; GFX9-NEXT: v_max_f32_e32 v38, v53, v52
+; GFX9-NEXT: v_max_f32_e32 v50, v51, v50
+; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4
+; GFX9-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7
+; GFX9-NEXT: v_max_f32_e32 v51, v52, v51
+; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v39, v55, v54
+; GFX9-NEXT: v_max_f32_e32 v52, v53, v52
+; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2
+; GFX9-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22
+; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6
+; GFX9-NEXT: v_max_f32_e32 v53, v54, v53
+; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1
+; GFX9-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_max_f32_e32 v41, v41, v41
+; GFX9-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15
+; GFX9-NEXT: v_max_f32_e32 v36, v49, v48
+; GFX9-NEXT: v_max_f32_e32 v48, v41, v40
+; GFX9-NEXT: v_max_f32_e32 v54, v55, v54
+; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16
+; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0
+; GFX9-NEXT: v_max_f32_e32 v42, v42, v42
+; GFX9-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX9-NEXT: v_max_f32_e32 v40, v40, v40
+; GFX9-NEXT: v_max_f32_e32 v55, v40, v55
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35
+; GFX9-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX9-NEXT: v_max_f32_e32 v49, v42, v49
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX9-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX9-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX9-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX9-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX9-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX9-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX9-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX9-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX9-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX9-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX9-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX9-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX9-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX9-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX9-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT: v_max_f32_e32 v15, v15, v35
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4
+; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4
+; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4
+; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4
+; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4
+; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4
+; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4
+; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4
+; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4
+; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4
+; GFX9-NEXT: v_perm_b32 v11, v11, v34, s4
+; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4
+; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4
+; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4
+; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_maxnum_v32bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX10-NEXT: v_max_f32_e32 v53, v53, v53
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v55, v55, v55
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v65, v65, v65
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v67, v67, v67
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX10-NEXT: v_max_f32_e32 v53, v54, v53
+; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17
+; GFX10-NEXT: v_max_f32_e32 v55, v64, v55
+; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1
+; GFX10-NEXT: v_max_f32_e32 v65, v66, v65
+; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16
+; GFX10-NEXT: v_max_f32_e32 v67, v68, v67
+; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_max_f32_e32 v32, v32, v32
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v35, v35, v35
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v37, v37, v37
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v39, v39, v39
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v49, v49, v49
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v51, v51, v51
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
+; GFX10-NEXT: v_max_f32_e32 v32, v34, v32
+; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20
+; GFX10-NEXT: v_max_f32_e32 v35, v36, v35
+; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4
+; GFX10-NEXT: v_max_f32_e32 v37, v38, v37
+; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19
+; GFX10-NEXT: v_max_f32_e32 v39, v48, v39
+; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3
+; GFX10-NEXT: v_max_f32_e32 v49, v50, v49
+; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18
+; GFX10-NEXT: v_max_f32_e32 v51, v52, v51
+; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
+; GFX10-NEXT: v_max_f32_e32 v33, v33, v33
+; GFX10-NEXT: v_max_f32_e32 v34, v34, v34
+; GFX10-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX10-NEXT: v_max_f32_e32 v38, v38, v38
+; GFX10-NEXT: v_max_f32_e32 v48, v48, v48
+; GFX10-NEXT: v_max_f32_e32 v50, v50, v50
+; GFX10-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX10-NEXT: v_max_f32_e32 v54, v54, v54
+; GFX10-NEXT: v_max_f32_e32 v64, v64, v64
+; GFX10-NEXT: v_max_f32_e32 v66, v66, v66
+; GFX10-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v15
+; GFX10-NEXT: v_max_f32_e32 v30, v30, v30
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v14
+; GFX10-NEXT: v_max_f32_e32 v29, v29, v29
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v13
+; GFX10-NEXT: v_max_f32_e32 v28, v28, v28
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v12
+; GFX10-NEXT: v_max_f32_e32 v27, v27, v27
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v11
+; GFX10-NEXT: v_max_f32_e32 v26, v26, v26
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v10
+; GFX10-NEXT: v_max_f32_e32 v25, v25, v25
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v9
+; GFX10-NEXT: v_max_f32_e32 v24, v24, v24
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v8
+; GFX10-NEXT: v_max_f32_e32 v23, v23, v23
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v22, v22, v22
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v6
+; GFX10-NEXT: v_max_f32_e32 v21, v21, v21
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v5
+; GFX10-NEXT: v_max_f32_e32 v20, v20, v20
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v4
+; GFX10-NEXT: v_max_f32_e32 v19, v19, v19
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v3
+; GFX10-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX10-NEXT: v_max_f32_e32 v34, v36, v34
+; GFX10-NEXT: v_max_f32_e32 v36, v48, v38
+; GFX10-NEXT: v_max_f32_e32 v38, v52, v50
+; GFX10-NEXT: v_max_f32_e32 v48, v64, v54
+; GFX10-NEXT: v_max_f32_e32 v50, v68, v66
+; GFX10-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX10-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX10-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX10-NEXT: v_max_f32_e32 v11, v11, v27
+; GFX10-NEXT: v_max_f32_e32 v10, v10, v26
+; GFX10-NEXT: v_max_f32_e32 v9, v9, v25
+; GFX10-NEXT: v_max_f32_e32 v8, v8, v24
+; GFX10-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX10-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX10-NEXT: v_max_f32_e32 v5, v5, v21
+; GFX10-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v19
+; GFX10-NEXT: v_max_f32_e32 v4, v4, v20
+; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706
+; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706
+; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706
+; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706
+; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706
+; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706
+; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706
+; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706
+; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706
+; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706
+; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706
+; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706
+; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706
+; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
+; GFX10-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX10-NEXT: v_max_f32_e32 v17, v17, v17
+; GFX10-NEXT: v_max_f32_e32 v16, v33, v16
+; GFX10-NEXT: v_max_f32_e32 v15, v15, v17
+; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_maxnum_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14
+; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12
+; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28
+; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27
+; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13
+; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15
+; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26
+; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11
+; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25
+; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10
+; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24
+; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23
+; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9
+; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22
+; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6
+; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19
+; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5
+; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17
+; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37
+; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX11-NEXT: v_max_f32_e32 v36, v36, v36
+; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20
+; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18
+; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48
+; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50
+; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53
+; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66
+; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10
+; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6
+; GFX11-NEXT: v_dual_max_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16
+; GFX11-NEXT: v_dual_max_f32 v34, v36, v35 :: v_dual_max_f32 v35, v38, v37
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
+; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69
+; GFX11-NEXT: v_dual_max_f32 v36, v48, v39 :: v_dual_max_f32 v37, v50, v49
+; GFX11-NEXT: v_max_f32_e32 v39, v54, v53
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55
+; GFX11-NEXT: v_max_f32_e32 v52, v52, v52
+; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71
+; GFX11-NEXT: v_max_f32_e32 v68, v68, v68
+; GFX11-NEXT: v_max_f32_e32 v80, v80, v80
+; GFX11-NEXT: v_max_f32_e32 v82, v82, v82
+; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85
+; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84
+; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14
+; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12
+; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8
+; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4
+; GFX11-NEXT: v_max_f32_e32 v18, v18, v18
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX11-NEXT: v_dual_max_f32 v38, v52, v51 :: v_dual_max_f32 v53, v82, v81
+; GFX11-NEXT: v_dual_max_f32 v48, v64, v55 :: v_dual_max_f32 v55, v86, v85
+; GFX11-NEXT: v_dual_max_f32 v49, v66, v65 :: v_dual_max_f32 v50, v68, v67
+; GFX11-NEXT: v_max_f32_e32 v13, v13, v29
+; GFX11-NEXT: v_dual_max_f32 v51, v70, v69 :: v_dual_max_f32 v52, v80, v71
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_max_f32 v54, v84, v83
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v14, v14, v30
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_max_f32 v12, v12, v28
+; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_max_f32 v8, v8, v24
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_max_f32 v4, v4, v20
+; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706
+; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706
+; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706
+; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706
+; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706
+; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706
+; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706
+; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706
+; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706
+; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706
+; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31
+; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_max_f32 v2, v2, v18
+; GFX11-NEXT: v_max_f32_e32 v16, v16, v16
+; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v15, v15, v17
+; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v16, v32, v16
+; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b)
+ ret <32 x bfloat> %op
+}
+
declare bfloat @llvm.sqrt.bf16(bfloat)
define bfloat @v_sqrt_bf16(bfloat %a) {
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index de9e320a363a..f24cc6f177d6 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -3087,8 +3087,8 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2
ret void
}
-define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3) #0 {
-; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3, <4 x bfloat> %arg4) #0 {
+; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -3103,39 +3103,55 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4
-; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v17
-; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v20
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v18
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24
+; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4
+; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v16
+; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v17
+; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v18
+; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v19
+; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v8, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v13, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v12, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_short v16, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v15, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v11, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v14, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v10, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v13, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v9, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
-; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -3150,26 +3166,38 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v20
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v13, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v16, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v12, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_short v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
@@ -3184,36 +3212,54 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v20
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v13, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v16, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v12, off, s[4:7], 0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_short v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16:
+; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_waitcnt vmcnt(5)
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v32
+; GFX11-NEXT: s_waitcnt vmcnt(4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v33
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -3232,19 +3278,28 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v38, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v37, off, s[0:3], 0 dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_store_b16 v32, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) undef
store volatile <2 x i16> %arg1, ptr addrspace(1) undef
store volatile <2 x half> %arg2, ptr addrspace(1) undef
store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef
+ store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 91cff9d1a541..844aa57de05c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1582,8 +1582,8 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1)
ret <6 x half> %shuffle
}
-define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
-; GFX9-LABEL: fma_shuffle:
+define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
+; GFX9-LABEL: fma_shuffle_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
@@ -1600,7 +1600,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p
; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
-; GFX10-LABEL: fma_shuffle:
+; GFX10-LABEL: fma_shuffle_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1619,7 +1619,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p
; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: fma_shuffle:
+; GFX11-LABEL: fma_shuffle_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
@@ -1758,12 +1758,8 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in,
ret void
}
-declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-
-attributes #0 = { nounwind readnone speculatable }
-define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: low16bits:
+define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16bits_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1773,7 +1769,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: low16bits:
+; GFX10-LABEL: low16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1782,7 +1778,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: low16bits:
+; GFX11-LABEL: low16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -1798,8 +1794,8 @@ entry:
ret <2 x half> %vy1.2.vec.insert
}
-define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: hi16bits:
+define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16bits_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1809,7 +1805,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: hi16bits:
+; GFX10-LABEL: hi16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1818,7 +1814,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: hi16bits:
+; GFX11-LABEL: hi16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -1834,8 +1830,8 @@ entry:
ret <2 x half> %vy1.2.vec.insert
}
-define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: low16hi16bits:
+define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16hi16bits_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1845,7 +1841,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: low16hi16bits:
+; GFX10-LABEL: low16hi16bits_v2f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1854,7 +1850,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: low16hi16bits:
+; GFX11-LABEL: low16hi16bits_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -1870,8 +1866,8 @@ entry:
ret <2 x half> %vy1.2.vec.insert
}
-define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
-; GFX9-LABEL: hi16low16bits:
+define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16low16bits_v2bf16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
@@ -1880,7 +1876,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: hi16low16bits:
+; GFX10-LABEL: hi16low16bits_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -1889,7 +1885,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: hi16low16bits:
+; GFX11-LABEL: hi16low16bits_v2bf16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
@@ -2675,3 +2671,2354 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg
store <16 x i32> %shuffle, ptr addrspace(1) %out
ret void
}
+
+define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_23uu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_23uu:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_23uu:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_234u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_234u:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_234u:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u1u3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u1u3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u1u3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u3u1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u3u1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u3u1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_u3uu:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_u3uu:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_u3uu:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3u6u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3u6u:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3u6u:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3uu7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3uu7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3uu7:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_35u5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s5, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_35u5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_35u5:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_357u:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_357u:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x3020706
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_357u:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x3020706
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0101:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0101:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0101:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0123:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0123:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0123:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0145:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0145:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0145:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0167:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0167:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0167:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2301:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2301:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2301:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2323:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2323:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2323:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2345:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2345:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2345:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <2, 3, 6, 7>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 2-3
+; of %val0 followed by elements 2-3 of %val1. In every expected sequence below
+; both loads are single-dword loads at offset:4.
+define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2367:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2367:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2367:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <4, 5, 0, 1>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 0-1
+; of %val1 followed by elements 0-1 of %val0. In every expected sequence below
+; both loads are single-dword loads with no offset.
+define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4501:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
+; GFX9-NEXT: global_load_dword v5, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4501:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4501:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <4, 5, 2, 3>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 0-1
+; of %val1 followed by elements 2-3 of %val0. Every expected sequence below
+; uses one dword load with no offset and one dword load at offset:4.
+define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4523:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4523:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4523:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle with mask <4, 5, 4, 5>. Mask values 4-7 index %val1, so the result
+; is elements 0-1 of %val1 repeated twice; %val0 is loaded but not referenced
+; by the mask. Every expected sequence below contains a single dword load and
+; ends by copying v0 into v1.
+define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4545:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4545:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4545:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle with mask <4, 5, 6, 7>. Mask values 4-7 index %val1, so the result
+; is all four elements of %val1 in their original order; %val0 is loaded but
+; not referenced by the mask. Every expected sequence below is a single 64-bit
+; load followed by a wait and the return, with no ALU instructions.
+define <4 x bfloat> @shuffle_v4bf16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_4567:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_4567:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_4567:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <6, 7, 0, 1>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 2-3
+; of %val1 followed by elements 0-1 of %val0. Every expected sequence below
+; uses one dword load at offset:4 and one dword load with no offset.
+define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6701:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v5, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6701:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6701:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <6, 7, 2, 3>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 2-3
+; of %val1 followed by elements 2-3 of %val0. In every expected sequence below
+; both loads are single-dword loads at offset:4.
+define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6723:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6723:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6723:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle with mask <6, 7, 4, 5>. Mask values 4-7 index %val1, so the result
+; is elements 2-3 of %val1 followed by elements 0-1 of %val1 (its two halves
+; swapped); %val0 is loaded but not referenced by the mask. Every expected
+; sequence below contains a single 64-bit load.
+define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6745:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6745:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6745:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle with mask <6, 7, 6, 7>. Mask values 4-7 index %val1, so the result
+; is elements 2-3 of %val1 repeated twice; %val0 is loaded but not referenced
+; by the mask. Every expected sequence below contains a single dword load at
+; offset:4 and ends by copying v0 into v1.
+define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6767:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6767:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6767:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <2, 3, 5, 6>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 2-3
+; of %val0 followed by elements 1 and 2 of %val1. Every expected sequence
+; below pairs a dword load at offset:4 with a 64-bit load and applies
+; v_alignbit_b32 with a shift of 16 to the two registers of the 64-bit load.
+define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2356:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX9-NEXT: v_and_or_b32 v0, v6, s4, v0
+; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2356:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v6, v0
+; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2356:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <5, 6, 2, 3>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 1 and
+; 2 of %val1 followed by elements 2-3 of %val0. Every expected sequence below
+; pairs a 64-bit load with a dword load at offset:4 and applies
+; v_alignbit_b32 with a shift of 16 to the two registers of the 64-bit load.
+define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5623:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5623:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5623:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <3, 4, 5, 6>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is element 3 of
+; %val0 followed by elements 0, 1 and 2 of %val1. Every expected sequence
+; below pairs a dword load at offset:4 with a 64-bit load and contains two
+; v_alignbit_b32 instructions with a shift of 16.
+define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_3456:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v2, v4, v6, 16
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX9-NEXT: v_perm_b32 v1, v4, v0, s5
+; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_3456:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v2, v5, v4, 16
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_3456:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <5, 6, 3, 4>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 1 and
+; 2 of %val1, then element 3 of %val0, then element 0 of %val1. Every expected
+; sequence below pairs a dword load at offset:4 with a 64-bit load and
+; contains two v_alignbit_b32 instructions with a shift of 16.
+define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5634:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v4, v0, s5
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5634:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5634:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16
+; GFX11-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <5, 7, 3, 4>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is elements 1 and
+; 3 of %val1, then element 3 of %val0, then element 0 of %val1. Every expected
+; sequence below pairs a dword load at offset:4 with a 64-bit load and uses
+; v_perm_b32 with selector 0x3020706 twice.
+define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_5734:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4
+; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_5734:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_5734:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4
+; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16
+; GFX11-NEXT: v_perm_b32 v1, v0, v1, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle with an all-zero mask (zeroinitializer): every result lane is
+; element 0 of %val0; %val1 is loaded but not referenced by the mask. Every
+; expected sequence below contains a single dword load, a left shift by 16
+; merged with the low 16 bits of the same register, and a copy of v0 into v1.
+define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0000:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0000:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0000:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> zeroinitializer
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle with mask <1, 0, 1, 0>. Mask values 0-3 index %val0, so the result
+; is elements 1 and 0 of %val0 (swapped), repeated twice; %val1 is loaded but
+; not referenced by the mask. Every expected sequence below contains a single
+; dword load, a v_alignbit_b32 of the register with itself by 16, and a copy
+; of v0 into v1.
+define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_1010:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_1010:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_1010:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle with mask <1, 1, 0, 0>. Mask values 0-3 index %val0, so the result
+; is element 1 of %val0 twice followed by element 0 of %val0 twice; %val1 is
+; loaded but not referenced by the mask. Every expected sequence below
+; contains a single dword load and three v_perm_b32 instructions with
+; selectors 0x7060706, 0x3020706 and 0x3020504.
+define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_1100:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_mov_b32 s5, 0x7060706
+; GFX9-NEXT: s_mov_b32 s6, 0x3020706
+; GFX9-NEXT: s_mov_b32 s7, 0x3020504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX9-NEXT: v_perm_b32 v2, v1, v1, s5
+; GFX9-NEXT: v_and_or_b32 v3, v1, s4, v0
+; GFX9-NEXT: v_perm_b32 v0, v1, v2, s6
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_1100:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_perm_b32 v2, v1, v1, 0x7060706
+; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
+; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x3020504
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_1100:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: v_perm_b32 v2, v1, v1, 0x7060706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
+; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x3020706
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x3020504
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle of two loaded <4 x bfloat> vectors with mask <6, 1, 6, 1>. Mask
+; values 0-3 index %val0 and 4-7 index %val1, so the result is element 2 of
+; %val1 followed by element 1 of %val0, repeated twice. Every expected
+; sequence below keeps the high 16 bits of the dword loaded with no offset,
+; merges in the low 16 bits of the dword loaded at offset:4, and copies v0
+; into v1.
+define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6161:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6161:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6161:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle two loaded <4 x bfloat> vectors with mask <2, 3, 3, 3>.
+; Every mask index is below 4, so only %val0 contributes: the result is
+; <%val0[2], %val0[3], %val0[3], %val0[3]>. %val1 is loaded but never selected
+; by the mask.
+define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_2333:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x7060706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_2333:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_2333:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Shuffle two loaded <4 x bfloat> vectors with mask <2, 3, 3, 3>, giving
+; <%val0[2], %val0[3], %val0[3], %val0[3]>.
+; NOTE(review): the function name suggests mask <6, 6, 6, 7> (elements taken
+; from %val1), but the mask actually used is <2, 3, 3, 3>, which makes this
+; body and its expected output the same as shuffle_v4bf16_2333. If the mask
+; is corrected, the expected-assembly lines below must be regenerated.
+define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_6667:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x7060706
+; GFX9-NEXT: s_mov_b32 s5, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_6667:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_6667:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Narrowing shuffle: two <8 x bfloat> inputs, <4 x bfloat> result.
+; Mask <0, 1, 0, 1> yields <%val0[0], %val0[1], %val0[0], %val0[1]>, i.e. the
+; first element pair of %val0 repeated twice. %val1 is loaded but never
+; selected by the mask.
+define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_0101:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_0101:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_0101:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x bfloat> %shuffle
+}
+
+; Narrowing shuffle: two <8 x bfloat> inputs, <4 x bfloat> result.
+; Mask <0, 1, 2, 3> extracts the first four elements of %val0 in order.
+; %val1 is loaded but never selected by the mask. The expected output for
+; every target is a single 64-bit load with no shuffle instructions.
+define <4 x bfloat> @shuffle_v8bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_0123:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_0123:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_0123:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Narrowing shuffle: two <8 x bfloat> inputs, <4 x bfloat> result.
+; Mask indices 0-7 select from %val0 and 8-15 from %val1, so mask
+; <4, 5, 8, 9> yields <%val0[4], %val0[5], %val1[0], %val1[1]>: one aligned
+; element pair from each input.
+define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_4589:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0
+; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_4589:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_4589:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9>
+ ret <4 x bfloat> %shuffle
+}
+
+; Narrowing shuffle: two <8 x bfloat> inputs, <4 x bfloat> result.
+; Mask <10, 11, 2, 3> yields <%val1[2], %val1[3], %val0[2], %val0[3]>: the
+; second element pair of %val1 followed by the second element pair of %val0.
+define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_10_11_2_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Narrowing shuffle: two <8 x bfloat> inputs, <4 x bfloat> result.
+; Mask <13, 14, 2, 3> yields <%val1[5], %val1[6], %val0[2], %val0[3]>. Unlike
+; the 10_11_2_3 variant, the pair taken from %val1 starts at an odd element
+; index, so it straddles two adjacent element pairs of %val1.
+define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
+; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_13_14_2_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8
+; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
+ ret <4 x bfloat> %shuffle
+}
+
+; Widening shuffle: two <3 x bfloat> inputs, <4 x bfloat> result.
+; Mask indices 0-2 select from %val0, so mask <0, 1, 2, 2> yields
+; <%val0[0], %val0[1], %val0[2], %val0[2]>: all of %val0 with its last element
+; duplicated. %val1 is loaded but never selected by the mask.
+define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v3bf16_0122:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v3bf16_0122:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v3bf16_0122:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <3 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <3 x bfloat> %val0, <3 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ ret <4 x bfloat> %shuffle
+}
+
+; Widening shuffle: two <2 x bfloat> inputs, <4 x bfloat> result.
+; Mask <0, 1, 1, 0> yields <%val0[0], %val0[1], %val0[1], %val0[0]>: %val0
+; followed by %val0 with its elements swapped. %val1 is loaded but never
+; selected by the mask.
+; NOTE(review): the function name says 0122, but the mask used is
+; <0, 1, 1, 0>. With 2-element inputs, index 2 would select %val1[0]. Confirm
+; which one is intended before changing either; changing the mask requires
+; regenerating the expected-assembly lines below.
+define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v2bf16_0122:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v2bf16_0122:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v2bf16_0122:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v1, v0, v0, 16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <2 x bfloat> %val0, <2 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+ ret <4 x bfloat> %shuffle
+}
+
+; Same-width shuffle of two <6 x bfloat> inputs into a <6 x bfloat> result.
+; Mask indices 0-5 select from %val0 and 6-11 from %val1, so mask
+; <4, 5, 2, 3, 6, 7> yields
+; <%val0[4], %val0[5], %val0[2], %val0[3], %val1[0], %val1[1]>: three aligned
+; element pairs, the last one taken from %val1.
+define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v6bf16_452367:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
+; GFX9-NEXT: global_load_dword v7, v[3:4], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v6bf16_452367:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off
+; GFX10-NEXT: global_load_dword v7, v[3:4], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, v7
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v6bf16_452367:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
+; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[3:4], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <6 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <6 x bfloat> %val0, <6 x bfloat> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, i32 7>
+ ret <6 x bfloat> %shuffle
+}
+
+; Kernel combining shufflevector with <2 x bfloat> fma calls.
+; Using the workitem id as index i, it loads one <4 x bfloat> each from %A,
+; %B and %C, then computes, on 2-element halves (lo = elements 0-1,
+; hi = elements 2-3):
+;   out.lo = fma(splat(A[1]), B.hi, fma(splat(A[0]), B.lo, C.lo))
+;   out.hi = fma(splat(A[3]), B.hi, fma(splat(A[2]), B.lo, C.hi))
+; and stores <out.lo, out.hi> back to the same slot of %C.
+; The declarations of the two intrinsics called here are outside this
+; function.
+define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) {
+; GFX9-LABEL: fma_shuffle_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1]
+; GFX9-NEXT: s_mov_b32 s0, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_fma_f32 v7, v9, v8, v7
+; GFX9-NEXT: v_fma_f32 v0, v9, v2, v0
+; GFX9-NEXT: v_fma_f32 v8, v11, v8, v12
+; GFX9-NEXT: v_fma_f32 v1, v11, v2, v1
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v8
+; GFX9-NEXT: v_fma_f32 v0, v4, v10, v0
+; GFX9-NEXT: v_fma_f32 v2, v4, v3, v2
+; GFX9-NEXT: v_fma_f32 v1, v5, v10, v1
+; GFX9-NEXT: v_fma_f32 v3, v5, v3, v7
+; GFX9-NEXT: v_perm_b32 v0, v2, v0, s0
+; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: fma_shuffle_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
+; GFX10-NEXT: v_fmac_f32_e32 v0, v9, v2
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v2
+; GFX10-NEXT: v_fmac_f32_e32 v1, v11, v8
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_fmac_f32_e32 v0, v4, v10
+; GFX10-NEXT: v_fmac_f32_e32 v5, v4, v3
+; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v10
+; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v3
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX10-NEXT: v_perm_b32 v1, v1, v7, 0x3020706
+; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: fma_shuffle_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3]
+; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_fmac_f32_e32 v1, v11, v8
+; GFX11-NEXT: v_dual_fmac_f32 v12, v11, v2 :: v_dual_lshlrev_b32 v7, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_fmac_f32 v0, v9, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX11-NEXT: v_dual_fmac_f32 v1, v2, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v0, v4, v10
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fmac_f32_e32 v5, v4, v3
+; GFX11-NEXT: v_fmac_f32_e32 v7, v2, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x3020706
+; GFX11-NEXT: v_perm_b32 v1, v1, v7, 0x3020706
+; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ ; Index all three arrays with the zero-extended workitem id.
+ %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp12 = zext i32 %tmp1 to i64
+ %arrayidx = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %A, i64 %tmp12
+ %tmp14 = load <4 x bfloat>, ptr addrspace(1) %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %B, i64 %tmp12
+ %tmp15 = load <4 x bfloat>, ptr addrspace(1) %arrayidx1, align 8
+ %arrayidx2 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %C, i64 %tmp12
+ %tmp16 = load <4 x bfloat>, ptr addrspace(1) %arrayidx2, align 8
+ ; Low half: %tmp17 = splat(A[0]) (zeroinitializer mask), %tmp18 = B.lo,
+ ; %tmp19 = C.lo, %tmp21 = splat(A[1]), %tmp22 = B.hi.
+ %tmp17 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> zeroinitializer
+ %tmp18 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+ %tmp19 = shufflevector <4 x bfloat> %tmp16, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1>
+ %tmp20 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp17, <2 x bfloat> %tmp18, <2 x bfloat> %tmp19)
+ %tmp21 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 1, i32 1>
+ %tmp22 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
+ %tmp23 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp21, <2 x bfloat> %tmp22, <2 x bfloat> %tmp20)
+ ; Widen the low-half result to 4 elements and pair it with C.hi, so that
+ ; %tmp25 = <%tmp23[0], %tmp23[1], C[2], C[3]>.
+ %tmp24 = shufflevector <2 x bfloat> %tmp23, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %tmp25 = shufflevector <4 x bfloat> %tmp24, <4 x bfloat> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ; High half: %tmp26 = splat(A[2]), %tmp27 = C.hi (elements 2-3 of %tmp25),
+ ; %tmp29 = splat(A[3]).
+ %tmp26 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 2, i32 2>
+ %tmp27 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3>
+ %tmp28 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp26, <2 x bfloat> %tmp18, <2 x bfloat> %tmp27)
+ %tmp29 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 3, i32 3>
+ %tmp30 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp29, <2 x bfloat> %tmp22, <2 x bfloat> %tmp28)
+ ; Concatenate the two 2-element results and write them back to %C.
+ %tmp31 = shufflevector <2 x bfloat> %tmp30, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %tmp32 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ store <4 x bfloat> %tmp32, ptr addrspace(1) %arrayidx2, align 8
+ ret void
+}
+
+; Shuffle two loaded <4 x bfloat> vectors with mask <0, 4, 5, 6>.
+; Mask indices 0-3 select from %val0 and 4-7 from %val1, so the result is
+; <%val0[0], %val1[0], %val1[1], %val1[2]>: one element of %val0 followed by
+; the first three elements of %val1.
+define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; GFX9-LABEL: shuffle_v4bf16_0456:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX9-NEXT: global_load_dword v6, v[0:1], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1000504
+; GFX9-NEXT: s_mov_b32 s5, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4
+; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v4bf16_0456:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
+; GFX10-NEXT: global_load_dword v6, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x1000504
+; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v4bf16_0456:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x1000504
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+ ret <4 x bfloat> %shuffle
+}
+
+; Build a <2 x bfloat> from element 0 of each loaded input.
+; The first shuffle keeps %0[0] in lane 0 (lane 1 undefined); the second, with
+; mask <0, 2>, appends %1[0]. Result: <%0[0], %1[0]>.
+define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16bits:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x1000504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: low16bits:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x1000504
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: low16bits:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x1000504
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+; Build a <2 x bfloat> from element 1 of each loaded input.
+; The first shuffle moves %0[1] into lane 0 (lane 1 undefined); the second,
+; with mask <0, 3>, appends %1[1]. Result: <%0[1], %1[1]>.
+define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16bits_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020706
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: hi16bits_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: hi16bits_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+; Build a <2 x bfloat> from element 0 of the first input and element 1 of the
+; second. The first shuffle keeps %0[0] in lane 0 (lane 1 undefined); the
+; second, with mask <0, 3>, appends %1[1]. Result: <%0[0], %1[1]>.
+define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: low16hi16bits_v2bf16:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_mov_b32 s4, 0x3020504
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: low16hi16bits_v2bf16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020504
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: low16hi16bits_v2bf16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020504
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+; Build a <2 x bfloat> from element 1 of the first input and element 0 of the
+; second. The first shuffle moves %0[1] into lane 0 (lane 1 undefined); the
+; second, with mask <0, 2>, appends %1[0]. Result: <%0[1], %1[0]>.
+define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) {
+; GFX9-LABEL: hi16low16bits:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v4, v[0:1], off
+; GFX9-NEXT: global_load_dword v5, v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: hi16low16bits:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v5, v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: hi16low16bits:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_b32 v1, v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4
+ %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef>
+ %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2>
+ ret <2 x bfloat> %vy1.2.vec.insert
+}
+
+define <2 x bfloat> @v2bfloat_hi16bits(ptr addrspace(1) %x0) {
+; GFX9-LABEL: v2bfloat_hi16bits:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v2bfloat_hi16bits:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2bfloat_hi16bits:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %load0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4
+ %insert1 = insertelement <2 x bfloat> undef, bfloat 0.0, i32 0
+ %insert2 = insertelement <2 x bfloat> %insert1, bfloat 0.0, i32 1
+ %vec.ret = shufflevector <2 x bfloat> %insert2, <2 x bfloat> %load0, <2 x i32> <i32 0, i32 3>
+ ret <2 x bfloat> %vec.ret
+}
+
+define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v8bf16_concat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v8bf16_concat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v8bf16_concat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ store <8 x bfloat> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v16bf16_concat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v16bf16_concat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v16bf16_concat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ store <16 x bfloat> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
+; GFX9-LABEL: shuffle_v32bf16_concat:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
+; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
+; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: shuffle_v32bf16_concat:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
+; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
+; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(3)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32
+; GFX10-NEXT: s_waitcnt vmcnt(2)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: shuffle_v32bf16_concat:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
+; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0
+ %val1 = load <16 x bfloat>, ptr addrspace(1) %arg1
+ %shuffle = shufflevector <16 x bfloat> %val0, <16 x bfloat> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ store <32 x bfloat> %shuffle, ptr addrspace(1) %out
+ ret void
+}
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
+declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssd.txt b/llvm/test/MC/Disassembler/X86/apx/wrssd.txt
new file mode 100644
index 000000000000..600e85e1440e
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrssd.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrssd %r18d, 291(%r28,%r29,4)
+# INTEL: wrssd dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssq.txt b/llvm/test/MC/Disassembler/X86/apx/wrssq.txt
new file mode 100644
index 000000000000..9f5b26321fd2
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrssq.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrssq %r19, 291(%r28,%r29,4)
+# INTEL: wrssq qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussd.txt b/llvm/test/MC/Disassembler/X86/apx/wrussd.txt
new file mode 100644
index 000000000000..1b8b0007e2d3
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrussd.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrussd %r18d, 291(%r28,%r29,4)
+# INTEL: wrussd dword ptr [r28 + 4*r29 + 291], r18d
+0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussq.txt b/llvm/test/MC/Disassembler/X86/apx/wrussq.txt
new file mode 100644
index 000000000000..7ff51f617c5c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/wrussq.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT: wrussq %r19, 291(%r28,%r29,4)
+# INTEL: wrussq qword ptr [r28 + 4*r29 + 291], r19
+0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00
diff --git a/llvm/test/MC/X86/apx/wrssd-att.s b/llvm/test/MC/X86/apx/wrssd-att.s
new file mode 100644
index 000000000000..409b3010f5c7
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssd-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrssd %r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrssd %r18d, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrssd-intel.s b/llvm/test/MC/X86/apx/wrssd-intel.s
new file mode 100644
index 000000000000..1d402f2c5177
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssd-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrssd dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrssd dword ptr [r28 + 4*r29 + 291], r18d
diff --git a/llvm/test/MC/X86/apx/wrssq-att.s b/llvm/test/MC/X86/apx/wrssq-att.s
new file mode 100644
index 000000000000..1f616ac2e4e4
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssq-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrssq %r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrssq %r19, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrssq-intel.s b/llvm/test/MC/X86/apx/wrssq-intel.s
new file mode 100644
index 000000000000..d31dca55ca4a
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrssq-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrssq qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrssq qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/test/MC/X86/apx/wrussd-att.s b/llvm/test/MC/X86/apx/wrussd-att.s
new file mode 100644
index 000000000000..269d9a8aa858
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussd-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrussd %r18d, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrussd %r18d, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrussd-intel.s b/llvm/test/MC/X86/apx/wrussd-intel.s
new file mode 100644
index 000000000000..fed6eb10d4ad
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussd-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrussd dword ptr [r28 + 4*r29 + 291], r18d
+# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00]
+ wrussd dword ptr [r28 + 4*r29 + 291], r18d
diff --git a/llvm/test/MC/X86/apx/wrussq-att.s b/llvm/test/MC/X86/apx/wrussq-att.s
new file mode 100644
index 000000000000..b41360cd9db0
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussq-att.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-1: error:
+# ERROR-NOT: error:
+# CHECK: wrussq %r19, 291(%r28,%r29,4)
+# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrussq %r19, 291(%r28,%r29,4)
diff --git a/llvm/test/MC/X86/apx/wrussq-intel.s b/llvm/test/MC/X86/apx/wrussq-intel.s
new file mode 100644
index 000000000000..a9a96da9d3d1
--- /dev/null
+++ b/llvm/test/MC/X86/apx/wrussq-intel.s
@@ -0,0 +1,5 @@
+# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+# CHECK: wrussq qword ptr [r28 + 4*r29 + 291], r19
+# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00]
+ wrussq qword ptr [r28 + 4*r29 + 291], r19
diff --git a/llvm/utils/chunk-print-before-all.py b/llvm/utils/chunk-print-before-all.py
index fe0eaaea1c20..fef8eb64c540 100755
--- a/llvm/utils/chunk-print-before-all.py
+++ b/llvm/utils/chunk-print-before-all.py
@@ -30,13 +30,13 @@ def print_chunk(lines, prefix, pass_name):
is_dump = False
cur = []
for line in sys.stdin:
- if line.startswith("*** IR Dump Before "):
+ if "*** IR Dump Before " in line:
if len(cur) != 0:
print_chunk(cur, "before", pass_name)
cur = []
cur.append("; " + line)
pass_name = get_pass_name(line, "Before")
- elif line.startswith("*** IR Dump After "):
+ elif "*** IR Dump After " in line:
if len(cur) != 0:
print_chunk(cur, "after", pass_name)
cur = []
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 423118f79e73..40d874dc99dd 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -1582,22 +1582,27 @@ def Vector_LoadOp : Vector_Op<"load"> {
vector. If the memref element type is vector, it should match the result
vector type.
- Example 1: 1-D vector load on a scalar memref.
+ Example: 0-D vector load on a scalar memref.
+ ```mlir
+ %result = vector.load %base[%i, %j] : memref<100x100xf32>, vector<f32>
+ ```
+
+ Example: 1-D vector load on a scalar memref.
```mlir
%result = vector.load %base[%i, %j] : memref<100x100xf32>, vector<8xf32>
```
- Example 2: 1-D vector load on a vector memref.
+ Example: 1-D vector load on a vector memref.
```mlir
%result = vector.load %memref[%i, %j] : memref<200x100xvector<8xf32>>, vector<8xf32>
```
- Example 3: 2-D vector load on a scalar memref.
+ Example: 2-D vector load on a scalar memref.
```mlir
%result = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<4x8xf32>
```
- Example 4: 2-D vector load on a vector memref.
+ Example: 2-D vector load on a vector memref.
```mlir
%result = vector.load %memref[%i, %j] : memref<200x100xvector<4x8xf32>>, vector<4x8xf32>
```
@@ -1608,12 +1613,12 @@ def Vector_LoadOp : Vector_Op<"load"> {
loaded out of bounds. Not all targets may support out-of-bounds vector
loads.
- Example 5: Potential out-of-bound vector load.
+ Example: Potential out-of-bound vector load.
```mlir
%result = vector.load %memref[%index] : memref<?xf32>, vector<8xf32>
```
- Example 6: Explicit out-of-bound vector load.
+ Example: Explicit out-of-bound vector load.
```mlir
%result = vector.load %memref[%c0] : memref<7xf32>, vector<8xf32>
```
@@ -1622,7 +1627,7 @@ def Vector_LoadOp : Vector_Op<"load"> {
let arguments = (ins Arg<AnyMemRef, "the reference to load from",
[MemRead]>:$base,
Variadic<Index>:$indices);
- let results = (outs AnyVector:$result);
+ let results = (outs AnyVectorOfAnyRank:$result);
let extraClassDeclaration = [{
MemRefType getMemRefType() {
@@ -1660,22 +1665,27 @@ def Vector_StoreOp : Vector_Op<"store"> {
to store. If the memref element type is vector, it should match the type
of the value to store.
- Example 1: 1-D vector store on a scalar memref.
+ Example: 0-D vector store on a scalar memref.
+ ```mlir
+ vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ ```
+
+ Example: 1-D vector store on a scalar memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<8xf32>
```
- Example 2: 1-D vector store on a vector memref.
+ Example: 1-D vector store on a vector memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xvector<8xf32>>, vector<8xf32>
```
- Example 3: 2-D vector store on a scalar memref.
+ Example: 2-D vector store on a scalar memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<4x8xf32>
```
- Example 4: 2-D vector store on a vector memref.
+ Example: 2-D vector store on a vector memref.
```mlir
vector.store %valueToStore, %memref[%i, %j] : memref<200x100xvector<4x8xf32>>, vector<4x8xf32>
```
@@ -1685,21 +1695,23 @@ def Vector_StoreOp : Vector_Op<"store"> {
target-specific. No assumptions should be made on the memory written out of
bounds. Not all targets may support out-of-bounds vector stores.
- Example 5: Potential out-of-bounds vector store.
+ Example: Potential out-of-bounds vector store.
```mlir
vector.store %valueToStore, %memref[%index] : memref<?xf32>, vector<8xf32>
```
- Example 6: Explicit out-of-bounds vector store.
+ Example: Explicit out-of-bounds vector store.
```mlir
vector.store %valueToStore, %memref[%c0] : memref<7xf32>, vector<8xf32>
```
}];
- let arguments = (ins AnyVector:$valueToStore,
+ let arguments = (ins
+ AnyVectorOfAnyRank:$valueToStore,
Arg<AnyMemRef, "the reference to store to",
[MemWrite]>:$base,
- Variadic<Index>:$indices);
+ Variadic<Index>:$indices
+ );
let extraClassDeclaration = [{
MemRefType getMemRefType() {
diff --git a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
index 05a069d98ef3..05b813a3b1e9 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp
@@ -8,10 +8,14 @@
#include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Interfaces/RuntimeVerifiableOpInterface.h"
using namespace mlir;
@@ -21,6 +25,12 @@ static std::string generateErrorMessage(Operation *op, const std::string &msg) {
std::string buffer;
llvm::raw_string_ostream stream(buffer);
OpPrintingFlags flags;
+ // We may generate a lot of error messages and so we need to ensure the
+ // printing is fast.
+ flags.elideLargeElementsAttrs();
+ flags.printGenericOpForm();
+ flags.skipRegions();
+ flags.useLocalScope();
stream << "ERROR: Runtime op verification failed\n";
op->print(stream, flags);
stream << "\n^ " << msg;
@@ -133,6 +143,161 @@ struct CastOpInterface
}
};
+/// Verifies that the indices on load/store ops are in-bounds of the memref's
+/// index space: 0 <= index#i < dim#i (both checked with signed comparisons).
+template <typename LoadStoreOp>
+struct LoadStoreOpInterface
+ : public RuntimeVerifiableOpInterface::ExternalModel<
+ LoadStoreOpInterface<LoadStoreOp>, LoadStoreOp> {
+ void generateRuntimeVerification(Operation *op, OpBuilder &builder,
+ Location loc) const {
+ auto loadStoreOp = cast<LoadStoreOp>(op);
+
+ auto memref = loadStoreOp.getMemref();
+ auto rank = memref.getType().getRank();
+ if (rank == 0) { // No per-dimension index to check; emit nothing.
+ return;
+ }
+ auto indices = loadStoreOp.getIndices();
+
+ auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+ Value assertCond; // Running conjunction of the per-dimension checks.
+ for (auto i : llvm::seq<int64_t>(0, rank)) {
+ auto index = indices[i];
+
+ auto dimOp = builder.createOrFold<memref::DimOp>(loc, memref, i); // dim#i
+
+ auto geLow = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sge, index, zero); // 0 <= index#i
+ auto ltHigh = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::slt, index, dimOp); // index#i < dim#i
+ auto andOp = builder.createOrFold<arith::AndIOp>(loc, geLow, ltHigh);
+
+ assertCond =
+ i > 0 ? builder.createOrFold<arith::AndIOp>(loc, assertCond, andOp)
+ : andOp; // The first dimension seeds the conjunction.
+ }
+ builder.create<cf::AssertOp>( // A single assert covers every dimension.
+ loc, assertCond, generateErrorMessage(op, "out-of-bounds access"));
+ }
+};
+
+/// Compute the linear index for the provided strided layout and indices and return it as a Value.
+Value computeLinearIndex(OpBuilder &builder, Location loc, OpFoldResult offset,
+ ArrayRef<OpFoldResult> strides,
+ ArrayRef<OpFoldResult> indices) {
+ auto [expr, values] = computeLinearIndex(offset, strides, indices); // 3-argument overload, not visible in this hunk.
+ auto index =
+ affine::makeComposedFoldedAffineApply(builder, loc, expr, values);
+ return getValueOrCreateConstantIndexOp(builder, loc, index); // Turn the (possibly folded) result into a Value.
+}
+
+/// Returns two Values representing the bounds of the provided strided layout
+/// metadata. The bounds are returned as a half open interval -- [low, high).
+std::pair<Value, Value> computeLinearBounds(OpBuilder &builder, Location loc,
+ OpFoldResult offset,
+ ArrayRef<OpFoldResult> strides,
+ ArrayRef<OpFoldResult> sizes) {
+ auto zeros = SmallVector<int64_t>(sizes.size(), 0); // One zero index per dimension.
+ auto indices = getAsIndexOpFoldResult(builder.getContext(), zeros);
+ auto lowerBound = computeLinearIndex(builder, loc, offset, strides, indices); // Linear index of the all-zero index.
+ auto upperBound = computeLinearIndex(builder, loc, offset, strides, sizes); // NOTE(review): `sizes` is passed as the index, so `high` is the linear index at index#i == size#i; confirm this is the intended exclusive bound for rank > 1.
+ return {lowerBound, upperBound};
+}
+
+/// Returns two Values representing the bounds of the memref. The bounds are
+/// returned as a half open interval -- [low, high).
+std::pair<Value, Value> computeLinearBounds(OpBuilder &builder, Location loc,
+ TypedValue<BaseMemRefType> memref) {
+ auto runtimeMetadata = builder.create<ExtractStridedMetadataOp>(loc, memref); // Emits a new op to query the memref's layout.
+ auto offset = runtimeMetadata.getConstifiedMixedOffset();
+ auto strides = runtimeMetadata.getConstifiedMixedStrides();
+ auto sizes = runtimeMetadata.getConstifiedMixedSizes();
+ return computeLinearBounds(builder, loc, offset, strides, sizes); // Delegates to the strided-layout overload.
+}
+
+/// Verifies that the linear bounds of a reinterpret_cast op are within the
+/// linear bounds of the base memref: low >= baseLow && high <= baseHigh
+struct ReinterpretCastOpInterface
+ : public RuntimeVerifiableOpInterface::ExternalModel<
+ ReinterpretCastOpInterface, ReinterpretCastOp> {
+ void generateRuntimeVerification(Operation *op, OpBuilder &builder,
+ Location loc) const {
+ auto reinterpretCast = cast<ReinterpretCastOp>(op);
+ auto baseMemref = reinterpretCast.getSource();
+ auto resultMemref =
+ cast<TypedValue<BaseMemRefType>>(reinterpretCast.getResult());
+
+ builder.setInsertionPointAfter(op); // The checks read the op's result, so they are emitted after it.
+
+ // Compute the linear bounds of the base memref
+ auto [baseLow, baseHigh] = computeLinearBounds(builder, loc, baseMemref);
+
+ // Compute the linear bounds of the resulting memref
+ auto [low, high] = computeLinearBounds(builder, loc, resultMemref);
+
+ // Check low >= baseLow
+ auto geLow = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sge, low, baseLow);
+
+ // Check high <= baseHigh
+ auto leHigh = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sle, high, baseHigh);
+
+ auto assertCond = builder.createOrFold<arith::AndIOp>(loc, geLow, leHigh); // Both bounds must hold.
+
+ builder.create<cf::AssertOp>(
+ loc, assertCond,
+ generateErrorMessage(
+ op,
+ "result of reinterpret_cast is out-of-bounds of the base memref"));
+ }
+};
+
+/// Verifies that the linear bounds of a subview op are within the linear bounds
+/// of the base memref: low >= baseLow && high <= baseHigh
+/// TODO: This is not yet a full runtime verification of subview. For example,
+/// consider:
+/// %m = memref.alloc(%c10, %c10) : memref<?x?xf32>
+/// memref.subview %m[%c0, %c0][%c2, %c20][%c1, %c1]
+/// : memref<?x?xf32> to memref<?x?xf32>
+/// The subview is in-bounds of the entire base memref but the second dimension
+/// is out-of-bounds. Future work would verify the bounds on a per-dimension
+/// basis.
+struct SubViewOpInterface
+ : public RuntimeVerifiableOpInterface::ExternalModel<SubViewOpInterface,
+ SubViewOp> {
+ void generateRuntimeVerification(Operation *op, OpBuilder &builder,
+ Location loc) const {
+ auto subView = cast<SubViewOp>(op);
+ auto baseMemref = cast<TypedValue<BaseMemRefType>>(subView.getSource());
+ auto resultMemref = cast<TypedValue<BaseMemRefType>>(subView.getResult());
+
+ builder.setInsertionPointAfter(op); // The checks read the op's result, so they are emitted after it.
+
+ // Compute the linear bounds of the base memref
+ auto [baseLow, baseHigh] = computeLinearBounds(builder, loc, baseMemref);
+
+ // Compute the linear bounds of the resulting memref
+ auto [low, high] = computeLinearBounds(builder, loc, resultMemref);
+
+ // Check low >= baseLow
+ auto geLow = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sge, low, baseLow);
+
+ // Check high <= baseHigh
+ auto leHigh = builder.createOrFold<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::sle, high, baseHigh);
+
+ auto assertCond = builder.createOrFold<arith::AndIOp>(loc, geLow, leHigh); // Both bounds must hold.
+
+ builder.create<cf::AssertOp>(
+ loc, assertCond,
+ generateErrorMessage(op,
+ "subview is out-of-bounds of the base memref"));
+ }
+};
+
struct ExpandShapeOpInterface
: public RuntimeVerifiableOpInterface::ExternalModel<ExpandShapeOpInterface,
ExpandShapeOp> {
@@ -183,8 +348,13 @@ void mlir::memref::registerRuntimeVerifiableOpInterfaceExternalModels(
registry.addExtension(+[](MLIRContext *ctx, memref::MemRefDialect *dialect) {
CastOp::attachInterface<CastOpInterface>(*ctx);
ExpandShapeOp::attachInterface<ExpandShapeOpInterface>(*ctx);
+ LoadOp::attachInterface<LoadStoreOpInterface<LoadOp>>(*ctx);
+ ReinterpretCastOp::attachInterface<ReinterpretCastOpInterface>(*ctx);
+ StoreOp::attachInterface<LoadStoreOpInterface<StoreOp>>(*ctx);
+ SubViewOp::attachInterface<SubViewOpInterface>(*ctx);
// Load additional dialects of which ops may get created.
- ctx->loadDialect<arith::ArithDialect, cf::ControlFlowDialect>();
+ ctx->loadDialect<affine::AffineDialect, arith::ArithDialect,
+ cf::ControlFlowDialect>();
});
}
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index d80392ebd87b..7ea0197bdecb 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -2059,6 +2059,36 @@ func.func @vector_store_op_index(%memref : memref<200x100xindex>, %i : index, %j
// -----
+func.func @vector_load_op_0d(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<f32> {
+ %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ return %0 : vector<f32>
+}
+
+// CHECK-LABEL: func @vector_load_op_0d
+// CHECK: %[[load:.*]] = memref.load %{{.*}}[%{{.*}}, %{{.*}}]
+// CHECK: %[[vec:.*]] = llvm.mlir.undef : vector<1xf32>
+// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK: %[[inserted:.*]] = llvm.insertelement %[[load]], %[[vec]][%[[c0]] : i32] : vector<1xf32>
+// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[inserted]] : vector<1xf32> to vector<f32>
+// CHECK: return %[[cast]] : vector<f32>
+
+// -----
+
+func.func @vector_store_op_0d(%memref : memref<200x100xf32>, %i : index, %j : index) {
+ %val = arith.constant dense<11.0> : vector<f32>
+ vector.store %val, %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ return
+}
+
+// CHECK-LABEL: func @vector_store_op_0d
+// CHECK: %[[val:.*]] = arith.constant dense<1.100000e+01> : vector<f32>
+// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[val]] : vector<f32> to vector<1xf32>
+// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[extracted:.*]] = llvm.extractelement %[[cast]][%[[c0]] : i64] : vector<1xf32>
+// CHECK: memref.store %[[extracted]], %{{.*}}[%{{.*}}, %{{.*}}]
+
+// -----
+
func.func @masked_load_op(%arg0: memref<?xf32>, %arg1: vector<16xi1>, %arg2: vector<16xf32>) -> vector<16xf32> {
%c0 = arith.constant 0: index
%0 = vector.maskedload %arg0[%c0], %arg1, %arg2 : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 9f1ec21cdabf..03532c5c1ceb 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -725,6 +725,16 @@ func.func @flat_transpose_int(%arg0: vector<16xi32>) -> vector<16xi32> {
return %0 : vector<16xi32>
}
+// CHECK-LABEL: @vector_load_and_store_0d_scalar_memref
+func.func @vector_load_and_store_0d_scalar_memref(%memref : memref<200x100xf32>,
+ %i : index, %j : index) {
+ // CHECK: %[[ld:.*]] = vector.load %{{.*}}[%{{.*}}] : memref<200x100xf32>, vector<f32>
+ %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ // CHECK: vector.store %[[ld]], %{{.*}}[%{{.*}}] : memref<200x100xf32>, vector<f32>
+ vector.store %0, %memref[%i, %j] : memref<200x100xf32>, vector<f32>
+ return
+}
+
// CHECK-LABEL: @vector_load_and_store_1d_scalar_memref
func.func @vector_load_and_store_1d_scalar_memref(%memref : memref<200x100xf32>,
%i : index, %j : index) {
diff --git a/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir
index 6ad817a73408..52b8c16d753d 100644
--- a/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir
+++ b/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir
@@ -33,26 +33,26 @@ func.func @main() {
%alloc = memref.alloc() : memref<5xf32>
// CHECK: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32> to memref<10xf32>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32>) -> memref<10xf32>
// CHECK-NEXT: ^ size mismatch of dim 0
// CHECK-NEXT: Location: loc({{.*}})
%1 = memref.cast %alloc : memref<5xf32> to memref<?xf32>
func.call @cast_to_static_dim(%1) : (memref<?xf32>) -> (memref<10xf32>)
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<*xf32> to memref<f32>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<*xf32>) -> memref<f32>
// CHECK-NEXT: ^ rank mismatch
// CHECK-NEXT: Location: loc({{.*}})
%3 = memref.cast %alloc : memref<5xf32> to memref<*xf32>
func.call @cast_to_ranked(%3) : (memref<*xf32>) -> (memref<f32>)
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>> to memref<?xf32, strided<[9], offset: 5>>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>>
// CHECK-NEXT: ^ offset mismatch
// CHECK-NEXT: Location: loc({{.*}})
// CHECK-NEXT: ERROR: Runtime op verification failed
- // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>> to memref<?xf32, strided<[9], offset: 5>>
+ // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>>
// CHECK-NEXT: ^ stride mismatch of dim 0
// CHECK-NEXT: Location: loc({{.*}})
%4 = memref.cast %alloc
diff --git a/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir
new file mode 100644
index 000000000000..169dfd705645
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir
@@ -0,0 +1,67 @@
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN: -expand-strided-metadata \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -test-cf-assert \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+func.func @load(%memref: memref<1xf32>, %index: index) {
+ memref.load %memref[%index] : memref<1xf32>
+ return
+}
+
+func.func @load_dynamic(%memref: memref<?xf32>, %index: index) {
+ memref.load %memref[%index] : memref<?xf32>
+ return
+}
+
+func.func @load_nd_dynamic(%memref: memref<?x?x?xf32>, %index0: index, %index1: index, %index2: index) {
+ memref.load %memref[%index0, %index1, %index2] : memref<?x?x?xf32>
+ return
+}
+
+func.func @main() {
+ %0 = arith.constant 0 : index
+ %1 = arith.constant 1 : index
+ %n1 = arith.constant -1 : index
+ %2 = arith.constant 2 : index
+ %alloca_1 = memref.alloca() : memref<1xf32>
+ %alloc_1 = memref.alloc(%1) : memref<?xf32>
+ %alloc_2x2x2 = memref.alloc(%2, %2, %2) : memref<?x?x?xf32>
+
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<1xf32>, index) -> f32
+ // CHECK-NEXT: ^ out-of-bounds access
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @load(%alloca_1, %1) : (memref<1xf32>, index) -> ()
+
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?xf32>, index) -> f32
+ // CHECK-NEXT: ^ out-of-bounds access
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @load_dynamic(%alloc_1, %1) : (memref<?xf32>, index) -> ()
+
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?x?x?xf32>, index, index, index) -> f32
+ // CHECK-NEXT: ^ out-of-bounds access
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @load_nd_dynamic(%alloc_2x2x2, %1, %n1, %0) : (memref<?x?x?xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @load(%alloca_1, %0) : (memref<1xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @load_dynamic(%alloc_1, %0) : (memref<?xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @load_nd_dynamic(%alloc_2x2x2, %1, %1, %0) : (memref<?x?x?xf32>, index, index, index) -> ()
+
+ memref.dealloc %alloc_1 : memref<?xf32>
+ memref.dealloc %alloc_2x2x2 : memref<?x?x?xf32>
+
+ return
+}
+
diff --git a/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
new file mode 100644
index 000000000000..370029154054
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir
@@ -0,0 +1,74 @@
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN: -lower-affine \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -test-cf-assert \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+func.func @reinterpret_cast(%memref: memref<1xf32>, %offset: index) {
+ memref.reinterpret_cast %memref to
+ offset: [%offset],
+ sizes: [1],
+ strides: [1]
+ : memref<1xf32> to memref<1xf32, strided<[1], offset: ?>>
+ return
+}
+
+func.func @reinterpret_cast_fully_dynamic(%memref: memref<?xf32>, %offset: index, %size: index, %stride: index) {
+ memref.reinterpret_cast %memref to
+ offset: [%offset],
+ sizes: [%size],
+ strides: [%stride]
+ : memref<?xf32> to memref<?xf32, strided<[?], offset: ?>>
+ return
+}
+
+func.func @main() {
+ %0 = arith.constant 0 : index
+ %1 = arith.constant 1 : index
+ %n1 = arith.constant -1 : index
+ %4 = arith.constant 4 : index
+ %5 = arith.constant 5 : index
+
+ %alloca_1 = memref.alloca() : memref<1xf32>
+ %alloc_4 = memref.alloc(%4) : memref<?xf32>
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast(%alloca_1, %1) : (memref<1xf32>, index) -> ()
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast(%alloca_1, %n1) : (memref<1xf32>, index) -> ()
+
+ // Size is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %5, %1) : (memref<?xf32>, index, index, index) -> ()
+
+ // Stride is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}})
+ // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %4) : (memref<?xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @reinterpret_cast(%alloca_1, %0) : (memref<1xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %1) : (memref<?xf32>, index, index, index) -> ()
+
+ return
+}
diff --git a/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
new file mode 100644
index 000000000000..48987ce216f1
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir
@@ -0,0 +1,89 @@
+// RUN: mlir-opt %s -generate-runtime-verification \
+// RUN: -expand-strided-metadata \
+// RUN: -lower-affine \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -test-cf-assert \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \
+// RUN: FileCheck %s
+
+func.func @subview(%memref: memref<1xf32>, %offset: index) {
+ memref.subview %memref[%offset] [1] [1] :
+ memref<1xf32> to
+ memref<1xf32, strided<[1], offset: ?>>
+ return
+}
+
+func.func @subview_dynamic(%memref: memref<?x4xf32>, %offset: index, %size: index, %stride: index) {
+ memref.subview %memref[%offset, 0] [%size, 4] [%stride, 1] :
+ memref<?x4xf32> to
+ memref<?x4xf32, strided<[?, 1], offset: ?>>
+ return
+}
+
+func.func @subview_dynamic_rank_reduce(%memref: memref<?x4xf32>, %offset: index, %size: index, %stride: index) {
+ memref.subview %memref[%offset, 0] [%size, 1] [%stride, 1] :
+ memref<?x4xf32> to
+ memref<?xf32, strided<[?], offset: ?>>
+ return
+}
+
+func.func @main() {
+ %0 = arith.constant 0 : index
+ %1 = arith.constant 1 : index
+ %n1 = arith.constant -1 : index
+ %4 = arith.constant 4 : index
+ %5 = arith.constant 5 : index
+
+ %alloca = memref.alloca() : memref<1xf32>
+ %alloc = memref.alloc(%4) : memref<?x4xf32>
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview_dynamic_rank_reduce(%alloc, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview(%alloca, %1) : (memref<1xf32>, index) -> ()
+
+ // Offset is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview(%alloca, %n1) : (memref<1xf32>, index) -> ()
+
+ // Size is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview_dynamic(%alloc, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // Stride is out-of-bounds
+ // CHECK: ERROR: Runtime op verification failed
+ // CHECK-NEXT: "memref.subview"
+ // CHECK-NEXT: ^ subview is out-of-bounds of the base memref
+ // CHECK-NEXT: Location: loc({{.*}})
+ func.call @subview_dynamic(%alloc, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @subview(%alloca, %0) : (memref<1xf32>, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @subview_dynamic(%alloc, %0, %4, %1) : (memref<?x4xf32>, index, index, index) -> ()
+
+ // CHECK-NOT: ERROR: Runtime op verification failed
+ func.call @subview_dynamic_rank_reduce(%alloc, %0, %1, %0) : (memref<?x4xf32>, index, index, index) -> ()
+
+
+ return
+}
diff --git a/mlir/test/Target/SPIRV/physical-storage-buffer.mlir b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir
new file mode 100644
index 000000000000..040cfb891cb3
--- /dev/null
+++ b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir
@@ -0,0 +1,48 @@
+// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s
+
+// Test file showing how the Physical Storage Buffer extension works end-2-end.
+
+!f32_binding = !spirv.struct<binding_f32_t, (!spirv.rtarray<f32, stride=4> [0])>
+!f32_binding_ptr = !spirv.ptr<!f32_binding, PhysicalStorageBuffer>
+
+!set_0 = !spirv.struct<set_0_t, (!f32_binding_ptr [0],
+ !f32_binding_ptr [8],
+ !f32_binding_ptr [16])>
+!set_0_ptr = !spirv.ptr<!set_0, StorageBuffer>
+
+!set_1 = !spirv.struct<set_1_t, (!f32_binding_ptr [0],
+ !f32_binding_ptr [8])>
+!set_1_ptr = !spirv.ptr<!set_1, StorageBuffer>
+
+spirv.module PhysicalStorageBuffer64 GLSL450 requires #spirv.vce<v1.5,
+ [Shader, Int64, PhysicalStorageBufferAddresses], [SPV_KHR_physical_storage_buffer]> {
+
+ spirv.GlobalVariable @set_0 bind(3, 0) : !set_0_ptr
+ spirv.GlobalVariable @set_1 bind(3, 1) : !set_1_ptr
+
+ // CHECK-LABEL: spirv.func @main() "None"
+ spirv.func @main() "None" {
+ %idx0 = spirv.Constant 0 : i64
+ %idx1 = spirv.Constant 1 : i64
+ %idx2 = spirv.Constant 2 : i64
+ %set_0_addr = spirv.mlir.addressof @set_0 : !set_0_ptr
+ %s0_b2_ptr = spirv.AccessChain %set_0_addr[%idx2] : !set_0_ptr, i64
+ %b2_ptr = spirv.Load "StorageBuffer" %s0_b2_ptr : !f32_binding_ptr
+ %b2_data_ptr = spirv.AccessChain %b2_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64
+
+ // CHECK: spirv.Load "PhysicalStorageBuffer"
+ %b2_data = spirv.Load "PhysicalStorageBuffer" %b2_data_ptr ["Aligned", 4] : f32
+
+ %set_1_addr = spirv.mlir.addressof @set_1 : !set_1_ptr
+ %s1_b1_ptr = spirv.AccessChain %set_1_addr[%idx1] : !set_1_ptr, i64
+ %b1_ptr = spirv.Load "StorageBuffer" %s1_b1_ptr : !f32_binding_ptr
+ %b1_data_ptr = spirv.AccessChain %b1_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64
+
+ // CHECK: spirv.Store "PhysicalStorageBuffer"
+ spirv.Store "PhysicalStorageBuffer" %b1_data_ptr, %b2_data ["Aligned", 4] : f32
+
+ spirv.Return
+ }
+
+ spirv.EntryPoint "GLCompute" @main, @set_0, @set_1
+}