summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPhoebe Wang <phoebe.wang@intel.com>2024-03-04 18:09:41 +0800
committerTom Stellard <tstellar@redhat.com>2024-03-19 14:06:42 -0700
commit26a1d6601d727a96f4301d0d8647b5a42760ae0c (patch)
tree232815b14ea830e8ab92f111fdb4844e7e5a3f76
parent0bf7ff1028fb7cc81324e9c38c585e6533b754e4 (diff)
[X86] Add missing subvector_subreg_lowering for BF16 (#83720)llvmorg-18.1.2
-rw-r--r--llvm/lib/Target/X86/X86InstrVecCompiler.td3
-rw-r--r--llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll22
-rw-r--r--llvm/test/CodeGen/X86/bfloat.ll1
3 files changed, 25 insertions, 1 deletions
diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index bbd19cf8d5b2..461b2badc131 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -83,6 +83,7 @@ defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8f16, VR256, v16f16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8bf16, VR256, v16bf16, sub_xmm>;
// A 128-bit subvector extract from the first 512-bit vector position is a
// subregister copy that needs no instruction. Likewise, a 128-bit subvector
@@ -95,6 +96,7 @@ defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>;
defm : subvector_subreg_lowering<VR128, v8f16, VR512, v32f16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8bf16, VR512, v32bf16, sub_xmm>;
// A 128-bit subvector extract from the first 512-bit vector position is a
// subregister copy that needs no instruction. Likewise, a 128-bit subvector
@@ -107,6 +109,7 @@ defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
defm : subvector_subreg_lowering<VR256, v16f16, VR512, v32f16, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v16bf16, VR512, v32bf16, sub_ymm>;
// If we're inserting into an all zeros vector, just use a plain move which
diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
index 0826faa1071b..482713e12d15 100644
--- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll
@@ -381,3 +381,25 @@ entry:
%1 = shufflevector <8 x bfloat> %0, <8 x bfloat> undef, <16 x i32> zeroinitializer
ret <16 x bfloat> %1
}
+
+define <16 x i32> @pr83358() {
+; X86-LABEL: pr83358:
+; X86: # %bb.0:
+; X86-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
+; X86-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
+; X86-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: pr83358:
+; X64: # %bb.0:
+; X64-NEXT: vcvtneps2bf16y {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0x05,A,A,A,A]
+; X64-NEXT: # fixup A - offset: 6, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-NEXT: vshufi64x2 $0, %zmm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc0,0x00]
+; X64-NEXT: # zmm0 = zmm0[0,1,0,1,0,1,0,1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>)
+ %2 = bitcast <8 x bfloat> %1 to <4 x i32>
+ %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <16 x i32> %3
+}
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index f2d3c4fb3419..cd1dba176116 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -2423,7 +2423,6 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm0, %xmm0
-; AVXNC-NEXT: vinsertf128 $0, %xmm0, %ymm0, %ymm0
; AVXNC-NEXT: {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT: retq