summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLukacma <Marian.Lukac@arm.com>2024-02-23 15:40:44 +0000
committerGitHub <noreply@github.com>2024-02-23 15:40:44 +0000
commit08cb1a62f6f401d66513a20e8689c1ef9059fc63 (patch)
treeca7b9e1433110422356ebf55cfb1f4e0d57dc373
parent3b232f066d40a3e91ac27e421a3baeaca0cd59ec (diff)
[AArch64][SVE] Add intrinsincs to assembly mapping for svpmov (#81861)
This patch enables translation of svpmov intrinsic to the correct assembly instruction, instead of function call.
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAArch64.td46
-rw-r--r--llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll82
-rw-r--r--llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll45
3 files changed, 47 insertions, 126 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 921e5b95ae03..6b045e412cd5 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1367,6 +1367,27 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
llvm_i32_ty,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
+
+ class SVE2_1VectorArg_Pred_Intrinsic
+ : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
+ class SVE2_1VectorArgIndexed_Pred_Intrinsic
+ : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [llvm_anyvector_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
+ class SVE2_Pred_1VectorArgIndexed_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+
+ class SVE2_Pred_1VectorArg_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [IntrNoMem]>;
// NOTE: There is no relationship between these intrinsics beyond an attempt
// to reuse currently identical class definitions.
@@ -3610,23 +3631,10 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic;
//
// SVE2.1 - Move predicate to/from vector
//
-def int_aarch64_sve_pmov_to_pred_lane :
- DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
- [llvm_anyvector_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
-
-def int_aarch64_sve_pmov_to_pred_lane_zero :
- DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
- [llvm_anyvector_ty],
- [IntrNoMem]>;
-
-def int_aarch64_sve_pmov_to_vector_lane_merging :
- DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>,
- LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic;
+
+def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
-def int_aarch64_sve_pmov_to_vector_lane_zeroing :
- DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
- [IntrNoMem]>;
+def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
+
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic; \ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
index 7cae1d2c216b..a592dcd4b8ce 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
@@ -4,12 +4,7 @@
define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov p0.b, z0
; CHECK-NEXT: ret
entry:
%res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)
@@ -19,27 +14,10 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: pmov p1.h, z0[0]
+; CHECK-NEXT: pmov p2.h, z0[1]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 0)
@@ -52,27 +30,10 @@ define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: pmov p1.s, z0[0]
+; CHECK-NEXT: pmov p2.s, z0[3]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 4 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(<vscale x 4 x i32> %zn, i32 0)
@@ -85,27 +46,10 @@ define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
define <vscale x 2 x i1> @test_pmov_to_pred_i64(<vscale x 2 x i64> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: pmov p1.d, z0[0]
+; CHECK-NEXT: pmov p2.d, z0[7]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 2 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(<vscale x 2 x i64> %zn, i32 0)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
index 58b240b0fbd6..b7f36c651023 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
@@ -6,12 +6,7 @@
define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[1], p0.h
; CHECK-NEXT: ret
entry:
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn, i32 1)
@@ -21,12 +16,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vsca
define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[3], p0.s
; CHECK-NEXT: ret
entry:
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn, i32 3)
@@ -36,12 +26,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vsca
define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[7], p0.d
; CHECK-NEXT: ret
entry:
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn, i32 7)
@@ -54,11 +39,7 @@ define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vsca
define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0, p0.b
; CHECK-NEXT: ret
entry:
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8(<vscale x 16 x i1> %pn)
@@ -68,11 +49,7 @@ define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.h
; CHECK-NEXT: ret
entry:
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16(<vscale x 8 x i1> %pn)
@@ -82,11 +59,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.s
; CHECK-NEXT: ret
entry:
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32(<vscale x 4 x i1> %pn)
@@ -96,11 +69,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
define <vscale x 2 x i64> @test_pmov_to_vector_zero_i64(<vscale x 2 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.d
; CHECK-NEXT: ret
entry:
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64(<vscale x 2 x i1> %pn)