diff options
author | Lukacma <Marian.Lukac@arm.com> | 2024-02-23 15:40:44 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-23 15:40:44 +0000 |
commit | 08cb1a62f6f401d66513a20e8689c1ef9059fc63 (patch) | |
tree | ca7b9e1433110422356ebf55cfb1f4e0d57dc373 | |
parent | 3b232f066d40a3e91ac27e421a3baeaca0cd59ec (diff) |
[AArch64][SVE] Add intrinsics to assembly mapping for svpmov (#81861)
This patch enables translation of the svpmov intrinsics to the correct
assembly instructions, instead of a call to a function named after the intrinsic.
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsAArch64.td | 46 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll | 82 | ||||
-rw-r--r-- | llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll | 45 |
3 files changed, 47 insertions, 126 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 921e5b95ae03..6b045e412cd5 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1367,6 +1367,27 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>; + + class SVE2_1VectorArg_Pred_Intrinsic + : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyvector_ty], + [IntrNoMem]>; + + class SVE2_1VectorArgIndexed_Pred_Intrinsic + : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyvector_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; + + class SVE2_Pred_1VectorArgIndexed_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + + class SVE2_Pred_1VectorArg_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrNoMem]>; // NOTE: There is no relationship between these intrinsics beyond an attempt // to reuse currently identical class definitions. 
@@ -3610,23 +3631,10 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic; // // SVE2.1 - Move predicate to/from vector // -def int_aarch64_sve_pmov_to_pred_lane : - DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<1>>]>; - -def int_aarch64_sve_pmov_to_pred_lane_zero : - DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [llvm_anyvector_ty], - [IntrNoMem]>; - -def int_aarch64_sve_pmov_to_vector_lane_merging : - DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<2>>]>; +def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic; + +def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic; -def int_aarch64_sve_pmov_to_vector_lane_zeroing : - DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrNoMem]>; +def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic; + +def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
\ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll index 7cae1d2c216b..a592dcd4b8ce 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll @@ -4,12 +4,7 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) { ; CHECK-LABEL: test_pmov_to_pred_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov p0.b, z0 ; CHECK-NEXT: ret entry: %res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0) @@ -19,27 +14,10 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) { define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) { ; CHECK-LABEL: test_pmov_to_pred_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16 -; CHECK-NEXT: mov z0.d, z8.d -; CHECK-NEXT: mov w0, #1 // =0x1 -; CHECK-NEXT: mov p4.b, p0.b -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16 -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: pmov p1.h, z0[0] +; CHECK-NEXT: pmov p2.h, z0[1] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b ; CHECK-NEXT: ret entry: %res1 = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 0) @@ -52,27 +30,10 @@ define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) { define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) { ; CHECK-LABEL: test_pmov_to_pred_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32 -; CHECK-NEXT: mov z0.d, z8.d -; CHECK-NEXT: mov w0, #3 // =0x3 -; CHECK-NEXT: mov p4.b, p0.b -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32 -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: pmov p1.s, z0[0] +; CHECK-NEXT: pmov p2.s, z0[3] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b ; CHECK-NEXT: ret entry: %res1 = call <vscale x 4 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(<vscale x 4 x i32> %zn, i32 0) @@ -85,27 +46,10 @@ define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) { define <vscale x 2 x i1> @test_pmov_to_pred_i64(<vscale x 2 x i64> %zn) { ; CHECK-LABEL: test_pmov_to_pred_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64 -; CHECK-NEXT: mov z0.d, z8.d -; CHECK-NEXT: mov w0, #7 // =0x7 -; CHECK-NEXT: mov p4.b, p0.b -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64 -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: pmov p1.d, z0[0] +; CHECK-NEXT: pmov p2.d, z0[7] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b ; CHECK-NEXT: ret entry: %res1 = call <vscale x 2 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(<vscale x 2 x i64> %zn, i32 0) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll index 58b240b0fbd6..b7f36c651023 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll @@ -6,12 +6,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn) { ; CHECK-LABEL: test_pmov_to_vector_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, #1 // =0x1 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[1], p0.h ; CHECK-NEXT: ret entry: %res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn, i32 1) @@ -21,12 +16,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vsca define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn) { ; CHECK-LABEL: test_pmov_to_vector_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, #3 // =0x3 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[3], p0.s ; CHECK-NEXT: ret entry: %res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn, i32 3) @@ -36,12 +26,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vsca define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn) { ; CHECK-LABEL: test_pmov_to_vector_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, #7 // =0x7 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[7], p0.d ; CHECK-NEXT: ret entry: %res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn, i32 7) @@ -54,11 +39,7 @@ define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vsca define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0, p0.b ; CHECK-NEXT: ret entry: %res = call <vscale x 16 x i8> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8(<vscale x 16 x i1> %pn) @@ -68,11 +49,7 @@ define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) { define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[0], p0.h ; CHECK-NEXT: ret entry: %res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16(<vscale x 8 x i1> %pn) @@ -82,11 +59,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) { define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[0], p0.s ; CHECK-NEXT: ret entry: %res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32(<vscale x 4 x i1> %pn) @@ -96,11 +69,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) { define <vscale x 2 x i64> @test_pmov_to_vector_zero_i64(<vscale x 2 x i1> %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[0], p0.d ; CHECK-NEXT: ret entry: %res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64(<vscale x 2 x i1> %pn) |