author    Jay Foad <jay.foad@amd.com>    2024-04-16 10:15:00 +0100
committer GitHub <noreply@github.com>    2024-04-16 10:15:00 +0100
commit    f4f772ceef379bd434d266b6e0d2bbdf796f81cb
tree      533107a3ff4a791f86863aa4ecf1b075030bcbd4
parent    4fc0a99b8f220b6b41648da491bcc81a067f1600
[AMDGPU] Stop reserving $vcc_hi in wave32 mode (#87783)
This gives us one extra SGPR to play with. The comment suggested that allocating vcc_hi could cause bugs, but I have tested it by running the Vulkan CTS with the default wave size for compute shaders set to 32 and did not find any problems.
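For context on what "one extra SGPR" means mechanically: getReservedRegs() returns the set of physical registers the allocator must not touch, and the patch below simply stops adding vcc_hi to that set in wave32 mode. The following is a minimal sketch, not part of the patch: the helper name countUnreserved is hypothetical, the APIs used are standard LLVM, and a caller interested in this change would pass AMDGPU's 32-bit SGPR register class.

    // Sketch only: count the registers of a class that getReservedRegs()
    // leaves allocatable. With this patch, a wave32 function's count for
    // the SGPR class gains one, because VCC_HI is no longer set.
    #include "llvm/ADT/BitVector.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    #include "llvm/CodeGen/TargetSubtargetInfo.h"
    #include "llvm/MC/MCRegister.h"

    using namespace llvm;

    static unsigned countUnreserved(const MachineFunction &MF,
                                    const TargetRegisterClass &RC) {
      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
      BitVector Reserved = TRI.getReservedRegs(MF); // the hook edited below
      unsigned Count = 0;
      for (MCPhysReg Reg : RC)
        if (!Reserved.test(Reg))
          ++Count;
      return Count;
    }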
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp                   |  7 -
 llvm/test/CodeGen/AMDGPU/bf16.ll                            | 22 +-
 llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll             | 20 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll               | 22 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll                | 90 +-
 llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 26 +-
 6 files changed, 88 insertions(+), 99 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 245731ad5fc7..acb54fd10b90 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -612,13 +612,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Reserve null register - it shall never be allocated
reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
 
- // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
- // will result in bugs.
- if (isWave32) {
- Reserved.set(AMDGPU::VCC);
- Reserved.set(AMDGPU::VCC_HI);
- }
-
// Reserve SGPRs.
//
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
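The test churn below is all fallout from that deletion. Once the reserved set is frozen for register allocation, later passes observe the change through MachineRegisterInfo rather than by recomputing the hook. A hedged sketch follows; AMDGPU::VCC_HI lives in the target's generated register enum, so the register is passed in as a parameter here rather than named directly, and the helper name vccHiIsFree is illustrative.

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/MC/MCRegister.h"

    using namespace llvm;

    // Valid once register reservation has been frozen. Before this patch a
    // wave32 function always reported vcc_hi as reserved; now it is reserved
    // only if some other rule in getReservedRegs() claims it.
    static bool vccHiIsFree(const MachineFunction &MF, MCRegister VccHi) {
      return !MF.getRegInfo().isReserved(VccHi);
    }

That freedom is exactly what the updated checks show: in bf16.ll a compare now writes its mask to vcc_hi (v_cmp_eq_u32_e64 vcc_hi, 1, v3), displacing one s30..s35 writelane/readlane pair, and in the MIR test vcc_hi holds the saved $sgpr33 across the frame, eliminating one spill lane.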
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index bf4302c156d8..4c9c34de7194 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38342,12 +38342,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
@@ -38366,7 +38365,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
-; GFX10-NEXT: v_writelane_b32 v40, s35, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
@@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
-; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3
-; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5
-; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
; GFX10-NEXT: s_waitcnt vmcnt(32)
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
; GFX10-NEXT: s_waitcnt vmcnt(31)
@@ -38460,10 +38459,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
@@ -38481,7 +38480,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX10-NEXT: v_readlane_b32 s35, v40, 3
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index ec3c08ec7952..da64c379672e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
; GFX10SELDAG-LABEL: isnan_v4f16:
; GFX10SELDAG: ; %bb.0:
; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v2, 3
-; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v0, 3
-; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v0, 3
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v1, 3
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v5
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v4
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v4
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v1, 3
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v5
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10GLISEL-LABEL: isnan_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index ab6a9dcf71ac..a87fa8bf36d9 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
-; GFX12-NEXT: s_lshr_b32 s12, s0, 16
-; GFX12-NEXT: s_mov_b32 s14, s1
-; GFX12-NEXT: s_lshr_b32 s16, s1, 16
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
; GFX12-NEXT: s_lshr_b32 s2, s2, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25
+; GFX12-NEXT: s_mov_b32 s12, s1
+; GFX12-NEXT: s_lshr_b32 s14, s1, 16
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
; GFX12-NEXT: v_mov_b32_e32 v18, s20
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64
-; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16
; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
-; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14
-; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16
+; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12
+; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14
; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10
-; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12
+; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 952827b8cd0e..889755c23bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -8808,73 +8808,73 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5
; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: s_lshr_b32 s24, s7, 16
+; GFX12-NEXT: s_lshr_b32 s22, s7, 16
; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8
-; GFX12-NEXT: s_lshr_b32 s42, s2, 24
-; GFX12-NEXT: s_mov_b32 s48, s7
+; GFX12-NEXT: s_lshr_b32 s40, s2, 24
+; GFX12-NEXT: s_mov_b32 s46, s7
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
-; GFX12-NEXT: s_lshr_b32 s26, s6, 16
-; GFX12-NEXT: s_lshr_b32 s44, s1, 16
+; GFX12-NEXT: s_lshr_b32 s24, s6, 16
+; GFX12-NEXT: s_lshr_b32 s42, s1, 16
; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56
-; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24
-; GFX12-NEXT: s_lshr_b32 s28, s6, 24
-; GFX12-NEXT: s_lshr_b32 s30, s5, 16
-; GFX12-NEXT: s_lshr_b32 s40, s2, 16
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22
+; GFX12-NEXT: s_lshr_b32 s26, s6, 24
+; GFX12-NEXT: s_lshr_b32 s28, s5, 16
+; GFX12-NEXT: s_lshr_b32 s38, s2, 16
; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8
; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8
; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8
; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31
-; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58
-; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26
-; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48
-; GFX12-NEXT: v_mov_b32_e32 v30, s49
-; GFX12-NEXT: s_lshr_b32 s46, s0, 24
-; GFX12-NEXT: s_mov_b32 s50, s5
-; GFX12-NEXT: s_mov_b32 s52, s3
-; GFX12-NEXT: s_lshr_b32 s34, s4, 16
-; GFX12-NEXT: s_lshr_b32 s36, s4, 24
-; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58
+; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24
+; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46
+; GFX12-NEXT: v_mov_b32_e32 v30, s47
+; GFX12-NEXT: s_lshr_b32 s44, s0, 24
+; GFX12-NEXT: s_mov_b32 s48, s5
+; GFX12-NEXT: s_mov_b32 s50, s3
+; GFX12-NEXT: s_lshr_b32 s30, s4, 16
+; GFX12-NEXT: s_lshr_b32 s34, s4, 24
+; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56
; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56
; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8
; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX12-NEXT: s_lshr_b32 s38, s3, 16
-; GFX12-NEXT: s_mov_b32 s54, s1
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT: s_lshr_b32 s36, s3, 16
+; GFX12-NEXT: s_mov_b32 s52, s1
; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000
; GFX12-NEXT: s_lshr_b32 s20, s0, 16
; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28
-; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30
-; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56
-; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34
-; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40
-; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v38, s25 :: v_dual_mov_b32 v39, s26
+; GFX12-NEXT: v_dual_mov_b32 v40, s27 :: v_dual_mov_b32 v41, s28
+; GFX12-NEXT: v_dual_mov_b32 v42, s29 :: v_dual_mov_b32 v43, s56
+; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s30
+; GFX12-NEXT: v_dual_mov_b32 v52, s55 :: v_dual_mov_b32 v53, s38
+; GFX12-NEXT: v_dual_mov_b32 v54, s39 :: v_dual_mov_b32 v55, s40
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000
; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240
-; GFX12-NEXT: v_mov_b32_e32 v33, s44
+; GFX12-NEXT: v_mov_b32_e32 v33, s42
; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224
; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17
; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4
@@ -8882,16 +8882,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12
; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36
-; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38
-; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18
+; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v46, s31 :: v_dual_mov_b32 v47, s34
+; GFX12-NEXT: v_dual_mov_b32 v48, s35 :: v_dual_mov_b32 v49, s36
+; GFX12-NEXT: v_dual_mov_b32 v34, s43 :: v_dual_mov_b32 v35, s18
; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20
; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2
; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10
-; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22
+; GFX12-NEXT: v_dual_mov_b32 v50, s37 :: v_dual_mov_b32 v51, s54
; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6
; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index f8e7cb397b47..8a5f75332557 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -28,18 +28,17 @@ body: |
; GCN-LABEL: name: test_main
; GCN: bb.0:
; GCN-NEXT: successors: %bb.1(0x80000000)
- ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr0 = COPY $sgpr33
+ ; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33
; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32
- ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5)
; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5)
- ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5)
- ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
- ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5
+ ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5)
+ ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc
; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF
; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3
@@ -116,18 +115,18 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: KILL implicit-def $vcc_lo, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
; GCN-NEXT: successors: %bb.3(0x80000000)
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.3:
- ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3
; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2
@@ -198,16 +197,15 @@ body: |
; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1
; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0
; GCN-NEXT: KILL killed renamable $vgpr2
- ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4
- ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5)
; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5)
; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5)
; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5)
- ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5)
- ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1
+ ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5)
+ ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc
- ; GCN-NEXT: $sgpr33 = COPY $sgpr0
+ ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi
; GCN-NEXT: S_ENDPGM 0
bb.0:
liveins: $vgpr0