diff options
author | Changpeng Fang <changpeng.fang@amd.com> | 2024-01-23 14:30:11 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-23 14:30:11 -0800 |
commit | 1a300d6da3f3d10e02d9580f8f3f2080bba8adf9 (patch) | |
tree | 9ae7aa2baeb6b46acad6cc7fedc46beecc90728e | |
parent | dc410f94f602390a65c832cf348b9ee6556b1809 (diff) |
AMDGPU: Add SourceOfDivergence for int_amdgcn_global_load_tr (#79218)
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 1 | ||||
-rw-r--r-- | llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll | 81 |
2 files changed, 82 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index b0ea4aba0189..67263f23b983 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -414,6 +414,7 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>; def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>; def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>; def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>; +def : SourceOfDivergence<int_amdgcn_global_load_tr>; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 8826263eabb6..a08ca86c8a61 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -109,6 +109,78 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep) + store i32 %tmp0, ptr addrspace(1) %out, align 4 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v4i16_(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep) + store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep) + store <4 x half> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep) + store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 @@ -125,5 +197,14 @@ declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) +declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) +declare i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) +declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) + attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } |