author     Nicolas Miller <nicolas.miller@codeplay.com>    2024-05-01 18:15:52 +0100
committer  GitHub <noreply@github.com>                     2024-05-01 10:15:52 -0700
commit     7396ab1210a2aeee6bab5b73ec6d02975ba51b93
tree       af717a696e0fa1411392b156f7067c701127ff8c
parent     cf2f32c97f8fece105557c2357be4809cb9c14a1
[NVPTX] Fix 64-bit rotations with large shift values (#89399)

ROTL and ROTR can take a shift amount larger than the element size, in which case the effective shift amount should be the shift amount modulo the element size. This patch adds that modulo step when the shift amount isn't known at compile time. Without it, the existing lowering would shift beyond the type size and produce incorrect results.
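
For reference, the intended semantics can be sketched in plain C++ (an illustrative sketch only, not code from this patch; the helper name rotl64_ref is made up): the shift amount is masked with 63 before the shl/shr pair, so an amount such as 66 rotates by 66 mod 64 = 2, mirroring the and.b32 / shl.b64 / sub.u32 / shr.b64 sequence emitted below.

    #include <cassert>
    #include <cstdint>

    // Reference model of the fixed ROTL64reg_sw lowering: reduce the amount
    // modulo 64 first, then split into a left and a right shift. The PTX
    // sequence combines the halves with add.u64, which is equivalent to the
    // OR here because the two halves never overlap. PTX shl/shr clamp
    // over-wide shift amounts, so the emitted code needs no special case for
    // amt2 == 0; shifting a 64-bit value by 64 is undefined in C++, hence
    // the early return.
    static uint64_t rotl64_ref(uint64_t x, uint32_t amt) {
      uint32_t amt2 = amt & 63;          // and.b32  %amt2, $amt, 63
      if (amt2 == 0)
        return x;
      uint64_t lhs = x << amt2;          // shl.b64  %lhs, $src, %amt2
      uint64_t rhs = x >> (64 - amt2);   // sub.u32 / shr.b64
      return lhs | rhs;                  // add.u64 in the PTX sequence
    }

    int main() {
      // A shift amount of 66 must behave exactly like a shift amount of 2.
      assert(rotl64_ref(0x8000000000000001ULL, 66) ==
             rotl64_ref(0x8000000000000001ULL, 2));
      return 0;
    }
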
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXInstrInfo.td |  10
-rw-r--r--  llvm/test/CodeGen/NVPTX/rotate.ll       | 339
2 files changed, 320 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 897ee89323f0..142dd64ddea9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
- "shl.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "and.b32 \t%amt2, $amt, 63;\n\t"
+ "shl.b64 \t%lhs, $src, %amt2;\n\t"
+ "sub.u32 \t%amt2, 64, %amt2;\n\t"
"shr.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
- "shr.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "and.b32 \t%amt2, $amt, 63;\n\t"
+ "shr.b64 \t%lhs, $src, %amt2;\n\t"
+ "sub.u32 \t%amt2, 64, %amt2;\n\t"
"shl.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 9d058662c271..20c7ae5908d2 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
declare i32 @llvm.nvvm.rotate.b32(i32, i32)
@@ -11,11 +12,35 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
; SM20: rotate32
; SM35: rotate32
define i32 @rotate32(i32 %a, i32 %b) {
-; SM20: shl.b32
-; SM20: sub.s32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate32(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<4>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: .reg .b32 %amt2;
+; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
+; SM20-NEXT: sub.s32 %amt2, 32, %r2;
+; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
+; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate32(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM35-NEXT: ret;
%val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
ret i32 %val
}
@@ -23,12 +48,48 @@ define i32 @rotate32(i32 %a, i32 %b) {
; SM20: rotate64
; SM35: rotate64
define i64 @rotate64(i64 %a, i32 %b) {
-; SM20: shl.b64
-; SM20: sub.u32
-; SM20: shr.b64
-; SM20: add.u64
-; SM35: shf.l.wrap.b32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotate64_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.l.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
ret i64 %val
}
@@ -36,12 +97,48 @@ define i64 @rotate64(i64 %a, i32 %b) {
; SM20: rotateright64
; SM35: rotateright64
define i64 @rotateright64(i64 %a, i32 %b) {
-; SM20: shr.b64
-; SM20: sub.u32
-; SM20: shl.b64
-; SM20: add.u64
-; SM35: shf.r.wrap.b32
-; SM35: shf.r.wrap.b32
+; SM20-LABEL: rotateright64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotateright64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r1,%dummy}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r2}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: ld.param.u32 %r3, [rotateright64_param_1];
+; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT: shf.r.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT: mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
%val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
ret i64 %val
}
@@ -49,12 +146,204 @@ define i64 @rotateright64(i64 %a, i32 %b) {
; SM20: rotl0
; SM35: rotl0
define i32 @rotl0(i32 %x) {
-; SM20: shl.b32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotl0(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: shl.b32 %lhs, %r1, 8;
+; SM20-NEXT: shr.b32 %rhs, %r1, 24;
+; SM20-NEXT: add.u32 %r2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl0(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0];
+; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r2;
+; SM35-NEXT: ret;
%t0 = shl i32 %x, 8
%t1 = lshr i32 %x, 24
%t2 = or i32 %t0, %t1
ret i32 %t2
}
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM20-LABEL: rotl64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+ ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM20-LABEL: rotl64_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotl64_imm(
+; SM35: {
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 2;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 62;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+ ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM20-LABEL: rotr64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<2>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0];
+; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: .reg .u32 %amt2;
+; SM35-NEXT: and.b32 %amt2, %r1, 63;
+; SM35-NEXT: shr.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT: shl.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+ ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM20-LABEL: rotr64_imm(
+; SM20: {
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM20-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotr64_imm(
+; SM35: {
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b64 %lhs;
+; SM35-NEXT: .reg .b64 %rhs;
+; SM35-NEXT: shl.b64 %lhs, %rd1, 62;
+; SM35-NEXT: shr.b64 %rhs, %rd1, 2;
+; SM35-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT: }
+; SM35-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT: ret;
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+ ret i64 %val
+}