summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLeon Clark <PeddleSpam@users.noreply.github.com>2024-05-20 16:00:15 +0100
committerGitHub <noreply@github.com>2024-05-20 16:00:15 +0100
commit6f236cdc42601d96f06781e75d0112bdb8d4a4ce (patch)
tree3cf0897f218b3939f836ce54d4c2536252085568
parent2a90d59fc3905d3d56dac99fa25640a6d6a7bad2 (diff)
Revert "[AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16 (#88512)"upstream/revert-88512-ctlz_zu
This reverts commit fb2c6597e39e9e1a775525ea0236b2f89e46acff.
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp22
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp44
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir47
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll232
5 files changed, 178 insertions, 169 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 980e58510ceb..d35a022ad680 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3117,30 +3117,20 @@ static bool isCttzOpc(unsigned Opc) {
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
SelectionDAG &DAG) const {
auto SL = SDLoc(Op);
- auto Opc = Op.getOpcode();
auto Arg = Op.getOperand(0u);
auto ResultVT = Op.getValueType();
if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
return {};
- assert(isCtlzOpc(Opc));
+ assert(isCtlzOpc(Op.getOpcode()));
assert(ResultVT == Arg.getValueType());
- const uint64_t NumBits = ResultVT.getFixedSizeInBits();
- SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
- SDValue NewOp;
-
- if (Opc == ISD::CTLZ_ZERO_UNDEF) {
- NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
- NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
- NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
- } else {
- NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
- NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
- NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
- }
-
+ auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
+ auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
+ auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+ NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
+ NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 15a4b6796880..bd7bf78c4c0b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1270,22 +1270,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
- getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
- .legalFor({{S32, S32}, {S32, S64}})
- .customIf(scalarNarrowerThan(1, 32))
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
- .scalarize(0)
- .widenScalarToNextPow2(0, 32)
- .widenScalarToNextPow2(1, 32);
-
- getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
- .legalFor({{S32, S32}, {S32, S64}})
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
- .scalarize(0)
- .widenScalarToNextPow2(0, 32)
- .widenScalarToNextPow2(1, 32);
+ getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
+ .legalFor({{S32, S32}, {S32, S64}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
@@ -2137,8 +2128,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
return legalizeCTLZ_CTTZ(MI, MRI, B);
- case TargetOpcode::G_CTLZ_ZERO_UNDEF:
- return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
return legalizeFPTruncRound(MI, B);
case TargetOpcode::G_STACKSAVE:
@@ -4156,25 +4145,6 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
return true;
}
-bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
- Register Dst = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
- LLT SrcTy = MRI.getType(Src);
- TypeSize NumBits = SrcTy.getSizeInBits();
-
- assert(NumBits < 32u);
-
- auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
- auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
- auto Shift = B.buildLShr(S32, {Extend}, ShiftAmt);
- auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
- B.buildTrunc(Dst, Ctlz);
- MI.eraseFromParent();
- return true;
-}
-
// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
if (MI.getOpcode() != TargetOpcode::G_XOR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1d821dadc2..e5ba84a74a0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -108,8 +108,6 @@ public:
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
- bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index 7748b481cf5b..fed277d7d10d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -81,12 +81,14 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s16) = G_TRUNC %0
%2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -147,15 +149,18 @@ body: |
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
+ ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
+ ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
+ ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
@@ -174,12 +179,14 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 127
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 25
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s7) = G_TRUNC %0
%2:_(s7) = G_CTLZ_ZERO_UNDEF %1
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index d94a27e8c020..54adde38d6d2 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -322,8 +322,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshl_b32 s2, s2, 24
-; SI-NEXT: s_flbit_i32_b32 s4, s2
+; SI-NEXT: s_and_b32 s2, s2, 0xff
+; SI-NEXT: s_flbit_i32_b32 s2, s2
+; SI-NEXT: s_sub_i32 s4, s2, 24
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -334,8 +335,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s2, 24
+; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: s_flbit_i32_b32 s2, s2
+; VI-NEXT: s_sub_i32 s2, s2, 24
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -355,13 +357,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -377,8 +379,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 24
+; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
@@ -396,8 +399,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_lshl_b32 s2, s2, 16
-; SI-NEXT: s_flbit_i32_b32 s4, s2
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_flbit_i32_b32 s2, s2
+; SI-NEXT: s_add_i32 s4, s2, -16
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -430,13 +434,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: -16(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -452,8 +456,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
@@ -593,8 +598,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
-; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v1, v0
+; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -608,8 +613,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
-; VI-NEXT: v_ffbh_u32_e32 v1, v1
+; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -621,7 +626,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -630,11 +635,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -24(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -655,7 +659,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -688,8 +693,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; SI-NEXT: v_ffbh_u32_e32 v1, v1
+; SI-NEXT: v_ffbh_u32_e32 v1, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -724,7 +729,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -733,11 +738,10 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
-; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT * T0.W, T0.X,
+; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: -16(nan), 3(4.203895e-45)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -760,7 +764,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -1105,8 +1110,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -1119,8 +1124,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1139,13 +1144,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
-; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
-; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
+; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -1167,7 +1172,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1703,11 +1709,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0
; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3]
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
+; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2186,8 +2193,9 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 25, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
ret i7 %ctlz
@@ -2278,8 +2286,9 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 14
+; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0x3ffff
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
+; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 14
; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff
; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0
@@ -2317,8 +2326,9 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 14, v0
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
ret i18 %ctlz
@@ -2355,10 +2365,12 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 14, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x3ffff, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 14, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 14, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
ret <2 x i18> %ctlz
@@ -2368,12 +2380,16 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
; SI-LABEL: v_ctlz_zero_undef_v2i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_ffbh_u32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
+; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_ctlz_zero_undef_v2i16:
@@ -2394,10 +2410,12 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
ret <2 x i16> %ctlz
@@ -2407,15 +2425,20 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
; SI-LABEL: v_ctlz_zero_undef_v3i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: v_ffbh_u32_e32 v3, v2
+; SI-NEXT: v_ffbh_u32_e32 v2, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_or_b32_e32 v2, 0x200000, v3
+; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT: v_add_i32_e32 v3, vcc, -16, v2
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0xfff00000, v0
+; SI-NEXT: v_or_b32_e32 v2, 0x100000, v2
; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2439,11 +2462,14 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
ret <3 x i16> %ctlz
@@ -2453,18 +2479,24 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
; SI-LABEL: v_ctlz_zero_undef_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_ffbh_u32_e32 v3, v3
; SI-NEXT: v_ffbh_u32_e32 v2, v2
; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT: v_add_i32_e32 v2, vcc, -16, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_or_b32_e32 v2, v3, v2
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, 0xfff00000, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0xfff00000, v0
; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2492,13 +2524,18 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v3, 16, v3
; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0
-; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, s4, 16, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 16, v1
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
ret <4 x i16> %ctlz
@@ -2508,24 +2545,27 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
; SI-LABEL: v_ctlz_zero_undef_v2i8:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: v_ffbh_u32_e32 v1, v1
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0xffffe800, v0
+; SI-NEXT: v_bfe_u32 v1, v0, 8, 8
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_ctlz_zero_undef_v2i8:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1
-; VI-NEXT: v_ffbh_u32_e32 v1, v1
-; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: v_and_b32_e32 v1, 0xff, v1
+; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v1, 0xe800, v1
+; VI-NEXT: v_subrev_u16_e32 v0, 24, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_v2i8:
@@ -2536,8 +2576,10 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 24, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
ret <2 x i8> %ctlz
@@ -2579,10 +2621,12 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 25, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 25, v0
+; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 25, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)
ret <2 x i7> %ctlz