summaryrefslogtreecommitdiffstats
path: root/libc/src/__support/GPU/nvptx/utils.h
diff options
context:
space:
mode:
Diffstat (limited to 'libc/src/__support/GPU/nvptx/utils.h')
-rw-r--r--libc/src/__support/GPU/nvptx/utils.h10
1 files changed, 9 insertions, 1 deletions
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index a92c8847b6ec..3f19afb83648 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -97,7 +97,7 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
/// Returns the id of the thread inside of a CUDA warp executing together.
[[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
- return get_thread_id() & (get_lane_size() - 1);
+ return __nvvm_read_ptx_sreg_laneid();
}
/// Returns the bit-mask of active threads in the current warp.
@@ -126,6 +126,14 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; }
__nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
}
+/// Shuffles the lanes inside the warp according to the given index.
+[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
+ uint32_t idx, uint32_t x) {
+ uint32_t mask = static_cast<uint32_t>(lane_mask);
+ uint32_t bitmask = (mask >> idx) & 1;
+ return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
+}
+
/// Returns the current value of the GPU's processor clock.
LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }