diff options
Diffstat (limited to 'libc/src/__support/GPU/nvptx/utils.h')
-rw-r--r-- | libc/src/__support/GPU/nvptx/utils.h | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h index a92c8847b6ec..3f19afb83648 100644 --- a/libc/src/__support/GPU/nvptx/utils.h +++ b/libc/src/__support/GPU/nvptx/utils.h @@ -97,7 +97,7 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; } /// Returns the id of the thread inside of a CUDA warp executing together. [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() { - return get_thread_id() & (get_lane_size() - 1); + return __nvvm_read_ptx_sreg_laneid(); } /// Returns the bit-mask of active threads in the current warp. @@ -126,6 +126,14 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; } __nvvm_bar_warp_sync(static_cast<uint32_t>(mask)); } +/// Shuffles the lanes inside the warp according to the given index. +[[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, + uint32_t idx, uint32_t x) { + uint32_t mask = static_cast<uint32_t>(lane_mask); + uint32_t bitmask = (mask >> idx) & 1; + return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1); +} + /// Returns the current value of the GPU's processor clock. LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } |