summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorShilei Tian <i@tianshilei.me>2023-11-05 22:48:55 -0500
committerGitHub <noreply@github.com>2023-11-05 22:48:55 -0500
commitae1446bd2c8ee6ed222e43a5ffffaa73b4a5b41a (patch)
tree72d0f8cb3a68ae64da587949f657bdff984b9d77
parentfbdf6e2724e11baa4441cd053804fb2d2375817a (diff)
Revert "[OpenMP] Simplify parallel reductions (#70983)"upstream/revert-70983-reduction_opt
This reverts commit e9a48f9e05c103a235993c6b15a2c36442a2ddc1.
-rw-r--r--openmp/libomptarget/DeviceRTL/src/Reduction.cpp120
-rw-r--r--openmp/libomptarget/test/offloading/generic_reduction.c25
2 files changed, 98 insertions, 47 deletions
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index 29a484aa0eb2..fc5a3b08cb9d 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -44,45 +44,119 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
}
}
-static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
+ ShuffleReductFnTy shflFct) {
+ uint32_t size, remote_id, physical_lane_id;
+ physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
+ __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
+ __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+ uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
+ __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
+ do {
+ Liveness = mapping::activemask();
+ remote_id = utils::ffs(Liveness & lanemask_gt);
+ size = utils::popc(Liveness);
+ logical_lane_id /= 2;
+ shflFct(reduce_data, /*LaneId =*/logical_lane_id,
+ /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
+ } while (logical_lane_id % 2 == 0 && size > 1);
+ return (logical_lane_id == 0);
+}
+#endif
+
+static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
+ uint64_t reduce_size,
+ void *reduce_data,
ShuffleReductFnTy shflFct,
- InterWarpCopyFnTy cpyFct) {
+ InterWarpCopyFnTy cpyFct,
+ bool isSPMDExecutionMode, bool) {
+ uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+ if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
+ BlockThreadId = 0;
uint32_t NumThreads = omp_get_num_threads();
- // Handle degenerated parallel regions, including all nested ones, first.
if (NumThreads == 1)
return 1;
-
- /*
- * 1. Reduce within a warp.
- * 2. Warp master copies value to warp 0 via shared memory.
- * 3. Warp 0 reduces to a single value.
- * 4. The reduced value is available in the thread that returns 1.
- */
-
- uint32_t BlockThreadId = mapping::getThreadIdInBlock();
- uint32_t NumWarps =
+ /*
+ * This reduce function handles reduction within a team. It handles
+ * parallel regions in both L1 and L2 parallelism levels. It also
+ * supports Generic, SPMD, and NoOMP modes.
+ *
+ * 1. Reduce within a warp.
+ * 2. Warp master copies value to warp 0 via shared memory.
+ * 3. Warp 0 reduces to a single value.
+ * 4. The reduced value is available in the thread that returns 1.
+ */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+ uint32_t WarpsNeeded =
(NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+ uint32_t WarpId = mapping::getWarpIdInBlock();
+ // Volta execution model:
// For the Generic execution mode a parallel region either has 1 thread and
// beyond that, always a multiple of 32. For the SPMD execution mode we may
// have any number of threads.
- gpu_regular_warp_reduce(reduce_data, shflFct);
+ if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/NumThreads % mapping::getWarpSize(),
+ /*LaneId=*/mapping::getThreadIdInBlock() %
+ mapping::getWarpSize());
// When we have more than [mapping::getWarpSize()] number of threads
// a block reduction is performed here.
+ //
+ // Only L1 parallel region can enter this if condition.
if (NumThreads > mapping::getWarpSize()) {
// Gather all the reduced values from each warp
// to the first warp.
- cpyFct(reduce_data, NumWarps);
+ cpyFct(reduce_data, WarpsNeeded);
- if (BlockThreadId < mapping::getWarpSize())
- gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+ BlockThreadId);
}
-
- // In Generic and in SPMD mode block thread Id 0 is what we want.
- // It's either the main thread in SPMD mode or the "acting" main thread in the
- // parallel region.
return BlockThreadId == 0;
+#else
+ __kmpc_impl_lanemask_t Liveness = mapping::activemask();
+ if (Liveness == lanes::All) // Full warp
+ gpu_regular_warp_reduce(reduce_data, shflFct);
+ else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
+ gpu_irregular_warp_reduce(reduce_data, shflFct,
+ /*LaneCount=*/utils::popc(Liveness),
+ /*LaneId=*/mapping::getThreadIdInBlock() %
+ mapping::getWarpSize());
+ else { // Dispersed lanes. Only threads in L2
+ // parallel region may enter here; return
+ // early.
+ return gpu_irregular_simd_reduce(reduce_data, shflFct);
+ }
+
+ // When we have more than [mapping::getWarpSize()] number of threads
+ // a block reduction is performed here.
+ //
+ // Only L1 parallel region can enter this if condition.
+ if (NumThreads > mapping::getWarpSize()) {
+ uint32_t WarpsNeeded =
+ (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+ // Gather all the reduced values from each warp
+ // to the first warp.
+ cpyFct(reduce_data, WarpsNeeded);
+
+ uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
+ if (WarpId == 0)
+ gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
+ BlockThreadId);
+
+ return BlockThreadId == 0;
+ }
+
+ // Get the OMP thread Id. This is different from BlockThreadId in the case of
+ // an L2 parallel region.
+ return TId == 0;
+#endif // __CUDA_ARCH__ >= 700
}
uint32_t roundToWarpsize(uint32_t s) {
@@ -99,7 +173,9 @@ extern "C" {
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
- return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
+ return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
+ shflFct, cpyFct, mapping::isSPMDMode(),
+ false);
}
/// Mostly like _v2 but with the builtin assumption that we have less than
diff --git a/openmp/libomptarget/test/offloading/generic_reduction.c b/openmp/libomptarget/test/offloading/generic_reduction.c
deleted file mode 100644
index 8b5ff0f067f9..000000000000
--- a/openmp/libomptarget/test/offloading/generic_reduction.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
-// RUN: %libomptarget-compileoptxx-run-and-check-generic
-
-#include <omp.h>
-#include <stdio.h>
-__attribute__((optnone)) void optnone(void) {}
-
-int main() {
- int sum = 0, nt;
-#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
- {
- nt = 3 * omp_get_max_threads();
- optnone();
-#pragma omp parallel reduction(+ : sum)
- sum += 1;
-#pragma omp parallel reduction(+ : sum)
- sum += 1;
-#pragma omp parallel reduction(+ : sum)
- sum += 1;
- }
- // CHECK: nt: [[NT:.*]]
- // CHECK: sum: [[NT]]
- printf("nt: %i\n", nt);
- printf("sum: %i\n", sum);
-}