author     Jonathan Peyton <jonathan.l.peyton@intel.com>   2023-11-08 10:19:37 -0600
committer  GitHub <noreply@github.com>                     2023-11-08 10:19:37 -0600
commit     5cc603cb2244bb46683e4ce801fdf224aa2d1636 (patch)
tree       ff8d228b470c3f557ca77e8f0c1441874ba7c46d
parent     3dff285679323ccad60e658ea3cc6900f037e528 (diff)
[OpenMP] Add skewed iteration distribution on hybrid systems (#69946)
This commit adds skewed distribution of iterations for the nonmonotonic:dynamic
schedule (static steal) on hybrid systems when thread affinity is assigned.
Currently, iterations are distributed at a 60:40 ratio between performance and
efficient cores. Consider a loop with dynamic schedule type,
for (int i = 0; i < 100; ++i). On a hybrid system with 20 hardware threads
(16 CORE and 4 ATOM), 88 iterations are assigned to the performance cores and
12 iterations to the efficient cores: each thread on a CORE processes
5 iterations plus extras, and each thread on an ATOM core processes
3 iterations.

Differential Revision: https://reviews.llvm.org/D152955
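To make the 60:40 arithmetic concrete, below is a minimal standalone sketch
(not part of the patch) that reproduces the split computed in
__kmp_dispatch_init_algorithm for the example above. It assumes a chunk size
of 1 (so chunks equal iterations); the helper names and the main() driver are
illustrative only.

  // Standalone sketch of the skew computation; mirrors the rounding helpers
  // and the p_ntc/e_ntc split added in kmp_dispatch.cpp, not the runtime code.
  #include <cstdio>

  static int round_val(float num) {    // cf. __kmp_get_round_val
    return static_cast<int>(num < 0 ? num - 0.5f : num + 0.5f);
  }
  static float round_2dec(float num) { // cf. __kmp_round_2decimal_val
    return static_cast<float>(static_cast<int>(num * 100 + 0.5f)) / 100;
  }

  int main() {
    const int ntc = 100;                        // total chunks in the example loop
    const int num_pcores = 16, num_ecores = 4;  // hybrid topology from the example
    const int nproc = num_pcores + num_ecores;

    const float multiplier = 60.0f / 40.0f;     // 60:40 skew toward performance cores
    const float p_ratio = (float)num_pcores / nproc;
    const float e_ratio = (float)num_ecores / nproc;
    const float e_mult = 1.0f / ((multiplier * num_pcores) / nproc + e_ratio);
    const float p_mult = multiplier * e_mult;

    const int p_ntc = round_val(ntc * p_ratio * p_mult);
    const int e_ntc = (p_ntc > (int)(ntc * p_ratio * p_mult))
                          ? (int)round_2dec(ntc * e_ratio * e_mult)
                          : round_val(ntc * e_ratio * e_mult);

    const int big_chunk = p_ntc / num_pcores;   // chunks per P-core thread
    const int small_chunk = e_ntc / num_ecores; // chunks per E-core thread
    const int extras = p_ntc % num_pcores + e_ntc % num_ecores;

    // Prints p_ntc=86 e_ntc=14 big=5 small=3 extras=8.
    std::printf("p_ntc=%d e_ntc=%d big=%d small=%d extras=%d\n",
                p_ntc, e_ntc, big_chunk, small_chunk, extras);
    return 0;
  }

With these numbers, the 8 extra chunks land on the lowest thread ids (all
P-core threads when affinity places P-cores first), giving 16*5 + 8 = 88
iterations to performance cores and 4*3 = 12 to efficient cores, as quoted
above.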
-rw-r--r--  openmp/runtime/src/kmp.h                                        |  67
-rw-r--r--  openmp/runtime/src/kmp_affinity.cpp                             |  38
-rw-r--r--  openmp/runtime/src/kmp_dispatch.cpp                             | 209
-rw-r--r--  openmp/runtime/src/kmp_dispatch.h                               |  14
-rw-r--r--  openmp/runtime/src/kmp_global.cpp                               |   3
-rw-r--r--  openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c |   1
6 files changed, 276 insertions(+), 56 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index dc759ab1c527..f95d008f2c6a 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -27,6 +27,9 @@
#ifndef KMP_STATIC_STEAL_ENABLED
#define KMP_STATIC_STEAL_ENABLED 1
#endif
+#define KMP_WEIGHTED_ITERATIONS_SUPPORTED \
+ (KMP_AFFINITY_SUPPORTED && KMP_STATIC_STEAL_ENABLED && \
+ (KMP_ARCH_X86 || KMP_ARCH_X86_64))
#define TASK_CURRENT_NOT_QUEUED 0
#define TASK_CURRENT_QUEUED 1
@@ -881,14 +884,8 @@ typedef struct kmp_affinity_flags_t {
KMP_BUILD_ASSERT(sizeof(kmp_affinity_flags_t) == 4);
typedef struct kmp_affinity_ids_t {
+ int os_id;
int ids[KMP_HW_LAST];
- int operator[](size_t idx) const { return ids[idx]; }
- int &operator[](size_t idx) { return ids[idx]; }
- kmp_affinity_ids_t &operator=(const kmp_affinity_ids_t &rhs) {
- for (int i = 0; i < KMP_HW_LAST; ++i)
- ids[i] = rhs[i];
- return *this;
- }
} kmp_affinity_ids_t;
typedef struct kmp_affinity_attrs_t {
@@ -938,6 +935,10 @@ extern kmp_affin_mask_t *__kmp_affin_fullMask;
extern kmp_affin_mask_t *__kmp_affin_origMask;
extern char *__kmp_cpuinfo_file;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_first_osid_with_ecore;
+#endif
+
#endif /* KMP_AFFINITY_SUPPORTED */
// This needs to be kept in sync with the values in omp.h !!!
@@ -1849,12 +1850,9 @@ typedef struct kmp_sched_flags {
unsigned ordered : 1;
unsigned nomerge : 1;
unsigned contains_last : 1;
-#if KMP_USE_HIER_SCHED
- unsigned use_hier : 1;
- unsigned unused : 28;
-#else
- unsigned unused : 29;
-#endif
+ unsigned use_hier : 1; // Used in KMP_USE_HIER_SCHED code
+ unsigned use_hybrid : 1; // Used in KMP_WEIGHTED_ITERATIONS_SUPPORTED code
+ unsigned unused : 27;
} kmp_sched_flags_t;
KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
@@ -1868,26 +1866,37 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 st;
kmp_int32 tc;
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ kmp_uint32 ordered_lower;
+ kmp_uint32 ordered_upper;
+
// KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
// a) parm3 is properly aligned and
// b) all parm1-4 are on the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are on the same cache line (not measured though).
- struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template
- kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should
- kmp_int32 parm2; // make no real change at least while padding is off.
+ struct KMP_ALIGN(32) {
+ kmp_int32 parm1;
+ kmp_int32 parm2;
kmp_int32 parm3;
kmp_int32 parm4;
};
- kmp_uint32 ordered_lower;
- kmp_uint32 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ kmp_uint32 pchunks;
+ kmp_uint32 num_procs_with_pcore;
+ kmp_int32 first_thread_with_ecore;
+#endif
#if KMP_OS_WINDOWS
kmp_int32 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info32_t;
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info32_t) <= 128);
+#endif
+
typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 count; // current chunk number for static & static-steal scheduling
kmp_int64 ub; /* upper-bound */
@@ -1896,14 +1905,16 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 st; /* stride */
kmp_int64 tc; /* trip count (number of iterations) */
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ kmp_uint64 ordered_lower;
+ kmp_uint64 ordered_upper;
/* parm[1-4] are used in different ways by different scheduling algorithms */
- // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+ // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
-
struct KMP_ALIGN(32) {
kmp_int64 parm1;
kmp_int64 parm2;
@@ -1911,12 +1922,21 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 {
kmp_int64 parm4;
};
- kmp_uint64 ordered_lower;
- kmp_uint64 ordered_upper;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ kmp_uint64 pchunks;
+ kmp_uint64 num_procs_with_pcore;
+ kmp_int64 first_thread_with_ecore;
+#endif
+
#if KMP_OS_WINDOWS
kmp_int64 last_upper;
#endif /* KMP_OS_WINDOWS */
} dispatch_private_info64_t;
+
+#if CACHE_LINE <= 128
+KMP_BUILD_ASSERT(sizeof(dispatch_private_info64_t) <= 128);
+#endif
+
#else /* KMP_STATIC_STEAL_ENABLED */
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 lb;
@@ -3862,6 +3882,9 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+extern int __kmp_get_first_osid_with_ecore(void);
+#endif
#if KMP_OS_LINUX || KMP_OS_FREEBSD
extern int kmp_set_thread_affinity_mask_initial(void);
#endif
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp
index 8c608d78bb56..7009730a49ba 100644
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -4196,7 +4196,7 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
// Initialize ids and attrs thread data
for (int i = 0; i < KMP_HW_LAST; ++i)
- ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
+ ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
// Iterate through each os id within the mask and determine
@@ -4205,19 +4205,20 @@ static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
int depth = __kmp_topology->get_depth();
KMP_CPU_SET_ITERATE(cpu, mask) {
int osid_idx = __kmp_osid_to_hwthread_map[cpu];
+ ids.os_id = cpu;
const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
for (int level = 0; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);
int id = hw_thread.sub_ids[level];
- if (ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids[type] == id) {
- ids[type] = id;
+ if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
+ ids.ids[type] = id;
} else {
// This mask spans across multiple topology units, set it as such
// and mark every level below as such as well.
- ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
for (; level < depth; ++level) {
kmp_hw_t type = __kmp_topology->get_type(level);
- ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
+ ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
}
}
}
@@ -4297,6 +4298,9 @@ static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
__kmp_affinity_get_topology_info(affinity);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
+#endif
}
}
@@ -4876,7 +4880,7 @@ void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
// Set the thread topology information to default of unknown
for (int id = 0; id < KMP_HW_LAST; ++id)
- th->th.th_topology_ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
+ th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
if (!KMP_AFFINITY_CAPABLE()) {
@@ -5273,6 +5277,28 @@ int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Returns first os proc id with ATOM core
+int __kmp_get_first_osid_with_ecore(void) {
+ int low = 0;
+ int high = __kmp_topology->get_num_hw_threads() - 1;
+ int mid = 0;
+ while (high - low > 1) {
+ mid = (high + low) / 2;
+ if (__kmp_topology->at(mid).attrs.get_core_type() ==
+ KMP_HW_CORE_TYPE_CORE) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
+ return mid;
+ }
+ return -1;
+}
+#endif
+
// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_DEBUG_ASSERT(th);
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index a6ee844e5988..ac85b2b3f2fc 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -90,6 +90,70 @@ static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
return monotonicity;
}
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+// Return floating-point number rounded to two decimal places
+static inline float __kmp_round_2decimal_val(float num) {
+ return (float)(static_cast<int>(num * 100 + 0.5)) / 100;
+}
+static inline int __kmp_get_round_val(float num) {
+ return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);
+}
+#endif
+
+template <typename T>
+inline void
+__kmp_initialize_self_buffer(kmp_team_t *team, T id,
+ dispatch_private_info_template<T> *pr,
+ typename traits_t<T>::unsigned_t nchunks, T nproc,
+ typename traits_t<T>::unsigned_t &init,
+ T &small_chunk, T &extras, T &p_extra) {
+
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ if (pr->flags.use_hybrid) {
+ kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
+ kmp_hw_core_type_t type =
+ (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+ T pchunks = pr->u.p.pchunks;
+ T echunks = nchunks - pchunks;
+ T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
+ T num_procs_with_ecore = nproc - num_procs_with_pcore;
+ T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
+ T big_chunk =
+ pchunks / num_procs_with_pcore; // chunks per thread with p-core
+ small_chunk =
+ echunks / num_procs_with_ecore; // chunks per thread with e-core
+
+ extras =
+ (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
+
+ p_extra = (big_chunk - small_chunk);
+
+ if (type == KMP_HW_CORE_TYPE_CORE) {
+ if (id < first_thread_with_ecore) {
+ init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+ (id < extras ? id : extras);
+ }
+ } else {
+ if (id == first_thread_with_ecore) {
+ init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + first_thread_with_ecore * p_extra +
+ (id < extras ? id : extras);
+ }
+ }
+ p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+ return;
+ }
+#endif
+
+ small_chunk = nchunks / nproc; // chunks per thread
+ extras = nchunks % nproc;
+ p_extra = 0;
+ init = id * small_chunk + (id < extras ? id : extras);
+}
+
#if KMP_STATIC_STEAL_ENABLED
enum { // values for steal_flag (possible states of private per-loop buffer)
UNUSED = 0,
@@ -366,7 +430,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
switch (schedule) {
#if KMP_STATIC_STEAL_ENABLED
case kmp_sch_static_steal: {
- T ntc, init;
+ T ntc, init = 0;
KD_TRACE(100,
("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
@@ -376,7 +440,7 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
if (nproc > 1 && ntc >= nproc) {
KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
T id = tid;
- T small_chunk, extras;
+ T small_chunk, extras, p_extra = 0;
kmp_uint32 old = UNUSED;
int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
if (traits_t<T>::type_size > 4) {
@@ -388,13 +452,110 @@ void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
__kmp_init_lock(pr->u.p.steal_lock);
}
- small_chunk = ntc / nproc;
- extras = ntc % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ // Iterations are divided in a 60/40 skewed distribution among CORE and
+ // ATOM processors for hybrid systems
+ bool use_hybrid = false;
+ kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;
+ T first_thread_with_ecore = 0;
+ T num_procs_with_pcore = 0;
+ T num_procs_with_ecore = 0;
+ T p_ntc = 0, e_ntc = 0;
+ if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
+ __kmp_affinity.type != affinity_explicit) {
+ use_hybrid = true;
+ core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
+ if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
+ __kmp_first_osid_with_ecore > -1) {
+ for (int i = 0; i < team->t.t_nproc; ++i) {
+ kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
+ ->th.th_topology_attrs.core_type;
+ int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
+ if (id == __kmp_first_osid_with_ecore) {
+ first_thread_with_ecore =
+ team->t.t_threads[i]->th.th_info.ds.ds_tid;
+ }
+ if (type == KMP_HW_CORE_TYPE_CORE) {
+ num_procs_with_pcore++;
+ } else if (type == KMP_HW_CORE_TYPE_ATOM) {
+ num_procs_with_ecore++;
+ } else {
+ use_hybrid = false;
+ break;
+ }
+ }
+ }
+ if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
+ float multiplier = 60.0 / 40.0;
+ float p_ratio = (float)num_procs_with_pcore / nproc;
+ float e_ratio = (float)num_procs_with_ecore / nproc;
+ float e_multiplier =
+ (float)1 /
+ (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
+ float p_multiplier = multiplier * e_multiplier;
+ p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
+ if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
+ e_ntc =
+ (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
+ else
+ e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
+ KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
+
+ // Use regular static steal if not enough chunks for skewed
+ // distribution
+ use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
+ e_ntc >= num_procs_with_ecore)
+ ? true
+ : false);
+ } else {
+ use_hybrid = false;
+ }
+ }
+ pr->flags.use_hybrid = use_hybrid;
+ pr->u.p.pchunks = p_ntc;
+ pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
+ pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
+
+ if (use_hybrid) {
+ KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
+ T big_chunk = p_ntc / num_procs_with_pcore;
+ small_chunk = e_ntc / num_procs_with_ecore;
+
+ extras =
+ (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
+
+ p_extra = (big_chunk - small_chunk);
+
+ if (core_type == KMP_HW_CORE_TYPE_CORE) {
+ if (id < first_thread_with_ecore) {
+ init =
+ id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
+ (id < extras ? id : extras);
+ }
+ } else {
+ if (id == first_thread_with_ecore) {
+ init =
+ id * small_chunk + id * p_extra + (id < extras ? id : extras);
+ } else {
+ init = id * small_chunk + first_thread_with_ecore * p_extra +
+ (id < extras ? id : extras);
+ }
+ }
+ p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
+ } else
+#endif
+ {
+ small_chunk = ntc / nproc;
+ extras = ntc % nproc;
+ init = id * small_chunk + (id < extras ? id : extras);
+ p_extra = 0;
+ }
pr->u.p.count = init;
if (claimed) { // did we succeed in claiming own buffer?
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// Other threads will inspect steal_flag when searching for a victim.
// READY means other threads may steal from this thread from now on.
KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
@@ -1261,13 +1422,13 @@ int __kmp_dispatch_next_algorithm(int gtid,
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+ T small_chunk = 0, extras = 0, p_extra = 0;
+ __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+ init, small_chunk, extras,
+ p_extra);
__kmp_acquire_lock(lck, gtid);
pr->u.p.count = init + 1; // exclude one we execute immediately
- pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
__kmp_release_lock(lck, gtid);
pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
// no need to reinitialize other thread invariants: lb, st, etc.
@@ -1275,10 +1436,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
+ buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+ "stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
@@ -1404,12 +1565,12 @@ int __kmp_dispatch_next_algorithm(int gtid,
if (status) {
// initialize self buffer with victim's whole range of chunks
T id = victimId;
- T small_chunk, extras;
- small_chunk = nchunks / nproc; // chunks per thread
- extras = nchunks % nproc;
- init = id * small_chunk + (id < extras ? id : extras);
+ T small_chunk = 0, extras = 0, p_extra = 0;
+ __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
+ init, small_chunk, extras,
+ p_extra);
vnew.p.count = init + 1;
- vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
+ vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
// write pair (count, ub) at once atomically
#if KMP_ARCH_X86
KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
@@ -1422,10 +1583,10 @@ int __kmp_dispatch_next_algorithm(int gtid,
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
- "count:%%%s ub:%%%s\n",
- traits_t<UT>::spec, traits_t<T>::spec);
+ buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+ "stolen chunks from T#%%d, "
+ "count:%%%s ub:%%%s\n",
+ traits_t<UT>::spec, traits_t<T>::spec);
KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
__kmp_str_free(&buff);
}
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
index 154db174613d..cf19eb52662c 100644
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -75,14 +75,17 @@ template <typename T> struct dispatch_private_infoXX_template {
ST st; // signed
UT tc; // unsigned
kmp_lock_t *steal_lock; // lock used for chunk stealing
+
+ UT ordered_lower; // unsigned
+ UT ordered_upper; // unsigned
+
/* parm[1-4] are used in different ways by different scheduling algorithms */
- // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+ // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on )
// a) parm3 is properly aligned and
// b) all parm1-4 are in the same cache line.
// Because parm1-4 are used together, performance seems to be better
// if they are in the same line (not measured though).
-
struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
T parm1;
T parm2;
@@ -90,8 +93,11 @@ template <typename T> struct dispatch_private_infoXX_template {
T parm4;
};
- UT ordered_lower; // unsigned
- UT ordered_upper; // unsigned
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+ UT pchunks; // total number of chunks for processes with p-core
+ UT num_procs_with_pcore; // number of threads with p-core
+ T first_thread_with_ecore;
+#endif
#if KMP_OS_WINDOWS
T last_upper;
#endif /* KMP_OS_WINDOWS */
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 48097fb530d1..b132f38fd3b0 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -282,6 +282,9 @@ kmp_affinity_t __kmp_hh_affinity =
kmp_affinity_t *__kmp_affinities[] = {&__kmp_affinity, &__kmp_hh_affinity};
char *__kmp_cpuinfo_file = NULL;
+#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
+int __kmp_first_osid_with_ecore = -1;
+#endif
#endif /* KMP_AFFINITY_SUPPORTED */
diff --git a/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c b/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
index 4433d2a3dafb..419187321d28 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
+++ b/openmp/runtime/test/worksharing/for/omp_for_schedule_dynamic.c
@@ -1,4 +1,5 @@
// RUN: %libomp-compile-and-run
+// RUN: env KMP_AFFINITY=compact,0 %libomp-run
/*
* Test for dynamic scheduling with chunk size
* Method: calculate how many times the iteration space is dispatched