openmpi3/rmaps-simplify-the-lookup-for-the-binding-object-and-fix-for-hwloc-2.0.patch

307 lines
13 KiB
Diff

commit 8992b7c5996de5e261bbfc9e57b270c8717852f9
Author: Brice Goglin <Brice.Goglin@inria.fr>
Date: Fri Jan 26 16:19:52 2018 +0100
rmaps: simplify the lookup for the binding object and fix for hwloc 2.0
Don't bother doing a lookup upwards or downwards for the target object type.
Just use the target depth and iterate over that level, picking the least-bound
(min_bound) object that intersects the locale cpuset.
Signed-off-by: Brice Goglin <Brice.Goglin@inria.fr>
diff --git orte/mca/rmaps/base/rmaps_base_binding.c orte/mca/rmaps/base/rmaps_base_binding.c
index df3799947514..d6781608f36f 100644
--- orte/mca/rmaps/base/rmaps_base_binding.c
+++ orte/mca/rmaps/base/rmaps_base_binding.c
@@ -15,6 +15,7 @@
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
+ * Copyright (c) 2018 Inria. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -120,142 +121,21 @@ static void unbind_procs(orte_job_t *jdata)
}
}
-static int bind_upwards(orte_job_t *jdata,
+static int bind_generic(orte_job_t *jdata,
orte_node_t *node,
- hwloc_obj_type_t target,
- unsigned cache_level)
-{
- /* traverse the hwloc topology tree on each node upwards
- * until we find an object of type target - and then bind
- * the process to that target
- */
- int j;
- orte_job_map_t *map;
- orte_proc_t *proc;
- hwloc_obj_t obj;
- unsigned int idx, ncpus;
- opal_hwloc_obj_data_t *data;
- hwloc_obj_t locale;
- char *cpu_bitmap;
-
- opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
- "mca:rmaps: bind upwards for job %s with bindings %s",
- ORTE_JOBID_PRINT(jdata->jobid),
- opal_hwloc_base_print_binding(jdata->map->binding));
- /* initialize */
- map = jdata->map;
-
-
- /* cycle thru the procs */
- for (j=0; j < node->procs->size; j++) {
- if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
- continue;
- }
- /* ignore procs from other jobs */
- if (proc->name.jobid != jdata->jobid) {
- continue;
- }
- /* bozo check */
- if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
- orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
- return ORTE_ERR_SILENT;
- }
- /* starting at the locale, move up thru the parents
- * to find the target object type
- */
- cpu_bitmap = NULL;
- for (obj = locale->parent; NULL != obj; obj = obj->parent) {
- opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
- "%s bind:upward target %s type %s",
- ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
- hwloc_obj_type_string(target),
- hwloc_obj_type_string(obj->type));
- if (target == obj->type) {
-#if HWLOC_API_VERSION < 0x20000
- if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) {
- continue;
- }
-#endif
- /* get its index */
- if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology->topo, obj, OPAL_HWLOC_AVAILABLE))) {
- ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
- return ORTE_ERR_SILENT;
- }
- /* track the number bound */
- data = (opal_hwloc_obj_data_t*)obj->userdata;
- data->num_bound++;
- /* get the number of cpus under this location */
- if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology->topo, obj))) {
- orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
- return ORTE_ERR_SILENT;
- }
- /* error out if adding a proc would cause overload and that wasn't allowed,
- * and it wasn't a default binding policy (i.e., the user requested it)
- */
- if (ncpus < data->num_bound &&
- !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
- if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
- /* if the user specified a binding policy, then we cannot meet
- * it since overload isn't allowed, so error out - have the
- * message indicate that setting overload allowed will remove
- * this restriction */
- orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
- opal_hwloc_base_print_binding(map->binding), node->name,
- data->num_bound, ncpus);
- return ORTE_ERR_SILENT;
- } else {
- /* if we have the default binding policy, then just don't bind */
- OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
- unbind_procs(jdata);
- return ORTE_SUCCESS;
- }
- }
- /* bind it here */
- hwloc_bitmap_list_asprintf(&cpu_bitmap, obj->cpuset);
- orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
- /* record the location */
- orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
- opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
- "%s BOUND PROC %s TO %s[%s:%u] on node %s",
- ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
- ORTE_NAME_PRINT(&proc->name),
- cpu_bitmap,
- hwloc_obj_type_string(target),
- idx, node->name);
- break;
- }
- }
- if (NULL == cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) {
- /* didn't find anyone to bind to - this is an error
- * unless the user specified if-supported
- */
- orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true,
- opal_hwloc_base_print_binding(map->binding), node->name);
- return ORTE_ERR_SILENT;
- }
- if (NULL != cpu_bitmap) {
- free(cpu_bitmap);
- }
- }
-
- return ORTE_SUCCESS;
-}
-
-static int bind_downwards(orte_job_t *jdata,
- orte_node_t *node,
- hwloc_obj_type_t target,
- unsigned cache_level)
+ int target_depth)
{
int j;
orte_job_map_t *map;
orte_proc_t *proc;
- hwloc_obj_t trg_obj, nxt_obj;
+ hwloc_obj_t trg_obj, tmp_obj, nxt_obj;
unsigned int ncpus;
opal_hwloc_obj_data_t *data;
int total_cpus;
hwloc_cpuset_t totalcpuset;
hwloc_obj_t locale;
char *cpu_bitmap;
+ unsigned min_bound;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: bind downward for job %s with bindings %s",
@@ -282,12 +162,24 @@ static int bind_downwards(orte_job_t *jdata,
hwloc_bitmap_free(totalcpuset);
return ORTE_ERR_SILENT;
}
- /* we don't know if the target is a direct child of this locale,
- * or if it is some depth below it, so we have to conduct a bit
- * of a search. Let hwloc find the min usage one for us.
- */
- trg_obj = opal_hwloc_base_find_min_bound_target_under_obj(node->topology->topo, locale,
- target, cache_level);
+
+ /* use the min_bound object that intersects locale->cpuset at target_depth */
+ tmp_obj = NULL;
+ trg_obj = NULL;
+ min_bound = UINT_MAX;
+ while (tmp_obj = hwloc_get_next_obj_by_depth(node->topology->topo, target_depth, tmp_obj)) {
+ if (!hwloc_bitmap_intersects(locale->cpuset, tmp_obj->cpuset))
+ continue;
+ data = (opal_hwloc_obj_data_t*)tmp_obj->userdata;
+ if (NULL == data) {
+ data = OBJ_NEW(opal_hwloc_obj_data_t);
+ tmp_obj->userdata = data;
+ }
+ if (data->num_bound < min_bound) {
+ min_bound = data->num_bound;
+ trg_obj = tmp_obj;
+ }
+ }
if (NULL == trg_obj) {
/* there aren't any such targets under this object */
orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
@@ -296,6 +188,7 @@ static int bind_downwards(orte_job_t *jdata,
}
/* record the location */
orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, trg_obj, OPAL_PTR);
+
/* start with a clean slate */
hwloc_bitmap_zero(totalcpuset);
total_cpus = 0;
@@ -685,7 +578,7 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
int i, rc;
struct hwloc_topology_support *support;
bool force_down = false;
- int bind_depth, map_depth;
+ int bind_depth;
opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
"mca:rmaps: compute bindings for job %s with policy %s[%x]",
@@ -904,62 +797,35 @@ int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
*/
reset_usage(node, jdata->jobid);
- if (force_down) {
- if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
- ORTE_ERROR_LOG(rc);
- return rc;
- }
- } else {
- /* determine the relative depth on this node */
+ /* determine the relative depth on this node */
#if HWLOC_API_VERSION < 0x20000
- if (HWLOC_OBJ_CACHE == hwb) {
- /* must use a unique function because blasted hwloc
- * just doesn't deal with caches very well...sigh
- */
- bind_depth = hwloc_get_cache_type_depth(node->topology->topo, clvl, (hwloc_obj_cache_type_t)-1);
- } else
+ if (HWLOC_OBJ_CACHE == hwb) {
+ /* must use a unique function because blasted hwloc
+ * just doesn't deal with caches very well...sigh
+ */
+ bind_depth = hwloc_get_cache_type_depth(node->topology->topo, clvl, (hwloc_obj_cache_type_t)-1);
+ } else
#endif
- bind_depth = hwloc_get_type_depth(node->topology->topo, hwb);
- if (0 > bind_depth) {
- /* didn't find such an object */
- orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
- true, hwloc_obj_type_string(hwb), node->name);
- return ORTE_ERR_SILENT;
- }
+ bind_depth = hwloc_get_type_depth(node->topology->topo, hwb);
#if HWLOC_API_VERSION < 0x20000
- if (HWLOC_OBJ_CACHE == hwm) {
- /* must use a unique function because blasted hwloc
- * just doesn't deal with caches very well...sigh
- */
- map_depth = hwloc_get_cache_type_depth(node->topology->topo, clvm, (hwloc_obj_cache_type_t)-1);
- } else
+ if (0 > bind_depth)
#else
- /* do something with clvm to silence compiler warnings */
- ++clvm;
+ if (0 > bind_depth && HWLOC_TYPE_DEPTH_NUMANODE != bind_depth)
#endif
- map_depth = hwloc_get_type_depth(node->topology->topo, hwm);
- if (0 > map_depth) {
- /* didn't find such an object */
- orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
- true, hwloc_obj_type_string(hwm), node->name);
- return ORTE_ERR_SILENT;
- }
- opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
- "%s bind_depth: %d map_depth %d",
- ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
- bind_depth, map_depth);
- if (bind_depth > map_depth) {
- if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
- ORTE_ERROR_LOG(rc);
- return rc;
- }
- } else {
- if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) {
- ORTE_ERROR_LOG(rc);
- return rc;
- }
- }
- }
+ {
+ /* didn't find such an object */
+ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
+ true, hwloc_obj_type_string(hwb), node->name);
+ return ORTE_ERR_SILENT;
+ }
+ opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
+ "%s bind_depth: %d",
+ ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+ bind_depth);
+ if (ORTE_SUCCESS != (rc = bind_generic(jdata, node, bind_depth))) {
+ ORTE_ERROR_LOG(rc);
+ return rc;
+ }
}
return ORTE_SUCCESS;