forked from pool/libfabric
779aa64bf6
- Fix GitHub issue #3393: Add 0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch
OBS-URL: https://build.opensuse.org/request/show/538233
OBS-URL: https://build.opensuse.org/package/show/science:HPC/libfabric?expand=0&rev=24
87 lines
2.9 KiB
Diff
87 lines
2.9 KiB
Diff
From 6c8c40ad84fa790831407e5cd25375af898d929b Mon Sep 17 00:00:00 2001
|
|
From: Jianxin Xiong <jianxin.xiong@intel.com>
|
|
Date: Mon, 9 Oct 2017 15:10:45 -0700
|
|
Subject: [PATCH] prov/psm: Eliminate psm2-compat library delay with hfi
|
|
devices missing
|
|
|
|
The PSM2 library may introduce a 15-second delay at device initialization
|
|
time if the hfi devices are missing. This has been handled in the psm2
|
|
provider by checking the existence of the device files before initializing
|
|
the device.
|
|
|
|
The psm provider didn't handle this situation because the issue doesn't
|
|
exist with the native PSM library over TrueScale. However, when PSM is
|
|
supported via the psm2-compat library over PSM2, the same delay can be
|
|
observed.
|
|
|
|
Now add the same mechanism to the psm provider.
|
|
|
|
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
|
|
---
|
|
prov/psm/src/psmx_init.c | 34 ++++++++++++++++++++++++++++++++++
|
|
1 file changed, 34 insertions(+)
|
|
|
|
diff --git a/prov/psm/src/psmx_init.c b/prov/psm/src/psmx_init.c
|
|
index 118ef1a81..c4b06160c 100644
|
|
--- a/prov/psm/src/psmx_init.c
|
|
+++ b/prov/psm/src/psmx_init.c
|
|
@@ -32,10 +32,12 @@
|
|
|
|
#include "psmx.h"
|
|
#include "prov.h"
|
|
+#include <glob.h>
|
|
|
|
static int psmx_init_count = 0;
|
|
static int psmx_lib_initialized = 0;
|
|
static pthread_mutex_t psmx_lib_mutex;
|
|
+static int psmx_compat_lib = 0;
|
|
|
|
struct psmx_env psmx_env = {
|
|
.name_server = 1,
|
|
@@ -103,6 +105,12 @@ static int psmx_init_lib(void)
|
|
PSM_VERNO_MAJOR, PSM_VERNO_MINOR, major, minor);
|
|
}
|
|
|
|
+ if (major > 1) {
|
|
+ psmx_compat_lib = 1;
|
|
+ FI_INFO(&psmx_prov, FI_LOG_CORE,
|
|
+ "PSM is supported via the psm2-compat library over PSM2.\n");
|
|
+ }
|
|
+
|
|
psmx_lib_initialized = 1;
|
|
|
|
out:
|
|
@@ -197,6 +205,32 @@ static int psmx_getinfo(uint32_t version, const char *node, const char *service,
|
|
if (psmx_init_lib())
|
|
return -FI_ENODATA;
|
|
|
|
+ if (psmx_compat_lib) {
|
|
+ /*
|
|
+ * native PSM running over TrueScale doesn't have the issue handled
|
|
+ * here. it's only present when PSM is supported via the psm2-compat
|
|
+ * library, where the PSM functions are just wrappers around the PSM2
|
|
+ * counterparts.
|
|
+ *
|
|
+ * psm2_ep_num_devunits() may wait for 15 seconds before return
|
|
+ * when /dev/hfi1_0 is not present. Check the existence of any hfi1
|
|
+ * device interface first to avoid this delay. Note that the devices
|
|
+ * don't necessarily appear consecutively so we need to check all
|
|
+ * possible device names before returning "no device found" error.
|
|
+ * This also means if "/dev/hfi1_0" doesn't exist but other devices
|
|
+ * exist, we are still going to see the delay; but that's a rare case.
|
|
+ */
|
|
+ glob_t glob_buf;
|
|
+
|
|
+ if ((glob("/dev/hfi1_[0-9]", 0, NULL, &glob_buf) != 0) &&
|
|
+ (glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &glob_buf) != 0)) {
|
|
+ FI_INFO(&psmx_prov, FI_LOG_CORE,
|
|
+ "no hfi1 device is found.\n");
|
|
+ return -FI_ENODATA;
|
|
+ }
|
|
+ globfree(&glob_buf);
|
|
+ }
|
|
+
|
|
if (psm_ep_num_devunits(&cnt) || !cnt) {
|
|
FI_INFO(&psmx_prov, FI_LOG_CORE,
|
|
"no PSM device is found.\n");
|