From 779aa64bf6bf81a54a4f8dfb8379e574696b50e6806342d33fd1b312a84f2cd4 Mon Sep 17 00:00:00 2001 From: Nicolas Morey-Chaisemartin Date: Thu, 2 Nov 2017 16:19:23 +0000 Subject: [PATCH] Accepting request 538233 from home:StefanBruens:branches:science:HPC - Fix github issue #3393: Add 0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch OBS-URL: https://build.opensuse.org/request/show/538233 OBS-URL: https://build.opensuse.org/package/show/science:HPC/libfabric?expand=0&rev=24 --- ...te-psm2-compat-library-delay-with-hf.patch | 86 +++++++++++++++++++ libfabric.changes | 6 ++ libfabric.spec | 2 + 3 files changed, 94 insertions(+) create mode 100644 0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch diff --git a/0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch b/0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch new file mode 100644 index 0000000..f33427f --- /dev/null +++ b/0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch @@ -0,0 +1,86 @@ +From 6c8c40ad84fa790831407e5cd25375af898d929b Mon Sep 17 00:00:00 2001 +From: Jianxin Xiong +Date: Mon, 9 Oct 2017 15:10:45 -0700 +Subject: [PATCH] prov/psm: Eliminate psm2-compat library delay with hfi + devices missing + +The PSM2 library may introduce a 15 second delay at device initialization +time if the hfi devices are missing. This has been handled in the psm2 +provider by checking the existence of the device files before initializing +the device. + +The psm provider didn't handle this situation because the issue doesn't +exist with native PSM library over TrueScale. However, when PSM is +supported via the psm2-compat library over PSM2, the same delay can be +observed. + +Now add the same mechanism to the psm provider. 
+ +Signed-off-by: Jianxin Xiong +--- + prov/psm/src/psmx_init.c | 34 ++++++++++++++++++++++++++++++++++ + 1 file changed, 34 insertions(+) + +diff --git a/prov/psm/src/psmx_init.c b/prov/psm/src/psmx_init.c +index 118ef1a81..c4b06160c 100644 +--- a/prov/psm/src/psmx_init.c ++++ b/prov/psm/src/psmx_init.c +@@ -32,10 +32,12 @@ + + #include "psmx.h" + #include "prov.h" ++#include <glob.h> + + static int psmx_init_count = 0; + static int psmx_lib_initialized = 0; + static pthread_mutex_t psmx_lib_mutex; ++static int psmx_compat_lib = 0; + + struct psmx_env psmx_env = { + .name_server = 1, +@@ -103,6 +105,12 @@ static int psmx_init_lib(void) + PSM_VERNO_MAJOR, PSM_VERNO_MINOR, major, minor); + } + ++ if (major > 1) { ++ psmx_compat_lib = 1; ++ FI_INFO(&psmx_prov, FI_LOG_CORE, ++ "PSM is supported via the psm2-compat library over PSM2.\n"); ++ } ++ + psmx_lib_initialized = 1; + + out: +@@ -197,6 +205,32 @@ static int psmx_getinfo(uint32_t version, const char *node, const char *service, + if (psmx_init_lib()) + return -FI_ENODATA; + ++ if (psmx_compat_lib) { ++ /* ++ * native PSM running over TrueScale doesn't have the issue handled ++ * here. it's only present when PSM is supported via the psm2-compat ++ * library, where the PSM functions are just wrappers around the PSM2 ++ * counterparts. ++ * ++ * psm2_ep_num_devunits() may wait for 15 seconds before return ++ * when /dev/hfi1_0 is not present. Check the existence of any hfi1 ++ * device interface first to avoid this delay. Note that the devices ++ * don't necessarily appear consecutively so we need to check all ++ * possible device names before returning "no device found" error. ++ * This also means if "/dev/hfi1_0" doesn't exist but other devices ++ * exist, we are still going to see the delay; but that's a rare case. 
++ */ ++ glob_t glob_buf; ++ ++ if ((glob("/dev/hfi1_[0-9]", 0, NULL, &glob_buf) != 0) && ++ (glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &glob_buf) != 0)) { ++ FI_INFO(&psmx_prov, FI_LOG_CORE, ++ "no hfi1 device is found.\n"); ++ return -FI_ENODATA; ++ } ++ globfree(&glob_buf); ++ } ++ + if (psm_ep_num_devunits(&cnt) || !cnt) { + FI_INFO(&psmx_prov, FI_LOG_CORE, + "no PSM device is found.\n"); diff --git a/libfabric.changes b/libfabric.changes index e8c8e05..f797b9e 100644 --- a/libfabric.changes +++ b/libfabric.changes @@ -1,3 +1,9 @@ +------------------------------------------------------------------- +Mon Oct 9 23:28:31 UTC 2017 - stefan.bruens@rwth-aachen.de + +- Fix github issue #3393: + Add 0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch + ------------------------------------------------------------------- Thu Oct 5 07:10:28 UTC 2017 - nmoreychaisemartin@suse.com diff --git a/libfabric.spec b/libfabric.spec index 009b2a8..24f9fbc 100644 --- a/libfabric.spec +++ b/libfabric.spec @@ -28,6 +28,7 @@ Release: 0 Source: %{name}-%{version}%{git_ver}.tar.bz2 Source1: baselibs.conf Patch0: libfabric-libtool.patch +Patch1: 0001-prov-psm-Eliminate-psm2-compat-library-delay-with-hf.patch Url: http://www.github.com/ofiwg/libfabric Prefix: ${_prefix} BuildRequires: autoconf @@ -71,6 +72,7 @@ services, such as RDMA. This package contains the development files. %prep %setup -q -n %{name}-%{version}%{git_ver} %patch0 -p1 +%patch1 -p1 %build rm -f config/libtool.m4