Accepting request 562237 from home:NMoreyChaisemartin:branches:science:HPC
- Update to rdma-core v16.1: * Backport fixes: * srp_daemon: Don't create async_ev_thread if only run once * srp_daemon: handle SM lid change * srp_daemon: fix CQ handling - Drop srp_daemon-Don-t-create-async_ev_thread-if-only-run-once.patch, srp_daemon-fix-CQ-handling.patch, and srp_daemon-handle-SM-lid-change.patch as they were merged upstream. OBS-URL: https://build.opensuse.org/request/show/562237 OBS-URL: https://build.opensuse.org/package/show/science:HPC/rdma-core?expand=0&rev=72
This commit is contained in:
parent
5e19c671b9
commit
4d67f5d743
2
_service
2
_service
@ -8,7 +8,7 @@
|
||||
<param name="versionformat">@PARENT_TAG@.@TAG_OFFSET@.%h</param>
|
||||
<param name="versionrewrite-pattern">v(.*)</param>
|
||||
<param name="versionrewrite-replacement">\1</param>
|
||||
<param name="revision">bf2450ea9afd7ec10c3f108927e2978e39823d62</param>
|
||||
<param name="revision">9b91e2e5287160025f6fc0b555c8f0debfaf9b12</param>
|
||||
<param name="extract">suse/rdma-core.spec</param>
|
||||
</service>
|
||||
<service name="recompress" mode="disabled">
|
||||
|
@ -1,3 +0,0 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1826eba8aa2311202c72d3410fe7e7c07e434904737c680e3d9b40d0fc50a9a7
|
||||
size 942833
|
3
rdma-core-16.1.0.9b91e2e52871.tar.gz
Normal file
3
rdma-core-16.1.0.9b91e2e52871.tar.gz
Normal file
@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4b83a503e07fefbdec6eefdde9e102b73f2fd34389838ff5e5e5e5c3069a13e8
|
||||
size 1059514
|
@ -1,3 +1,16 @@
|
||||
-------------------------------------------------------------------
|
||||
Thu Jan 4 11:41:20 UTC 2018 - nmoreychaisemartin@suse.com
|
||||
|
||||
- Update to rdma-core v16.1:
|
||||
* Backport fixes:
|
||||
* srp_daemon: Don't create async_ev_thread if only run once
|
||||
* srp_daemon: handle SM lid change
|
||||
* srp_daemon: fix CQ handling
|
||||
- Drop srp_daemon-Don-t-create-async_ev_thread-if-only-run-once.patch,
|
||||
srp_daemon-fix-CQ-handling.patch, and
|
||||
srp_daemon-handle-SM-lid-change.patch as they were merged upstream.
|
||||
|
||||
|
||||
-------------------------------------------------------------------
|
||||
Wed Jan 3 09:17:10 UTC 2018 - nmoreychaisemartin@suse.com
|
||||
|
||||
|
@ -17,9 +17,9 @@
|
||||
|
||||
|
||||
%bcond_without systemd
|
||||
%define git_ver .0.bf2450ea
|
||||
%define git_ver .0.9b91e2e52871
|
||||
Name: rdma-core
|
||||
Version: 16
|
||||
Version: 16.1
|
||||
Release: 0
|
||||
Summary: RDMA core userspace libraries and daemons
|
||||
License: GPL-2.0 or BSD-2-Clause
|
||||
@ -50,9 +50,6 @@ Group: Productivity/Networking/Other
|
||||
Url: https://github.com/linux-rdma/rdma-core
|
||||
Source: rdma-core-%{version}%{git_ver}.tar.gz
|
||||
Source1: baselibs.conf
|
||||
Patch1: srp_daemon-handle-SM-lid-change.patch
|
||||
Patch2: srp_daemon-fix-CQ-handling.patch
|
||||
Patch3: srp_daemon-Don-t-create-async_ev_thread-if-only-run-once.patch
|
||||
BuildRequires: binutils
|
||||
BuildRequires: cmake >= 2.8.11
|
||||
BuildRequires: gcc
|
||||
@ -328,9 +325,6 @@ on those changes.
|
||||
|
||||
%prep
|
||||
%setup -q -n %{name}-%{version}%{git_ver}
|
||||
%patch1
|
||||
%patch2
|
||||
%patch3
|
||||
|
||||
%build
|
||||
|
||||
|
@ -1,40 +0,0 @@
|
||||
commit b1a51eeee28c14dbba332cf59a0e85a182374ed6
|
||||
Author: Honggang Li <honli@redhat.com>
|
||||
Date: Wed Dec 20 03:09:58 2017 +0800
|
||||
|
||||
srp_daemon: Don't create async_ev_thread if only run once
|
||||
|
||||
fd3005f0cd34 moves the signal handler setup from ibsrpdm path. So,
|
||||
default signal handler will be used when the main pthread sends signal
|
||||
SIGINT to pthread async_ev_thread. ibsrpdm will exit with non-zero
|
||||
exit code as default signal handler killed it. ibsrpdm should return
|
||||
with exit code zero, if no error emerged.
|
||||
|
||||
We should not create async_ev_thread for ibsrpdm.
|
||||
|
||||
Fixes: fd3005f0cd34 ("srp_daemon: Move the setup of the wakeup_pipe after openlog")
|
||||
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
|
||||
Signed-off-by: Honggang Li <honli@redhat.com>
|
||||
|
||||
diff --git srp_daemon/srp_daemon.c srp_daemon/srp_daemon.c
|
||||
index 36df5c3bfe79..a7e7807774c5 100644
|
||||
--- srp_daemon/srp_daemon.c
|
||||
+++ srp_daemon/srp_daemon.c
|
||||
@@ -1945,12 +1945,12 @@ static struct resources *alloc_res(void)
|
||||
run_thread_get_trap_notices, &res->res);
|
||||
if (ret)
|
||||
goto err;
|
||||
- }
|
||||
|
||||
- ret = pthread_create(&res->res.async_ev_thread, NULL,
|
||||
- run_thread_listen_to_events, &res->res);
|
||||
- if (ret)
|
||||
- goto err;
|
||||
+ ret = pthread_create(&res->res.async_ev_thread, NULL,
|
||||
+ run_thread_listen_to_events, &res->res);
|
||||
+ if (ret)
|
||||
+ goto err;
|
||||
+ }
|
||||
|
||||
if (config->retry_timeout && !config->once) {
|
||||
ret = pthread_create(&res->res.reconnect_thread, NULL,
|
@ -1,105 +0,0 @@
|
||||
commit c1c584c34d249987d7c36ff061bc5f2eedec38fe
|
||||
Author: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.com>
|
||||
Date: Mon Dec 11 15:37:28 2017 +0100
|
||||
|
||||
srp_daemon: fix CQ handling
|
||||
|
||||
SM traps are polled through poll_cq which waited for a CQ event
|
||||
before polling the CQ itself.
|
||||
However it may happen that multiple completions are attached
|
||||
to a single event. As stated by the ibv_get_cq_event man page,
|
||||
it is required to poll the CQ to get those completions
|
||||
after the call to ibv_req_notify_cq.
|
||||
|
||||
As completions need to be handled one by one in an outer function,
|
||||
start by polling the CQ and return the completion (if any) before
|
||||
waiting for the next completion event.
|
||||
This will allow emptying all pending completions, through multiple calls
|
||||
to poll_cq, before waiting for a new event.
|
||||
|
||||
The buggy use case seems to appear when the master SM is switched multiple
|
||||
times between two nodes. As the number of ping-pong between the SMs increases,
|
||||
the number of traps sent to notify that the SM just became master increases
|
||||
too. This causes a burst of completions linked to a single event.
|
||||
Note that the race condition is also possible in other scenarios.
|
||||
|
||||
Signed-off-by: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.com>
|
||||
Cc: stable@linux-rdma.org # v14, v15, v16
|
||||
|
||||
diff --git srp_daemon/srp_handle_traps.c srp_daemon/srp_handle_traps.c
|
||||
index 25f2b9ab..77a47db3 100644
|
||||
--- srp_daemon/srp_handle_traps.c
|
||||
+++ srp_daemon/srp_handle_traps.c
|
||||
@@ -496,6 +496,34 @@ static int stop_threads(struct sync_resources *sync_res)
|
||||
return result;
|
||||
}
|
||||
|
||||
+/*****************************************************************************
|
||||
+* Function: poll_cq_once
|
||||
+* Poll a CQ once.
|
||||
+* Returns the number of completion polled (0 or 1).
|
||||
+* Returns a negative value on error.
|
||||
+*****************************************************************************/
|
||||
+static int poll_cq_once(struct sync_resources *sync_res, struct ibv_cq *cq,
|
||||
+ struct ibv_wc *wc)
|
||||
+{
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = ibv_poll_cq(cq, 1, wc);
|
||||
+ if (ret < 0) {
|
||||
+ pr_err("poll CQ failed\n");
|
||||
+ return ret;
|
||||
+ }
|
||||
+
|
||||
+ if (ret > 0 && wc->status != IBV_WC_SUCCESS) {
|
||||
+ if (!stop_threads(sync_res))
|
||||
+ pr_err("got bad completion with status: 0x%x\n",
|
||||
+ wc->status);
|
||||
+ return -ret;
|
||||
+ }
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+
|
||||
static int poll_cq(struct sync_resources *sync_res, struct ibv_cq *cq,
|
||||
struct ibv_wc *wc, struct ibv_comp_channel *channel)
|
||||
{
|
||||
@@ -504,6 +532,16 @@ static int poll_cq(struct sync_resources *sync_res, struct ibv_cq *cq,
|
||||
void *ev_ctx;
|
||||
|
||||
if (channel) {
|
||||
+ /* There may be extra completions that
|
||||
+ * were associated to the previous event.
|
||||
+ * Only poll for the first one. If there are more than one,
|
||||
+ * they will be handled by later call to poll_cq */
|
||||
+ ret = poll_cq_once(sync_res, cq, wc);
|
||||
+ /* return directly if there was an error or
|
||||
+ * 1 completion polled */
|
||||
+ if (ret)
|
||||
+ return ret;
|
||||
+
|
||||
if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) {
|
||||
pr_err("Failed to get cq_event\n");
|
||||
return -1;
|
||||
@@ -524,18 +562,9 @@ static int poll_cq(struct sync_resources *sync_res, struct ibv_cq *cq,
|
||||
}
|
||||
|
||||
do {
|
||||
- ret = ibv_poll_cq(cq, 1, wc);
|
||||
- if (ret < 0) {
|
||||
- pr_err("poll CQ failed\n");
|
||||
+ ret = poll_cq_once(sync_res, cq, wc);
|
||||
+ if (ret < 0)
|
||||
return ret;
|
||||
- }
|
||||
-
|
||||
- if (ret > 0 && wc->status != IBV_WC_SUCCESS) {
|
||||
- if (!stop_threads(sync_res))
|
||||
- pr_err("got bad completion with status: 0x%x\n",
|
||||
- wc->status);
|
||||
- return -ret;
|
||||
- }
|
||||
|
||||
if (ret == 0 && channel) {
|
||||
pr_err("Weird poll returned no cqe after CQ event\n");
|
@ -1,100 +0,0 @@
|
||||
commit 2fbc501061218e7df8e37bb2df6db73e00005e9b
|
||||
Author: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.com>
|
||||
Date: Mon Dec 4 15:15:55 2017 +0100
|
||||
|
||||
srp_daemon: handle SM lid change
|
||||
|
||||
When srp_daemon was running and the master SM host changes,
|
||||
srp_daemon outputs these errors at every scan:
|
||||
srp_daemon[25394]: No response to inform info registration
|
||||
srp_daemon[25394]: Fail to register to traps, maybe there is no opensm
|
||||
running on fabric or IB port is down
|
||||
|
||||
This was introduced by commit 4952e5f Fix a memory leak.
|
||||
A side effect of this patch was that create_ah was only called when the
|
||||
port lid changes. Which meant register_to_traps used an older, obsolete,
|
||||
version of sm_lid and failed to connect to it.
|
||||
|
||||
This patch fixes this behaviour by checking for both local lid changes and
|
||||
SM lid changes, and calling create_ah on any of these events.
|
||||
|
||||
Fixes: 4952e5f7 (Fix a memory leak)
|
||||
Signed-off-by: Nicolas Morey-Chaisemartin <NMoreyChaisemartin@suse.com>
|
||||
Cc: stable@linux-rdma.org # v14, v15, v16
|
||||
|
||||
diff --git srp_daemon/srp_daemon.c srp_daemon/srp_daemon.c
|
||||
index 2465ccd9..36df5c3b 100644
|
||||
--- srp_daemon/srp_daemon.c
|
||||
+++ srp_daemon/srp_daemon.c
|
||||
@@ -1103,7 +1103,7 @@ static int get_shared_pkeys(struct resources *res,
|
||||
int i, num_pkeys = 0;
|
||||
uint16_t pkey;
|
||||
uint16_t local_port_lid = get_port_lid(res->ud_res->ib_ctx,
|
||||
- config->port_num);
|
||||
+ config->port_num, NULL);
|
||||
|
||||
in_mad_buf = malloc(sizeof(struct ib_user_mad) +
|
||||
node_table_response_size);
|
||||
@@ -2092,7 +2092,7 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
int ret;
|
||||
struct resources *res;
|
||||
- uint16_t lid;
|
||||
+ uint16_t lid, sm_lid;
|
||||
uint16_t pkey;
|
||||
union umad_gid gid;
|
||||
struct target_details *target;
|
||||
@@ -2196,8 +2196,10 @@ catas_start:
|
||||
|
||||
pr_debug("Starting a recalculation\n");
|
||||
port_lid = get_port_lid(res->ud_res->ib_ctx,
|
||||
- config->port_num);
|
||||
- if (port_lid != res->ud_res->port_attr.lid) {
|
||||
+ config->port_num, &sm_lid);
|
||||
+ if (port_lid != res->ud_res->port_attr.lid ||
|
||||
+ sm_lid != res->ud_res->port_attr.sm_lid) {
|
||||
+
|
||||
if (res->ud_res->ah) {
|
||||
ibv_destroy_ah(res->ud_res->ah);
|
||||
res->ud_res->ah = NULL;
|
||||
diff --git srp_daemon/srp_daemon.h srp_daemon/srp_daemon.h
|
||||
index 5d268ed3..864b3d42 100644
|
||||
--- srp_daemon/srp_daemon.h
|
||||
+++ srp_daemon/srp_daemon.h
|
||||
@@ -299,7 +299,7 @@ void *run_thread_listen_to_events(void *res_in);
|
||||
int get_node(struct umad_resources *umad_res, uint16_t dlid, uint64_t *guid);
|
||||
int create_trap_resources(struct ud_resources *ud_res);
|
||||
int register_to_traps(struct resources *res, int subscribe);
|
||||
-uint16_t get_port_lid(struct ibv_context *ib_ctx, int port_num);
|
||||
+uint16_t get_port_lid(struct ibv_context *ib_ctx, int port_num, uint16_t *sm_lid);
|
||||
int create_ah(struct ud_resources *ud_res);
|
||||
void push_gid_to_list(struct sync_resources *res, union umad_gid *gid,
|
||||
uint16_t pkey);
|
||||
diff --git srp_daemon/srp_handle_traps.c srp_daemon/srp_handle_traps.c
|
||||
index 6d94634e..25f2b9ab 100644
|
||||
--- srp_daemon/srp_handle_traps.c
|
||||
+++ srp_daemon/srp_handle_traps.c
|
||||
@@ -340,12 +340,20 @@ int ud_resources_create(struct ud_resources *res)
|
||||
return 0;
|
||||
}
|
||||
|
||||
-uint16_t get_port_lid(struct ibv_context *ib_ctx, int port_num)
|
||||
+uint16_t get_port_lid(struct ibv_context *ib_ctx, int port_num, uint16_t *sm_lid)
|
||||
{
|
||||
struct ibv_port_attr port_attr;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = ibv_query_port(ib_ctx, port_num, &port_attr);
|
||||
|
||||
- return ibv_query_port(ib_ctx, port_num, &port_attr) == 0 ?
|
||||
- port_attr.lid : 0;
|
||||
+ if (!ret) {
|
||||
+ if (sm_lid)
|
||||
+ *sm_lid = port_attr.sm_lid;
|
||||
+ return port_attr.lid;
|
||||
+ }
|
||||
+
|
||||
+ return 0;
|
||||
}
|
||||
|
||||
int create_ah(struct ud_resources *ud_res)
|
Loading…
x
Reference in New Issue
Block a user