Accepting request 556717 from home:vuntz:branches:network:messaging:amqp
- Add ocf-pull-request-63.patch and ocf-pull-request-64.patch: fixes to avoid moving master unnecessarily, and to make start notification handler more reliable. - Add ocf-pull-request-66.patch: do not consider transient local failures as failures of remote nodes. OBS-URL: https://build.opensuse.org/request/show/556717 OBS-URL: https://build.opensuse.org/package/show/network:messaging:amqp/rabbitmq-server?expand=0&rev=84
This commit is contained in:
parent
56a05a9da9
commit
6f2036b23f
29
ocf-pull-request-63.patch
Normal file
29
ocf-pull-request-63.patch
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
From 62a4f7561171328cd1d62cab394d0bba269ea7ad Mon Sep 17 00:00:00 2001
|
||||||
|
From: Vincent Untz <vuntz@suse.com>
|
||||||
|
Date: Fri, 8 Dec 2017 13:32:45 +0100
|
||||||
|
Subject: [PATCH] OCF RA: Avoid promoting nodes with same start time as master
|
||||||
|
|
||||||
|
It may happen that two nodes have the same start time, and one of these
|
||||||
|
is the master. When this happens, the node actually gets the same score
|
||||||
|
as the master and can get promoted. There's no reason to avoid being
|
||||||
|
stable here, so let's keep the same master in that scenario.
|
||||||
|
---
|
||||||
|
scripts/rabbitmq-server-ha.ocf | 5 +++++
|
||||||
|
1 file changed, 5 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
index 87bb7d4..da6aee6 100755
|
||||||
|
--- a/scripts/rabbitmq-server-ha.ocf
|
||||||
|
+++ b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
@@ -1608,6 +1608,11 @@ get_monitor() {
|
||||||
|
ocf_log info "${LH} comparing us (start time: $our_start_time, score: $new_score) with $node (start time: $node_start_time, score: $node_score)"
|
||||||
|
if [ $node_start_time -ne 0 -a $node_score -ne 0 -a $node_start_time -lt $our_start_time ]; then
|
||||||
|
new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score ))
|
||||||
|
+ elif [ $node_start_time -ne 0 -a $node_score -ne 0 -a $node_start_time -eq $our_start_time ]; then
|
||||||
|
+ # Do not get promoted if the other node is already master and we have the same start time
|
||||||
|
+ if is_master $node; then
|
||||||
|
+ new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score ))
|
||||||
|
+ fi
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
98
ocf-pull-request-64.patch
Normal file
98
ocf-pull-request-64.patch
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
From a8e7a62513567b7beab895115d88f57257d21856 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Vincent Untz <vuntz@suse.com>
|
||||||
|
Date: Fri, 8 Dec 2017 14:13:59 +0100
|
||||||
|
Subject: [PATCH 1/3] OCF RA: Fix test for no node in start notification
|
||||||
|
handler
|
||||||
|
|
||||||
|
If there's nothing starting and nothing active, then we do a -z " ",
|
||||||
|
which doesn't have the same result as -z "". Instead, just test for
|
||||||
|
emptiness for each set of nodes.
|
||||||
|
---
|
||||||
|
scripts/rabbitmq-server-ha.ocf | 2 +-
|
||||||
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
index 87bb7d4..8a82ac7 100755
|
||||||
|
--- a/scripts/rabbitmq-server-ha.ocf
|
||||||
|
+++ b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
@@ -2185,7 +2185,7 @@ action_notify() {
|
||||||
|
local nodes_list="${OCF_RESKEY_CRM_meta_notify_start_uname} ${OCF_RESKEY_CRM_meta_notify_active_uname}"
|
||||||
|
# Do nothing, if the list of nodes being started or running reported empty
|
||||||
|
# Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
|
||||||
|
- if [ -z "${nodes_list}" ] ; then
|
||||||
|
+ if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" -a -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then
|
||||||
|
ocf_log warn "${LH} I'm a last man standing and I must survive!"
|
||||||
|
ocf_log info "${LH} post-start end."
|
||||||
|
return $OCF_SUCCESS
|
||||||
|
|
||||||
|
From 2f284bf595dbbe1938a1ce3028b0299b1a75a6cc Mon Sep 17 00:00:00 2001
|
||||||
|
From: Vincent Untz <vuntz@suse.com>
|
||||||
|
Date: Fri, 8 Dec 2017 14:15:24 +0100
|
||||||
|
Subject: [PATCH 2/3] OCF RA: Do not start rabbitmq if notification of start is
|
||||||
|
not about us
|
||||||
|
|
||||||
|
Right now, every time we get a start notification, all nodes will ensure
|
||||||
|
the rabbitmq app is started. This makes little sense, as nodes that are
|
||||||
|
already active don't need to do that.
|
||||||
|
|
||||||
|
On top of that, this had the side effect of updating the start time for
|
||||||
|
each of these nodes, which could result in the master moving to another
|
||||||
|
node.
|
||||||
|
---
|
||||||
|
scripts/rabbitmq-server-ha.ocf | 3 +--
|
||||||
|
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
index 8a82ac7..5d65061 100755
|
||||||
|
--- a/scripts/rabbitmq-server-ha.ocf
|
||||||
|
+++ b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
@@ -2182,7 +2182,6 @@ action_notify() {
|
||||||
|
;;
|
||||||
|
start)
|
||||||
|
ocf_log info "${LH} post-start begin."
|
||||||
|
- local nodes_list="${OCF_RESKEY_CRM_meta_notify_start_uname} ${OCF_RESKEY_CRM_meta_notify_active_uname}"
|
||||||
|
# Do nothing, if the list of nodes being started or running reported empty
|
||||||
|
# Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
|
||||||
|
if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" -a -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then
|
||||||
|
@@ -2191,7 +2190,7 @@ action_notify() {
|
||||||
|
return $OCF_SUCCESS
|
||||||
|
fi
|
||||||
|
# check did this event from this host
|
||||||
|
- my_host "${nodes_list}"
|
||||||
|
+ my_host "${OCF_RESKEY_CRM_meta_notify_start_uname}"
|
||||||
|
rc=$?
|
||||||
|
# Do nothing, if there is no master reported
|
||||||
|
# Delegate recovery, if needed, to the "running out of the cluster" monitor's logic
|
||||||
|
|
||||||
|
From a6dc3f91b0c1038927567cbdce9f6a9538904075 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Vincent Untz <vuntz@suse.com>
|
||||||
|
Date: Fri, 8 Dec 2017 14:17:38 +0100
|
||||||
|
Subject: [PATCH 3/3] OCF RA: Fix logging in start notification handler
|
||||||
|
|
||||||
|
The "post-start end" log message was written too early (some things were
|
||||||
|
still done afterwards), and not in all cases (it was inside an if
|
||||||
|
statement).
|
||||||
|
---
|
||||||
|
scripts/rabbitmq-server-ha.ocf | 2 +-
|
||||||
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
index 5d65061..33f0f56 100755
|
||||||
|
--- a/scripts/rabbitmq-server-ha.ocf
|
||||||
|
+++ b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
@@ -2217,7 +2217,6 @@ action_notify() {
|
||||||
|
rc2=$?
|
||||||
|
update_rabbit_start_time_if_rc $rc2
|
||||||
|
fi
|
||||||
|
- ocf_log info "${LH} post-start end."
|
||||||
|
if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then
|
||||||
|
ocf_log info "File ${OCF_RESKEY_definitions_dump_file} exists"
|
||||||
|
ocf_run curl --silent --show-error --request POST --user $OCF_RESKEY_admin_user:$OCF_RESKEY_admin_password $OCF_RESKEY_host_ip:15672/api/definitions --header "Content-Type:application/json" --data @$OCF_RESKEY_definitions_dump_file
|
||||||
|
@@ -2234,6 +2233,7 @@ action_notify() {
|
||||||
|
return $OCF_ERR_GENERIC
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
+ ocf_log info "${LH} post-start end."
|
||||||
|
;;
|
||||||
|
stop)
|
||||||
|
# if rabbitmq-server stops on any another node, we should remove it from cluster (as ordinary operation)
|
50
ocf-pull-request-66.patch
Normal file
50
ocf-pull-request-66.patch
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
From 21d14dbe7389c2d0cc8778476ba5c71ad5ad4406 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Vincent Untz <vuntz@suse.com>
|
||||||
|
Date: Wed, 13 Dec 2017 12:34:31 +0100
|
||||||
|
Subject: [PATCH] OCF RA: Do not consider local failures as remote node
|
||||||
|
problems
|
||||||
|
|
||||||
|
In is_clustered_with(), commands that we run to check if the node is
|
||||||
|
clustered with us, or partitioned with us may fail. When they fail, it
|
||||||
|
actually doesn't tell us anything about the remote node.
|
||||||
|
|
||||||
|
Until now, we were considering such failures as hints that the remote
|
||||||
|
node is not in a sane state with us. But doing so has a pretty negative
|
||||||
|
impact, as it can cause rabbitmq to get restarted on the remote node,
|
||||||
|
causing quite some disruption.
|
||||||
|
|
||||||
|
So instead of doing this, ignore the error (it's still logged).
|
||||||
|
|
||||||
|
There was a comment in the code wondering what is the best behavior;
|
||||||
|
based on experience, I think preferring stability is the slightly more
|
||||||
|
acceptable poison between the two options.
|
||||||
|
---
|
||||||
|
scripts/rabbitmq-server-ha.ocf | 8 ++++----
|
||||||
|
1 file changed, 4 insertions(+), 4 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
index 87bb7d4..bc6a538 100755
|
||||||
|
--- a/scripts/rabbitmq-server-ha.ocf
|
||||||
|
+++ b/scripts/rabbitmq-server-ha.ocf
|
||||||
|
@@ -870,8 +870,8 @@ is_clustered_with()
|
||||||
|
rc=$?
|
||||||
|
if [ "$rc" -ne 0 ]; then
|
||||||
|
ocf_log err "${LH} Failed to check whether '$node_name' is considered running by us"
|
||||||
|
- # XXX Or should we give remote node benefit of a doubt?
|
||||||
|
- return 1
|
||||||
|
+ # We had a transient local error; that doesn't mean the remote node is
|
||||||
|
+ # not part of the cluster, so ignore this
|
||||||
|
elif [ "$seen_as_running" != true ]; then
|
||||||
|
ocf_log info "${LH} Node $node_name is not running, considering it not clustered with us"
|
||||||
|
return 1
|
||||||
|
@@ -882,8 +882,8 @@ is_clustered_with()
|
||||||
|
rc=$?
|
||||||
|
if [ "$rc" -ne 0 ]; then
|
||||||
|
ocf_log err "${LH} Failed to check whether '$node_name' is partitioned with us"
|
||||||
|
- # XXX Or should we give remote node benefit of a doubt?
|
||||||
|
- return 1
|
||||||
|
+ # We had a transient local error; that doesn't mean the remote node is
|
||||||
|
+ # not partitioned with us, so ignore this
|
||||||
|
elif [ "$seen_as_partitioned" != false ]; then
|
||||||
|
ocf_log info "${LH} Node $node_name is partitioned from us"
|
||||||
|
return 1
|
@ -1,3 +1,12 @@
|
|||||||
|
-------------------------------------------------------------------
|
||||||
|
Wed Dec 13 12:13:03 UTC 2017 - vuntz@suse.com
|
||||||
|
|
||||||
|
- Add ocf-pull-request-63.patch and ocf-pull-request-64.patch:
|
||||||
|
fixes to avoid moving master unnecessarily, and to make start
|
||||||
|
notification handler more reliable.
|
||||||
|
- Add ocf-pull-request-66.patch: do not consider transient local
|
||||||
|
failures as failures of remote nodes.
|
||||||
|
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
Thu Nov 23 13:53:44 UTC 2017 - rbrown@suse.com
|
Thu Nov 23 13:53:44 UTC 2017 - rbrown@suse.com
|
||||||
|
|
||||||
|
@ -50,6 +50,9 @@ Source6: rabbitmq-server.service
|
|||||||
Source7: rabbitmq-server.tmpfiles.d.conf
|
Source7: rabbitmq-server.tmpfiles.d.conf
|
||||||
Source8: README.SUSE
|
Source8: README.SUSE
|
||||||
Source9: rabbitmq.config.example
|
Source9: rabbitmq.config.example
|
||||||
|
Patch0: ocf-pull-request-63.patch
|
||||||
|
Patch1: ocf-pull-request-64.patch
|
||||||
|
Patch2: ocf-pull-request-66.patch
|
||||||
BuildRequires: erlang
|
BuildRequires: erlang
|
||||||
BuildRequires: erlang-src
|
BuildRequires: erlang-src
|
||||||
BuildRequires: fdupes
|
BuildRequires: fdupes
|
||||||
@ -115,6 +118,9 @@ This package includes the RabbitMQ AMQP language bindings for Erlang.
|
|||||||
%prep
|
%prep
|
||||||
%setup -q
|
%setup -q
|
||||||
cp %{SOURCE8} .
|
cp %{SOURCE8} .
|
||||||
|
%patch0 -p1
|
||||||
|
%patch1 -p1
|
||||||
|
%patch2 -p1
|
||||||
|
|
||||||
%build
|
%build
|
||||||
make all %{_make_args} %{?_smp_mflags}
|
make all %{_make_args} %{?_smp_mflags}
|
||||||
|
Loading…
Reference in New Issue
Block a user