From 6f2036b23fd25d7c27a108c4df82e650d859a1aa69999a86bfaea006acd807e9 Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Wed, 13 Dec 2017 16:09:08 +0000 Subject: [PATCH] Accepting request 556717 from home:vuntz:branches:network:messaging:amqp - Add ocf-pull-request-63.patch and ocf-pull-request-64.patch: fixes to avoid moving master unnecessarily, and to make start notification handler more reliable. - Add ocf-pull-request-66.patch: do not consider transient local failures as failures of remote nodes. OBS-URL: https://build.opensuse.org/request/show/556717 OBS-URL: https://build.opensuse.org/package/show/network:messaging:amqp/rabbitmq-server?expand=0&rev=84 --- ocf-pull-request-63.patch | 29 ++++++++++++ ocf-pull-request-64.patch | 98 +++++++++++++++++++++++++++++++++++++++ ocf-pull-request-66.patch | 50 ++++++++++++++++++++ rabbitmq-server.changes | 9 ++++ rabbitmq-server.spec | 6 +++ 5 files changed, 192 insertions(+) create mode 100644 ocf-pull-request-63.patch create mode 100644 ocf-pull-request-64.patch create mode 100644 ocf-pull-request-66.patch diff --git a/ocf-pull-request-63.patch b/ocf-pull-request-63.patch new file mode 100644 index 0000000..4566fa8 --- /dev/null +++ b/ocf-pull-request-63.patch @@ -0,0 +1,29 @@ +From 62a4f7561171328cd1d62cab394d0bba269ea7ad Mon Sep 17 00:00:00 2001 +From: Vincent Untz +Date: Fri, 8 Dec 2017 13:32:45 +0100 +Subject: [PATCH] OCF RA: Avoid promoting nodes with same start time as master + +It may happen that two nodes have the same start time, and one of these +is the master. When this happens, the node actually gets the same score +as the master and can get promoted. There's no reason to avoid being +stable here, so let's keep the same master in that scenario. 
+--- + scripts/rabbitmq-server-ha.ocf | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf +index 87bb7d4..da6aee6 100755 +--- a/scripts/rabbitmq-server-ha.ocf ++++ b/scripts/rabbitmq-server-ha.ocf +@@ -1608,6 +1608,11 @@ get_monitor() { + ocf_log info "${LH} comparing us (start time: $our_start_time, score: $new_score) with $node (start time: $node_start_time, score: $node_score)" + if [ $node_start_time -ne 0 -a $node_score -ne 0 -a $node_start_time -lt $our_start_time ]; then + new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) ++ elif [ $node_start_time -ne 0 -a $node_score -ne 0 -a $node_start_time -eq $our_start_time ]; then ++ # Do not get promoted if the other node is already master and we have the same start time ++ if is_master $node; then ++ new_score=$((node_score - 10 < new_score ? node_score - 10 : new_score )) ++ fi + fi + done + fi diff --git a/ocf-pull-request-64.patch b/ocf-pull-request-64.patch new file mode 100644 index 0000000..b91d774 --- /dev/null +++ b/ocf-pull-request-64.patch @@ -0,0 +1,98 @@ +From a8e7a62513567b7beab895115d88f57257d21856 Mon Sep 17 00:00:00 2001 +From: Vincent Untz +Date: Fri, 8 Dec 2017 14:13:59 +0100 +Subject: [PATCH 1/3] OCF RA: Fix test for no node in start notification + handler + +If there's nothing starting and nothing active, then we do a -z " ", +which doesn't have the same result as -z "". Instead, just test for +emptiness for each set of nodes. 
+--- + scripts/rabbitmq-server-ha.ocf | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf +index 87bb7d4..8a82ac7 100755 +--- a/scripts/rabbitmq-server-ha.ocf ++++ b/scripts/rabbitmq-server-ha.ocf +@@ -2185,7 +2185,7 @@ action_notify() { + local nodes_list="${OCF_RESKEY_CRM_meta_notify_start_uname} ${OCF_RESKEY_CRM_meta_notify_active_uname}" + # Do nothing, if the list of nodes being started or running reported empty + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic +- if [ -z "${nodes_list}" ] ; then ++ if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" -a -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then + ocf_log warn "${LH} I'm a last man standing and I must survive!" + ocf_log info "${LH} post-start end." + return $OCF_SUCCESS + +From 2f284bf595dbbe1938a1ce3028b0299b1a75a6cc Mon Sep 17 00:00:00 2001 +From: Vincent Untz +Date: Fri, 8 Dec 2017 14:15:24 +0100 +Subject: [PATCH 2/3] OCF RA: Do not start rabbitmq if notification of start is + not about us + +Right now, every time we get a start notification, all nodes will ensure +the rabbitmq app is started. This makes little sense, as nodes that are +already active don't need to do that. + +On top of that, this had the side effect of updating the start time for +each of these nodes, which could result in the master moving to another +node. +--- + scripts/rabbitmq-server-ha.ocf | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf +index 8a82ac7..5d65061 100755 +--- a/scripts/rabbitmq-server-ha.ocf ++++ b/scripts/rabbitmq-server-ha.ocf +@@ -2182,7 +2182,6 @@ action_notify() { + ;; + start) + ocf_log info "${LH} post-start begin." 
+- local nodes_list="${OCF_RESKEY_CRM_meta_notify_start_uname} ${OCF_RESKEY_CRM_meta_notify_active_uname}" + # Do nothing, if the list of nodes being started or running reported empty + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic + if [ -z "${OCF_RESKEY_CRM_meta_notify_start_uname}" -a -z "${OCF_RESKEY_CRM_meta_notify_active_uname}" ] ; then +@@ -2191,7 +2190,7 @@ action_notify() { + return $OCF_SUCCESS + fi + # check did this event from this host +- my_host "${nodes_list}" ++ my_host "${OCF_RESKEY_CRM_meta_notify_start_uname}" + rc=$? + # Do nothing, if there is no master reported + # Delegate recovery, if needed, to the "running out of the cluster" monitor's logic + +From a6dc3f91b0c1038927567cbdce9f6a9538904075 Mon Sep 17 00:00:00 2001 +From: Vincent Untz +Date: Fri, 8 Dec 2017 14:17:38 +0100 +Subject: [PATCH 3/3] OCF RA: Fix logging in start notification handler + +The "post-start end" log message was written too early (some things were +still done afterwards), and not in all cases (it was inside an if +statement). +--- + scripts/rabbitmq-server-ha.ocf | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf +index 5d65061..33f0f56 100755 +--- a/scripts/rabbitmq-server-ha.ocf ++++ b/scripts/rabbitmq-server-ha.ocf +@@ -2217,7 +2217,6 @@ action_notify() { + rc2=$? + update_rabbit_start_time_if_rc $rc2 + fi +- ocf_log info "${LH} post-start end." + if [ -s "${OCF_RESKEY_definitions_dump_file}" ] ; then + ocf_log info "File ${OCF_RESKEY_definitions_dump_file} exists" + ocf_run curl --silent --show-error --request POST --user $OCF_RESKEY_admin_user:$OCF_RESKEY_admin_password $OCF_RESKEY_host_ip:15672/api/definitions --header "Content-Type:application/json" --data @$OCF_RESKEY_definitions_dump_file +@@ -2234,6 +2233,7 @@ action_notify() { + return $OCF_ERR_GENERIC + fi + fi ++ ocf_log info "${LH} post-start end." 
+ ;; + stop) + # if rabbitmq-server stops on any another node, we should remove it from cluster (as ordinary operation) diff --git a/ocf-pull-request-66.patch b/ocf-pull-request-66.patch new file mode 100644 index 0000000..15bcb35 --- /dev/null +++ b/ocf-pull-request-66.patch @@ -0,0 +1,50 @@ +From 21d14dbe7389c2d0cc8778476ba5c71ad5ad4406 Mon Sep 17 00:00:00 2001 +From: Vincent Untz +Date: Wed, 13 Dec 2017 12:34:31 +0100 +Subject: [PATCH] OCF RA: Do not consider local failures as remote node + problems + +In is_clustered_with(), commands that we run to check if the node is +clustered with us, or partitioned with us may fail. When they fail, it +actually doesn't tell us anything about the remote node. + +Until now, we were considering such failures as hints that the remote +node is not in a sane state with us. But doing so has pretty negative +impact, as it can cause rabbitmq to get restarted on the remote node, +causing quite some disruption. + +So instead of doing this, ignore the error (it's still logged). + +There was a comment in the code wondering what is the best behavior; +based on experience, I think preferring stability is the slightly more +acceptable poison between the two options. +--- + scripts/rabbitmq-server-ha.ocf | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf +index 87bb7d4..bc6a538 100755 +--- a/scripts/rabbitmq-server-ha.ocf ++++ b/scripts/rabbitmq-server-ha.ocf +@@ -870,8 +870,8 @@ is_clustered_with() + rc=$? + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} Failed to check whether '$node_name' is considered running by us" +- # XXX Or should we give remote node benefit of a doubt? 
+- return 1 ++ # We had a transient local error; that doesn't mean the remote node is ++ # not part of the cluster, so ignore this + elif [ "$seen_as_running" != true ]; then + ocf_log info "${LH} Node $node_name is not running, considering it not clustered with us" + return 1 +@@ -882,8 +882,8 @@ is_clustered_with() + rc=$? + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} Failed to check whether '$node_name' is partitioned with us" +- # XXX Or should we give remote node benefit of a doubt? +- return 1 ++ # We had a transient local error; that doesn't mean the remote node is ++ # not partitioned with us, so ignore this + elif [ "$seen_as_partitioned" != false ]; then + ocf_log info "${LH} Node $node_name is partitioned from us" + return 1 diff --git a/rabbitmq-server.changes b/rabbitmq-server.changes index 424fe4e..594d50e 100644 --- a/rabbitmq-server.changes +++ b/rabbitmq-server.changes @@ -1,3 +1,12 @@ +------------------------------------------------------------------- +Wed Dec 13 12:13:03 UTC 2017 - vuntz@suse.com + +- Add ocf-pull-request-63.patch and ocf-pull-request-64.patch: + fixes to avoid moving master unnecessarily, and to make start + notification handler more reliable. +- Add ocf-pull-request-66.patch: do not consider transient local + failures as failures of remote nodes. + ------------------------------------------------------------------- Thu Nov 23 13:53:44 UTC 2017 - rbrown@suse.com diff --git a/rabbitmq-server.spec b/rabbitmq-server.spec index 5c8202e..debc702 100644 --- a/rabbitmq-server.spec +++ b/rabbitmq-server.spec @@ -50,6 +50,9 @@ Source6: rabbitmq-server.service Source7: rabbitmq-server.tmpfiles.d.conf Source8: README.SUSE Source9: rabbitmq.config.example +Patch0: ocf-pull-request-63.patch +Patch1: ocf-pull-request-64.patch +Patch2: ocf-pull-request-66.patch BuildRequires: erlang BuildRequires: erlang-src BuildRequires: fdupes @@ -115,6 +118,9 @@ This package includes the RabbitMQ AMQP language bindings for Erlang. 
%prep %setup -q cp %{SOURCE8} . +%patch0 -p1 +%patch1 -p1 +%patch2 -p1 %build make all %{_make_args} %{?_smp_mflags}