drbd-utils/0011-drbd.ocf-explicitly-timeout-crm_master-IPC-early.patch

116 lines
3.2 KiB
Diff

From 077a313e9fcacf0b12a35e8094bfdba6ac8aa0d4 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Fri, 6 Dec 2024 15:31:01 +0100
Subject: [PATCH 11/12] drbd.ocf: explicitly timeout crm_master IPC early
Some environments are notorious for timing out in crm_master
one out of ten thousand times. You don't want to know the details.
That would then cause a timeout on the monitor action,
and pacemaker would feel the need to "recover" from "resource failure",
spuriously restarting everything that depends on the DRBD resources.
If we fail to update the "master score" in time,
we can still report the operation result.
The next monitor action will happen,
and we get another attempt at updating the master score.
If that update was relevant, worst case it is now delayed.
Better than the previous worst case, operation timeout
interpreted as resource failure.
---
scripts/drbd.ocf | 44 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 39 insertions(+), 5 deletions(-)
diff --git a/scripts/drbd.ocf b/scripts/drbd.ocf
index 1d051baa550d..c31858343ba4 100755
--- a/scripts/drbd.ocf
+++ b/scripts/drbd.ocf
@@ -53,6 +53,7 @@
# OCF_RESKEY_CRM_meta_master_max
# OCF_RESKEY_CRM_meta_master_node_max
#
+# OCF_RESKEY_CRM_meta_timeout
# OCF_RESKEY_CRM_meta_interval
#
# OCF_RESKEY_CRM_meta_notify
@@ -515,22 +516,55 @@ if $USE_DEBUG_LOG ; then
fi
do_cmd_success_log_level=""
-do_cmd() {
+do_cmd() { __do_cmd_with_timeout "" "$@"; }
+do_cmd_CRM_meta_timeout()
+{
+ local timeout=$(( ($OCF_RESKEY_CRM_meta_timeout - 500 - $SECONDS*1000) / 1000 ))
+ # if we are short on time already,
+ # try to get away with whatever time is left
+ (( timeout > 0 )) || timeout=0
+ __do_cmd_with_timeout "$timeout" "$@";
+}
+__do_cmd_with_timeout() {
# Run a command, return its exit code, capture any output, and log
# everything if appropriate.
+ # bound the wait with bash "read -t", if timeout is a positive integer.
+ local timeout=$1; shift
local cmd="$*" cmd_out cmd_err ret=125
local success_log_level=${do_cmd_success_log_level:-debug}
local failure_log_level=${do_cmd_failure_log_level:-err}
ocf_log debug "$DRBD_RESOURCE: Calling $cmd"
+ if [[ $timeout =~ ^[1-9][0-9]*$ ]]; then
+ timeout="-t $timeout"
+ else
+ timeout=""
+ fi
+
+ local pipe
+ local pipe_pid
+ local result
+
# capture stdout, stderr, and exit code
- eval "$(exec 3>&1;
+ if exec {pipe}< <(exec 3>&1;
printf "cmd_err=%q\n" \
"$( exec 2>&1 1>&3 3>&-; \
out=$( "$@" ); \
ex=$?; \
printf "cmd_out=%q\nret=%q\n" "$out" "$ex" )"
- )"
+ )
+ then
+ pipe_pid=$!
+
+ read -r -d "" -u $pipe $timeout result
+ if (( $? > 128 )) ; then
+ kill -KILL $pipe_pid
+ # wait $pipe_pid
+ fi
+ exec {pipe}<&-
+
+ eval "$result"
+ fi
if [ $ret != 0 ]; then
ocf_log $failure_log_level "$DRBD_RESOURCE: Called $cmd"
@@ -585,13 +619,13 @@ set_master_score() {
if [[ $1 -le 0 ]]; then
remove_master_score
else
- do_cmd ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 &&
+ do_cmd_CRM_meta_timeout ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 &&
current_master_score=$1
fi
}
remove_master_score() {
- do_cmd ${HA_SBIN_DIR}/crm_master -l reboot -D
+ do_cmd_CRM_meta_timeout ${HA_SBIN_DIR}/crm_master -l reboot -D
current_master_score=""
}
--
2.43.0