From 077a313e9fcacf0b12a35e8094bfdba6ac8aa0d4 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 6 Dec 2024 15:31:01 +0100 Subject: [PATCH 11/12] drbd.ocf: explicitly timeout crm_master IPC early Some environments are notorious for timing out in crm_master one out of ten thousand times. You don't want to know the details. That would then cause a timeout on the monitor action, and pacemaker would feel the need to "recover" from "resource failure", spuriously restarting everything that depends on the DRBD resources. If we fail to update the "master score" in time, we can still report the operation result. The next monitor action will happen, and we get another attempt at updating the master score. If that update was relevant, worst case it is now delayed. Better than the previous worst case, operation timeout interpreted as resource failure. --- scripts/drbd.ocf | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/scripts/drbd.ocf b/scripts/drbd.ocf index 1d051baa550d..c31858343ba4 100755 --- a/scripts/drbd.ocf +++ b/scripts/drbd.ocf @@ -53,6 +53,7 @@ # OCF_RESKEY_CRM_meta_master_max # OCF_RESKEY_CRM_meta_master_node_max # +# OCF_RESKEY_CRM_meta_timeout # OCF_RESKEY_CRM_meta_interval # # OCF_RESKEY_CRM_meta_notify @@ -515,22 +516,55 @@ if $USE_DEBUG_LOG ; then fi do_cmd_success_log_level="" -do_cmd() { +do_cmd() { __do_cmd_with_timeout "" "$@"; } +do_cmd_CRM_meta_timeout() +{ + local timeout=$(( ($OCF_RESKEY_CRM_meta_timeout - 500 - $SECONDS*1000) / 1000 )) + # if we are short on time already, + # try to get away with whatever time is left + (( timeout > 0 )) || timeout=0 + __do_cmd_with_timeout "$timeout" "$@"; +} +__do_cmd_with_timeout() { # Run a command, return its exit code, capture any output, and log # everything if appropriate. + # use the bash builtin "read -t" timeout, if timeout is != 0. 
+ local timeout=$1; shift local cmd="$*" cmd_out cmd_err ret=125 local success_log_level=${do_cmd_success_log_level:-debug} local failure_log_level=${do_cmd_failure_log_level:-err} ocf_log debug "$DRBD_RESOURCE: Calling $cmd" + if [[ $timeout =~ ^[1-9][0-9]*$ ]]; then + timeout="-t $timeout" + else + timeout="" + fi + + local pipe + local pipe_pid + local result + # capture stdout, stderr, and exit code - eval "$(exec 3>&1; + if exec {pipe}< <(exec 3>&1; printf "cmd_err=%q\n" \ "$( exec 2>&1 1>&3 3>&-; \ out=$( "$@" ); \ ex=$?; \ printf "cmd_out=%q\nret=%q\n" "$out" "$ex" )" - )" + ) + then + pipe_pid=$! + + read -r -d "" -u $pipe $timeout result + if (( $? > 128 )) ; then + kill -KILL $pipe_pid + # wait $pipe_pid + fi + exec {pipe}<&- + + eval "$result" + fi if [ $ret != 0 ]; then ocf_log $failure_log_level "$DRBD_RESOURCE: Called $cmd" @@ -585,13 +619,13 @@ set_master_score() { if [[ $1 -le 0 ]]; then remove_master_score else - do_cmd ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 && + do_cmd_CRM_meta_timeout ${HA_SBIN_DIR}/crm_master -Q -l reboot -v $1 && current_master_score=$1 fi } remove_master_score() { - do_cmd ${HA_SBIN_DIR}/crm_master -l reboot -D + do_cmd_CRM_meta_timeout ${HA_SBIN_DIR}/crm_master -l reboot -D current_master_score="" } -- 2.43.0