mariadb-image/healthcheck.sh

396 lines
10 KiB
Bash
Raw Permalink Normal View History

#!/bin/bash
#
# Healthcheck script for MariaDB
#
# Runs various tests on the MariaDB server to check its health. Pass the tests
# to run as arguments. If all tests succeed, the server is considered healthy,
# otherwise it's not.
#
# Arguments are processed in strict order. Set replication_* options before
# the --replication option. This allows a different set of replication checks
# on different connections.
#
# --su{=|-mysql} is an option to run the healthcheck as a different unix user.
# Useful if a mysql@localhost user exists with unix socket authentication.
# Using this option disregards any options set before it, so it should usually
# be the first option.
#
# Some tests require SQL privileges.
#
# TEST MINIMUM GRANTS REQUIRED
# connect none*
# innodb_initialized USAGE
# innodb_buffer_pool_loaded USAGE
# galera_online USAGE
# galera_ready USAGE
# replication REPLICATION_CLIENT (<10.5) or REPLICA MONITOR (10.5+)
# mariadbupgrade none, however unix user permissions on datadir
#
# The SQL user used is the default for the mariadb client. This can be the unix user
# if no user(or password) is set in the [mariadb-client] section of a configuration
# file. --defaults-{file,extra-file,group-suffix} can specify a file/configuration
# different from elsewhere.
#
# Note * though, without any permissions, an access denied error message will
# be written to the error log. Granting USAGE is recommended to avoid this.
# Exit immediately when a command fails outside a conditional context (-e),
# and give a pipeline the status of its last failing stage instead of the
# status of its final stage (pipefail).
set -eo pipefail
# _process_sql [client-args...]
#
# Runs the mariadb client in batch mode (-B) over the unix socket with TLS
# disabled, forwarding every argument unchanged.
# Globals:  nodefaults (read), def (read: file, extra_file, group_suffix)
# Outputs:  whatever the client prints
# Returns:  the exit status of the client
_process_sql()
{
  # Collect the options in an array so that a value containing spaces or glob
  # characters (e.g. a defaults file path) stays one single argument.
  local -a client_opts=()
  # The defaults options are kept ahead of all other options.
  if [ -n "$nodefaults" ]; then
    client_opts+=(--no-defaults)
  fi
  if [ -n "${def['file']}" ]; then
    client_opts+=("--defaults-file=${def['file']}")
  fi
  if [ -n "${def['extra_file']}" ]; then
    client_opts+=("--defaults-extra-file=${def['extra_file']}")
  fi
  if [ -n "${def['group_suffix']}" ]; then
    client_opts+=("--defaults-group-suffix=${def['group_suffix']}")
  fi
  mariadb "${client_opts[@]}" \
    --skip-ssl --skip-ssl-verify-server-cert \
    --protocol socket \
    -B "$@"
}
# TESTS
# CONNECT
#
# Tests that a connection can be made over TCP, i.e. that the server has
# reached the final state of the entrypoint and is listening. The
# authentication used isn't tested.
# Globals:  nodefaults, def (read); connect_s (written: 0 reachable, 1 not)
# Outputs:  "Unknown error ..." on stderr for an unrecognised client answer
# Returns:  the value stored in connect_s
connect()
{
  local s
  local -a defaults_opts=()
  # short cut mechanism, to work with --require-secure-transport
  # `|| true`: a failed socket attempt has to reach the TCP fallback below even
  # while errexit is active, instead of ending the script right here.
  s=$(_process_sql --skip-column-names -e 'select @@skip_networking') || true
  case "$s" in
    0|1)
      connect_s=$s
      return "$s"
      ;;
  esac
  # Defaults options go into an array so values with spaces stay one argument.
  if [ -n "$nodefaults" ]; then
    defaults_opts+=(--no-defaults)
  fi
  if [ -n "${def['file']}" ]; then
    defaults_opts+=("--defaults-file=${def['file']}")
  fi
  if [ -n "${def['extra_file']}" ]; then
    defaults_opts+=("--defaults-extra-file=${def['extra_file']}")
  fi
  if [ -n "${def['group_suffix']}" ]; then
    defaults_opts+=("--defaults-group-suffix=${def['group_suffix']}")
  fi
  # falling back to tcp if there wasn't a connection answer.
  # The client is expected to fail here (e.g. access denied); its message is
  # what gets classified below, so its exit status must not abort the script.
  s=$(mariadb "${defaults_opts[@]}" \
    --skip-ssl --skip-ssl-verify-server-cert \
    -h localhost --protocol tcp \
    --skip-column-names --batch --skip-print-query-on-error \
    -e 'select @@skip_networking' 2>&1) || true
  case "$s" in
    1) # skip-networking=1 (no network)
      ;&
    ERROR\ 2002\ \(HY000\):*)
      # cannot connect
      connect_s=1
      ;;
    0) # skip-networking=0
      ;&
    ERROR\ 1820\ \(HY000\)*) # password expire
      ;&
    ERROR\ 4151\ \(HY000\):*) # account locked
      ;&
    ERROR\ 1226\ \(42000\)*) # resource limit exceeded
      ;&
    ERROR\ 1[0-9][0-9][0-9]\ \(28000\):*)
      # grep access denied and other 28000 client errors - we did connect
      connect_s=0
      ;;
    *)
      >&2 echo "Unknown error $s"
      connect_s=1
      ;;
  esac
  return "$connect_s"
}
# INNODB_INITIALIZED
#
# This tests that the crash recovery of InnoDB has completed
# along with all the other things required to make it to a healthy
# operational state. Note this may return true in the early
# states of initialization. Use with a connect test to avoid
# these false positives.
# Succeeds when information_schema.ENGINES lists InnoDB with a support value
# of YES, DEFAULT or ENABLED (the query then yields the single value 1).
innodb_initialized()
{
  local engine_row
  engine_row=$(_process_sql --skip-column-names -e "select 1 from information_schema.ENGINES WHERE engine='innodb' AND support in ('YES', 'DEFAULT', 'ENABLED')")
  if [[ $engine_row == 1 ]]; then
    return 0
  fi
  return 1
}
# INNODB_BUFFER_POOL_LOADED
#
# Tests that the load of the InnoDB buffer pool has completed.
# Implies innodb_buffer_pool_load_at_startup=1 (default), or that the load
# was triggered manually with SET innodb_buffer_pool_load_now=1
# Succeeds when the Innodb_buffer_pool_load_status global status value
# contains the text "load completed".
innodb_buffer_pool_loaded()
{
  local load_status
  load_status=$(_process_sql --skip-column-names -e "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='Innodb_buffer_pool_load_status'")
  case "$load_status" in
    *'load completed'*)
      return 0
      ;;
  esac
  return 1
}
# GALERA_ONLINE
#
# Tests that the galera node is in the SYNCed state
# Succeeds when the WSREP_LOCAL_STATE global status value compares
# numerically equal to 4.
galera_online()
{
  local local_state
  local_state=$(_process_sql --skip-column-names -e "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='WSREP_LOCAL_STATE'")
  # 4 from https://galeracluster.com/library/documentation/node-states.html#node-state-changes
  # not https://xkcd.com/221/
  if ! [[ $local_state -eq 4 ]]; then
    return 1
  fi
  return 0
}
# GALERA_READY
#
# Tests that the Galera provider is ready.
# Succeeds when the WSREP_READY global status value is exactly "ON".
galera_ready()
{
  local ready_flag
  ready_flag=$(_process_sql --skip-column-names -e "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='WSREP_READY'")
  case "$ready_flag" in
    ON)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
# REPLICATION
#
# Tests the replication has the required set of functions:
# --replication_all -> Checks all replication sources
# --replication_name=n -> sets the multisource connection name tested
# --replication_io -> IO thread is running
# --replication_sql -> SQL thread is running
# --replication_seconds_behind_master=n -> less than or equal this seconds of delay
# --replication_sql_remaining_delay=n -> less than or equal this seconds of remaining delay
# (ref: https://mariadb.com/kb/en/delayed-replication/)
# Globals:  repl (read: all, name, io, sql, seconds_behind_master,
#           sql_remaining_delay)
# Returns:  0 when every requested check passes on every row read,
#           1 when a check fails or the client output stalls.
replication()
{
  # SHOW REPLICA available 10.5+
  # https://github.com/koalaman/shellcheck/issues/2383
  # shellcheck disable=SC2016,SC2026
  _process_sql -e "SHOW ${repl['all']:+all} REPLICA${repl['all']:+S} ${repl['name']:+'${repl['name']}'} STATUS\G" | \
  {
    # This group is a pipeline stage, so it runs in a subshell: `return`
    # only ends the subshell and its status becomes the pipeline status.
    local n v read_rc=0
    # required for trim of leading space.
    shopt -s extglob
    # Row header
    read -t 5 -r
    # read timeout
    [ $? -gt 128 ] && return 1
    while :; do
      # Keep read's own status: the status of a `while` loop is that of the
      # last body command, so a timeout cannot be seen after `done`.
      IFS=":" read -t 1 -r n v || { read_rc=$?; break; }
      # Trim leading space
      n=${n##+([[:space:]])}
      # Leading space on all values by the \G format needs to be trimmed.
      v=${v:1}
      case "$n" in
        Slave_IO_Running)
          if [ -n "${repl['io']}" ] && [ "$v" = 'No' ]; then
            return 1
          fi
          ;;
        Slave_SQL_Running)
          if [ -n "${repl['sql']}" ] && [ "$v" = 'No' ]; then
            return 1
          fi
          ;;
        Seconds_Behind_Master)
          # A NULL value is the IO thread not running:
          if [ -n "${repl['seconds_behind_master']}" ] &&
            { [ "$v" = NULL ] ||
              (( "${repl['seconds_behind_master']}" < "$v" )); }; then
            return 1
          fi
          ;;
        SQL_Remaining_Delay)
          # Unlike Seconds_Behind_Master, sql_remaining_delay will hit NULL
          # once replication is caught up - https://mariadb.com/kb/en/delayed-replication/
          if [ -n "${repl['sql_remaining_delay']}" ] &&
            [ "$v" != NULL ] &&
            (( "${repl['sql_remaining_delay']}" < "$v" )); then
            return 1
          fi
          ;;
      esac
    done
    # read timeout (status above 128); plain end of input is status 1
    [ "$read_rc" -gt 128 ] && return 1
    return 0
  }
  # reachable in command not found(?)
  # shellcheck disable=SC2317
  return $?
}
# mariadbupgrade
#
# Test the lock on the file $datadir/mariadb_upgrade_info
# https://jira.mariadb.org/browse/MDEV-27068
# Globals:  datadir (read)
# Returns:  0 when $datadir/mariadb_upgrade_info is not readable, otherwise
#           the status of a non-blocking exclusive flock attempt on it.
mariadbupgrade()
{
  local info_file="$datadir/mariadb_upgrade_info"
  local lock_rc=0
  if [ ! -r "$info_file" ]; then
    return 0
  fi
  # The file is opened read-only on descriptor 9 just for this lock attempt.
  flock --exclusive --nonblock -n 9 9<"$info_file" || lock_rc=$?
  return "$lock_rc"
}
# MAIN
# Refuse to run without any test or option on the command line.
if (( $# == 0 )); then
  >&2 echo "At least one argument required"
  exit 1
fi
#ENDOFSUBSTITUTIONS
# Marks the end of mysql -> mariadb name changes in 10.6+
# Global variables used by tests
# repl: replication check options; def: client --defaults-* options.
declare -A repl def
nodefaults=''
connect_s=''
datadir=/var/lib/mysql
# Pick up the healthcheck client configuration when the data directory has one.
if [[ -f $datadir/.my-healthcheck.cnf ]]; then
  def['extra_file']="$datadir/.my-healthcheck.cnf"
fi
_repl_param_check()
{
case "$1" in
seconds_behind_master) ;&
sql_remaining_delay)
if [ -z "${repl['io']}" ]; then
repl['io']=1
echo "Forcing --replication_io=1, $1 requires IO thread to be running" >&2
fi
;;
all)
if [ -n "${repl['name']}" ]; then
unset 'repl[name]'
echo "Option --replication_all incompatible with specified source --replication_name, clearing replication_name" >&2
fi
;;
name)
if [ -n "${repl['all']}" ]; then
unset 'repl[all]'
echo "Option --replication_name incompatible with --replication_all, clearing replication_all" >&2
fi
;;
esac
}
# _test_exists name
#
# Returns the status of `declare -F name`: 0 when a shell function called
# name is defined. The function name that declare prints is discarded.
_test_exists()
{
  local candidate=$1
  declare -F "$candidate" > /dev/null
}
# Process the arguments strictly left to right: options update the globals
# used by the tests, every other --name runs the function called "name", and
# the first failing or unknown test ends the script with status 1.
#
# Start without a pending test, so a "test" variable inherited from the
# environment cannot be mistaken for a requested check.
test=
while [ $# -gt 0 ]; do
  case "$1" in
    --su=*)
      u="${1#*=}"
      shift
      # Re-run this script as user u with only the remaining arguments.
      exec gosu "${u}" "${BASH_SOURCE[0]}" "$@"
      ;;
    --su)
      if [ $# -lt 2 ]; then
        echo "Option --su requires a user name" >&2
        exit 1
      fi
      u=$2
      shift 2
      exec gosu "$u" "${BASH_SOURCE[0]}" "$@"
      ;;
    --su-mysql)
      shift
      exec gosu mysql "${BASH_SOURCE[0]}" "$@"
      ;;
    --replication_*=*)
      # Change the n to what is between _ and = and make lower case
      n=${1#*_}
      n=${n%%=*}
      n=${n,,*}
      # v is after the =
      v=${1#*=}
      repl[$n]=$v
      _repl_param_check "$n"
      ;;
    --replication_*)
      # Without =, look for a non --option next as the value,
      # otherwise treat it as an "enable", just equate to 1.
      # Clearing option is possible with "--replication_X="
      n=${1#*_}
      n=${n,,*}
      # No following argument at all also means "enable"; consuming a value
      # that is not there would make the final shift fail under errexit.
      if [ $# -lt 2 ] || [ "${2:0:2}" == '--' ]; then
        repl[$n]=1
      else
        repl[$n]=$2
        shift
      fi
      _repl_param_check "$n"
      ;;
    --datadir=*)
      datadir=${1#*=}
      ;;
    --datadir)
      if [ $# -lt 2 ]; then
        echo "Option --datadir requires a value" >&2
        exit 1
      fi
      shift
      datadir=${1}
      ;;
    --no-defaults)
      def=()
      nodefaults=1
      ;;
    --defaults-file=*|--defaults-extra-file=*|--defaults-group-suffix=*)
      n=${1:11} # length --defaults-
      n=${n%%=*}
      n=${n//-/_}
      # v is after the =
      v=${1#*=}
      def[$n]=$v
      nodefaults=
      ;;
    --defaults-file|--defaults-extra-file|--defaults-group-suffix)
      n=${1:11} # length --defaults-
      n=${n//-/_}
      # As the last argument, or followed by another option: clear the value.
      if [ $# -lt 2 ] || [ "${2:0:2}" == '--' ]; then
        def[$n]=""
      else
        def[$n]=$2
        shift
      fi
      nodefaults=
      ;;
    --no-connect)
      # used for /docker-entrypoint-initdb.d scripts
      # where you definitely don't want a connection test
      connect_s=0
      ;;
    --*)
      test=${1#--}
      ;;
    *)
      echo "Unknown healthcheck option $1" >&2
      exit 1
      ;;
  esac
  if [ -n "$test" ]; then
    if ! _test_exists "$test" ; then
      echo "healthcheck unknown option or test '$test'" >&2
      exit 1
    elif ! "$test"; then
      echo "healthcheck $test failed" >&2
      exit 1
    fi
    test=
  fi
  shift
done
# Unless connect_s already holds 0, no connect verdict of success was
# recorded, so the current success status is suspicious:
# return what connect thinks.
case "$connect_s" in
  0)
    ;;
  *)
    connect
    exit $?
    ;;
esac