From 4c38389917a54e137a4578b45f0f6a821c8c591a Mon Sep 17 00:00:00 2001 From: Matthias Gerstner Date: Wed, 5 Dec 2018 15:03:19 +0100 Subject: [PATCH 1/3] pam_slurm_adopt: avoid running outside of the sshd PAM service context This pam module is tailored towards running in the context of remote ssh logins. When running in a different context like a local sudo call then the module could be influenced by e.g. passing environment variables like SLURM_CONF. By limiting the module to only perform its actions when running in the sshd context by default this situation can be avoided. An additional pam module argument service= allows an Administrator to control this behaviour, if different behaviour is explicitly desired. Signed-off-by: Christian Goll --- contribs/pam_slurm_adopt/README | 172 ++++++++++++++++++++++++++++- contribs/pam_slurm_adopt/pam_slurm_adopt.c | 46 ++++++++ 2 files changed, 217 insertions(+), 1 deletion(-) diff --git a/contribs/pam_slurm_adopt/README b/contribs/pam_slurm_adopt/README index 07039740f8..8baece6d2e 100644 --- a/contribs/pam_slurm_adopt/README +++ b/contribs/pam_slurm_adopt/README @@ -1,5 +1,175 @@ Current documentation can be found here: https://slurm.schedmd.com/pam_slurm_adopt.html - (Which is generated from docs/html/pam_slurm_adopt.shtml.) + +======= +AUTHOR + Ryan Cox + +MODULE TYPES PROVIDED + account + +DESCRIPTION + This module attempts to determine the job which originated this connection. 
+ The module is configurable; these are the default steps: + + 1) Check the local stepd for a count of jobs owned by the non-root user + a) If none, deny (option action_no_jobs) + b) If only one, adopt the process into that job + c) If multiple, continue + 2) Determine src/dst IP/port of socket + 3) Issue callerid RPC to slurmd at IP address of source + a) If the remote slurmd can identify the source job, adopt into that job + b) If not, continue + 4) Pick a random local job from the user to adopt into (option action_unknown) + + Jobs are adopted into a job's allocation step. + +MODULE OPTIONS +This module has the following options (* = default): + + ignore_root - By default, all root connections are ignored. If the RPC + is sent to a node which drops packets to the slurmd port, the + RPC will block for some time before failing. This is + unlikely to be desirable. Likewise, root may be trying to + administer the system and not do work that should be in a job. + The job may trigger oom-killer or just exit. If root restarts + a service or similar, it will be tracked and killed by Slurm + when the job exits. This sounds bad because it is bad. + + 1* = Let the connection through without adoption + 0 = I am crazy. I want random services to die when root jobs exit. I + also like it when RPCs block for a while then time out. + + + action_no_jobs - The action to perform if the user has no jobs on the node + + ignore = Do nothing. Fall through to the next pam module + deny* = Deny the connection + + + action_unknown - The action to perform when the user has multiple jobs on + the node *and* the RPC does not locate the source job. + If the RPC mechanism works properly in your environment, + this option will likely be relevant *only* when connecting + from a login node. + + newest* = Pick the newest job on the node. The "newest" job is chosen + based on the mtime of the job's step_extern cgroup; asking + Slurm would require an RPC to the controller. 
The user can ssh + in but may be adopted into a job that exits earlier than the + job they intended to check on. The ssh connection will at + least be subject to appropriate limits and the user can be + informed of better ways to accomplish their objectives if this + becomes a problem. + allow = Let the connection through without adoption + deny = Deny the connection + + + action_adopt_failure - The action to perform if the process is unable to be + adopted into any job for whatever reason. If the + process cannot be adopted into the job identified by + the callerid RPC, it will fall through to the + action_unknown code and try to adopt there. A failure + at that point or if there is only one job will result + in this action being taken. + + allow* = Let the connection through without adoption + deny = Deny the connection + + action_generic_failure - The action to perform if there are certain failures + such as the inability to talk to the local slurmd + or if the kernel doesn't offer the correct + facilities. + + ignore* = Do nothing. Fall through to the next pam module + allow = Let the connection through without adoption + deny = Deny the connection + + log_level - See SlurmdDebug in slurm.conf(5) for available options. The + default log_level is info. + + disable_x11 - turn off Slurm built-in X11 forwarding support. + + 1 = Do not check for Slurm's X11 forwarding support, and do not + alter the DISPLAY variable. + 0* = If the step the job is adopted into has X11 enabled, set + the DISPLAY variable in the process's environment accordingly. + + service - The pam service name for which this module should run. By default + it only runs for sshd, for which it was designed. A + different service name can be specified like "login" or "*" to + allow the module to run in any service context. For local pam logins + this module could cause unexpected behaviour or even security + issues.
Therefore if the service name does not match then this + module will not perform the adoption logic and will return + PAM_IGNORE immediately. + +SLURM.CONF CONFIGURATION + PrologFlags=contain must be set in slurm.conf. This sets up the "extern" step + into which ssh-launched processes will be adopted. + + **** IMPORTANT **** + PrologFlags=contain must be in place *before* using this module. + The module bases its checks on local steps that have already been launched. If + the user has no steps on the node, such as the extern step, the module will + assume that the user has no jobs allocated to the node. Depending on your + configuration of the pam module, you might deny *all* user ssh attempts. + +NOTES + This module and the related RPC currently support Linux systems which + have network connection information available through /proc/net/tcp{,6}. A + process's sockets must exist as symlinks in its /proc/self/fd directory. + + The RPC data structure itself is OS-agnostic. If support is desired for a + different OS, relevant code must be added to find one's socket information + then match that information on the remote end to a particular process which + Slurm is tracking. + + IPv6 is supported by the RPC data structure itself and the code which sends it + and receives it. Sending the RPC to an IPv6 address is not currently + supported by Slurm. Once support is added, remove the relevant check in + slurm_network_callerid(). + + For the action_unknown=newest setting to work, the memory cgroup must be in + use so that the code can check mtimes of cgroup directories. If you would + prefer to use a different subsystem, modify the _indeterminate_multiple + function. + +FIREWALLS, IP ADDRESSES, ETC. + slurmd should be accessible on any IP address from which a user might launch + ssh. The RPC to determine the source job must be able to reach the slurmd + port on that particular IP address.
+ + If there is no slurmd on the source node, such as on a login node, it is + better to have the RPC be rejected rather than silently dropped. This + will allow better responsiveness to the RPC initiator. + +EXAMPLES / SUGGESTED USAGE + Use of this module is recommended on any compute node. + + Add the following line to the appropriate file in /etc/pam.d, such as + system-auth or sshd: + + account sufficient pam_slurm_adopt.so + + If you always want to allow access for an administrative group (e.g. wheel), + stack the pam_access module after pam_slurm_adopt. A success with + pam_slurm_adopt is sufficient to allow access but the pam_access module can + allow others, such as staff, access even without jobs. + + account sufficient pam_slurm_adopt.so + account required pam_access.so + + + Then edit the pam_access configuration file (/etc/security/access.conf): + + +:wheel:ALL + -:ALL:ALL + + When access is denied, the user will receive a relevant error message. + + pam_systemd.so is known to not play nice with Slurm's usage of cgroups. It is + recommended that you disable it or possibly add pam_slurm_adopt.so after + pam_systemd.so. 
diff --git a/contribs/pam_slurm_adopt/pam_slurm_adopt.c b/contribs/pam_slurm_adopt/pam_slurm_adopt.c index 51f21e8729..dccad90185 100644 --- a/contribs/pam_slurm_adopt/pam_slurm_adopt.c +++ b/contribs/pam_slurm_adopt/pam_slurm_adopt.c @@ -94,6 +94,7 @@ static struct { log_level_t log_level; char *node_name; bool disable_x11; + char *pam_service; } opts; static void _init_opts(void) @@ -107,6 +108,7 @@ static void _init_opts(void) opts.log_level = LOG_LEVEL_INFO; opts.node_name = NULL; opts.disable_x11 = false; + opts.pam_service = NULL; } static slurm_cgroup_conf_t *slurm_cgroup_conf = NULL; @@ -576,6 +578,9 @@ static void _parse_opts(pam_handle_t *pamh, int argc, const char **argv) opts.node_name = xstrdup(v); } else if (!xstrncasecmp(*argv, "disable_x11=1", 13)) { opts.disable_x11 = true; + } else if (!xstrncasecmp(*argv, "service=", 8)) { + v = (char *)(8 + *argv); + opts.pam_service = xstrdup(v); } } @@ -601,6 +606,40 @@ static int _load_cgroup_config() return SLURM_SUCCESS; } +/* Only continue when running in an allowed PAM service context: the service + * named by the "service=" module option ("sshd" when unset, "*" for any). + * If this module is used locally, e.g. via sudo, then unexpected things might + * happen (e.g. passing environment variables interpreted by Slurm code like + * SLURM_CONF or inheriting file descriptors that are used by _try_rpc()). + * Returns PAM_SUCCESS if allowed, PAM_IGNORE if not, or a PAM error code. */ +static int check_pam_service(pam_handle_t *pamh) +{ + const char *allowed = opts.pam_service ?
opts.pam_service : "sshd"; + char *service = NULL; + int rc; + + if (!strcmp(allowed, "*")) + // any service name is allowed + return PAM_SUCCESS; + + rc = pam_get_item(pamh, PAM_SERVICE, (void*)&service); + + if (rc != PAM_SUCCESS) { + pam_syslog(pamh, LOG_ERR, "failed to obtain PAM_SERVICE name"); + return rc; + } + else if (service == NULL) { + // this shouldn't actually happen + return PAM_BAD_ITEM; + } + + if (!strcmp(service, allowed)) { + return PAM_SUCCESS; + } + + pam_syslog(pamh, LOG_INFO, "Not adopting process since this is not an allowed pam service"); + return PAM_IGNORE; +} /* Parse arguments, etc then get my socket address/port information. Attempt to * adopt this process into a job in the following order: @@ -622,6 +661,12 @@ PAM_EXTERN int pam_sm_acct_mgmt(pam_handle_t *pamh, int flags _init_opts(); _parse_opts(pamh, argc, argv); + + retval = check_pam_service(pamh); + if (retval != PAM_SUCCESS) { + return retval; + } + _log_init(opts.log_level); switch (opts.action_generic_failure) { @@ -765,6 +810,7 @@ cleanup: xfree(buf); xfree(slurm_cgroup_conf); xfree(opts.node_name); + xfree(opts.pam_service); return rc; } -- 2.16.4