From d5a2e95d8c8aaecbdd13260b956503bd321f327378f55588e66e37c0ae0d0a4f Mon Sep 17 00:00:00 2001
From: Egbert Eich
Date: Tue, 14 Aug 2018 13:00:16 +0000
Subject: [PATCH] Accepting request 629222 from home:eeich:branches:network:cluster

- Update to 17.11.9
  * Fix segfault in slurmctld when a job's node bitmap is NULL during a
    scheduling cycle. Primarily caused by EnforcePartLimits=ALL.
  * Remove erroneous unlock in acct_gather_energy/ipmi.
  * Enable support for hwloc version 2.0.1.
  * Fix 'srun -q' (--qos) option handling.
  * Fix socket communication issue that can lead to lost task completion
    messages, which will cause a permanently stuck srun process.
  * Handle creation of TMPDIR if environment variable is set or changed in
    a task prolog script.
  * Avoid node layout fragmentation if running with a fixed CPU count but
    without Sockets and CoresPerSocket defined.
  * burst_buffer/cray - Fix datawarp swap default pool overriding jobdw.
  * Fix incorrect job priority assignment for multi-partition job with
    different PriorityTier settings on the partitions.
  * Fix sinfo to print correct node state.
- When using a remote shared StateSaveLocation, slurmctld needs to be
  started after remote filesystems have become available. Add
  'remote-fs.target' to the 'After=' directive in slurmctld.service
  (boo#1103561).
- Update to 17.11.8
  * Fix incomplete RESPONSE_[RESOURCE|JOB_PACK]_ALLOCATION building path.
  * Do not allocate nodes that were marked down due to the node not responding
    by ResumeTimeout.
  * task/cray plugin - search for "mems" cgroup information in the file
    "cpuset.mems" then fall back to the file "mems".
  * Fix ipmi profile debug uninitialized variable.
  * PMIx: fixed the direct connect inline msg sending.

OBS-URL: https://build.opensuse.org/request/show/629222
OBS-URL: https://build.opensuse.org/package/show/network:cluster/slurm?expand=0&rev=64
---
 slurm-17.11.7.tar.bz2 | 3 -
 slurm-17.11.9.tar.bz2 | 3 +
 slurm.changes | 95 +++++++++++++++++++
 slurm.spec | 33 +++++--
 ...it-when-backup-controller-takes-over.patch | 58 +++++++++++
 5 files changed, 180 insertions(+), 12 deletions(-)
 delete mode 100644 slurm-17.11.7.tar.bz2
 create mode 100644 slurm-17.11.9.tar.bz2
 create mode 100644 slurmctld-rerun-agent_init-when-backup-controller-takes-over.patch

diff --git a/slurm-17.11.7.tar.bz2 b/slurm-17.11.7.tar.bz2
deleted file mode 100644
index b2466b1..0000000
--- a/slurm-17.11.7.tar.bz2
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a4ab10870b1c35f67a3465796960b32e4270e52acc257987b10acc4f17035a57
-size 6249399
diff --git a/slurm-17.11.9.tar.bz2 b/slurm-17.11.9.tar.bz2
new file mode 100644
index 0000000..fbc6787
--- /dev/null
+++ b/slurm-17.11.9.tar.bz2
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c56ed2eab6d2d2adf2ab5aec203175a64b9e8c5a5ba2af29470358e7808bd942
+size 6258698
diff --git a/slurm.changes b/slurm.changes
index 510569e..c939652 100644
--- a/slurm.changes
+++ b/slurm.changes
@@ -1,3 +1,98 @@
+-------------------------------------------------------------------
+Tue Aug 14 10:26:43 UTC 2018 - eich@suse.com
+
+- Update to 17.11.9
+  * Fix segfault in slurmctld when a job's node bitmap is NULL during a
+    scheduling cycle. Primarily caused by EnforcePartLimits=ALL.
+  * Remove erroneous unlock in acct_gather_energy/ipmi.
+  * Enable support for hwloc version 2.0.1.
+  * Fix 'srun -q' (--qos) option handling.
+  * Fix socket communication issue that can lead to lost task completion
+    messages, which will cause a permanently stuck srun process.
+  * Handle creation of TMPDIR if environment variable is set or changed in
+    a task prolog script.
+  * Avoid node layout fragmentation if running with a fixed CPU count but
+    without Sockets and CoresPerSocket defined.
+  * burst_buffer/cray - Fix datawarp swap default pool overriding jobdw.
+  * Fix incorrect job priority assignment for multi-partition job with
+    different PriorityTier settings on the partitions.
+  * Fix sinfo to print correct node state.
+
+-------------------------------------------------------------------
+Thu Aug  2 11:35:55 UTC 2018 - eich@suse.com
+
+- When using a remote shared StateSaveLocation, slurmctld needs to
+  be started after remote filesystems have become available.
+  Add 'remote-fs.target' to the 'After=' directive in slurmctld.service
+  (boo#1103561).
+
+-------------------------------------------------------------------
+Tue Jul 31 18:29:40 UTC 2018 - eich@suse.com
+
+- Update to 17.11.8
+  * Fix incomplete RESPONSE_[RESOURCE|JOB_PACK]_ALLOCATION building path.
+  * Do not allocate nodes that were marked down due to the node not responding
+    by ResumeTimeout.
+  * task/cray plugin - search for "mems" cgroup information in the file
+    "cpuset.mems" then fall back to the file "mems".
+  * Fix ipmi profile debug uninitialized variable.
+  * PMIx: fixed the direct connect inline msg sending.
+  * MYSQL: Fix issue not handling all fields when loading an archive dump.
+  * Allow a job_submit plugin to change the admin_comment field during
+    job_submit_plugin_modify().
+  * job_submit/lua - fix access into reservation table.
+  * MySQL - Prevent deadlock caused by archive logic locking reads.
+  * Don't enforce MaxQueryTimeRange when requesting specific jobs.
+  * Modify --test-only logic to properly support jobs submitted to more than
+    one partition.
+  * Prevent slurmctld from aborting when attempting to set non-existing
+    qos as def_qos_id.
+  * Add new job dependency type of "afterburstbuffer". The pending job will be
+    delayed until the first job completes execution and its burst buffer
+    stage-out is completed.
+  * Reorder proctrack/task plugin load in the slurmstepd to match that of
+    slurmd and avoid race condition calling task before proctrack can introduce.
+  * Prevent reboot of a busy KNL node when requesting inactive features.
+  * Revert to previous behavior when requesting memory per cpu/node introduced
+    in 17.11.7.
+  * Fix to reinitialize previously adjusted job members to their original
+    value when validating the job memory in multi-partition requests.
+  * Fix _step_signal() from always returning SLURM_SUCCESS.
+  * Combine active and available node feature change logs on one line rather
+    than one line per node for performance reasons.
+  * Prevent occasionally leaking freezer cgroups.
+  * Fix potential segfault when closing the mpi/pmi2 plugin.
+  * Fix issues with --exclusive=[user|mcs] to work correctly
+    with preemption or when job requests a specific list of hosts.
+  * Make code compile with hdf5 1.10.2+
+  * mpi/pmix: Fixed the collectives canceling.
+  * SlurmDBD: improve error message handling on archive load failure.
+  * Fix incorrect locking when deleting reservations.
+  * Fix incorrect locking when setting up the power save module.
+  * Fix setting format output length for squeue when showing array jobs.
+  * Add xstrstr function.
+  * Fix printing out of --hint options in sbatch, salloc --help.
+  * Prevent possible divide by zero in _validate_time_limit().
+  * Add Delegate=yes to the slurmd.service file to prevent systemd from
+    interfering with the jobs' cgroup hierarchies.
+  * Change the backlog argument to the listen() syscall within srun to 4096
+    to match elsewhere in the code, and avoid communication problems at scale.
+
+-------------------------------------------------------------------
+Tue Jul 31 17:30:08 UTC 2018 - eich@suse.com
+
+- Fix race in the slurmctld backup controller which prevents it from
+  cleaning up allocations on nodes properly after failing over
+  (bsc#1084917).
+- Handle %license in a backward-compatible manner.
+
+-------------------------------------------------------------------
+Sat Jul 28 15:30:58 UTC 2018 - eich@suse.com
+
+- Add a 'Recommends: slurm-munge' to slurm-slurmdbd.
+
 -------------------------------------------------------------------
 Wed Jul 11 12:04:55 UTC 2018 - eich@suse.com
diff --git a/slurm.spec b/slurm.spec
index aca9dbe..aa38751 100644
--- a/slurm.spec
+++ b/slurm.spec
@@ -18,7 +18,7 @@
 # Check file META in sources: update so_version to (API_CURRENT - API_AGE)
 %define so_version 32
-%define ver 17.11.7
+%define ver 17.11.9
 # so-version is 0 and seems to be stable
 %define pmi_so 0
@@ -73,6 +73,7 @@ Patch5: slurmd-uses-xdaemon_-for-systemd.patch
 Patch6: slurmdbd-uses-xdaemon_-for-systemd.patch
 Patch7: slurmsmwd-uses-xdaemon_-for-systemd.patch
 Patch8: removed-deprecated-xdaemon.patch
+Patch9: slurmctld-rerun-agent_init-when-backup-controller-takes-over.patch
 Requires: slurm-config = %{version}
 Requires: slurm-node = %{version}
@@ -208,6 +209,7 @@ Group: Productivity/Clustering/Computing
 Requires: slurm-config = %{version}
 Requires: slurm-plugins = %{version}
 Requires: slurm-sql = %{version}
+Recommends: slurm-munge = %{version}
 %if 0%{?with_systemd}
 %{?systemd_requires}
 %else
@@ -328,6 +330,7 @@ for the slurm daemons.
 %patch6 -p1
 %patch7 -p1
 %patch8 -p1
+%patch9 -p1
 
 %build
 %configure --enable-shared \
@@ -399,14 +402,20 @@ PartitionName=normal Nodes=linux Default=YES MaxTime=24:00:00 State=UP
 EOF
 # 9/17/14 karl.w.schulz@intel.com - Add option to drop VM cache during epilog
 sed -i '/^# No other SLURM jobs,/i \\n# Drop clean caches (OpenHPC)\necho 3 > /proc/sys/vm/drop_caches\n\n#' %{buildroot}/%{_sysconfdir}/%{name}/slurm.epilog.clean
-# chnage slurmdbd.conf for our needs
-sed -i 's@LogFile=/var/log/slurm/slurmdbd.log@LogFile=/var/log/slurmdbd.log@' %{buildroot}/%{_sysconfdir}/%{name}/slurmdbd.conf
-sed -i -e "s@PidFile=.*@PidFile=%{_localstatedir}/run/slurm/slurmdbd.pid@" %{buildroot}/%{_sysconfdir}/%{name}/slurmdbd.conf
-# manage local state dir
+# change slurmdbd.conf for our needs
+sed -i 's@LogFile=/var/log/slurm/slurmdbd.log@LogFile=/var/log/slurmdbd.log@'\
+    %{buildroot}/%{_sysconfdir}/%{name}/slurmdbd.conf
+sed -i -e "s@PidFile=.*@PidFile=%{_localstatedir}/run/slurm/slurmdbd.pid@" \
+    %{buildroot}/%{_sysconfdir}/%{name}/slurmdbd.conf
+# manage local state dir and a remote states save location
 mkdir -p %{buildroot}/%_localstatedir/lib/slurm
-sed -i -e "s@PIDFile=.*@PIDFile=%{_localstatedir}/run/slurm/slurmctld.pid@" %{buildroot}/%{_unitdir}/slurmctld.service
-sed -i -e "s@PIDFile=.*@PIDFile=%{_localstatedir}/run/slurm/slurmd.pid@" %{buildroot}/%{_unitdir}/slurmd.service
-sed -i -e "s@PIDFile=.*@PIDFile=%{_localstatedir}/run/slurm/slurmdbd.pid@" %{buildroot}/%{_unitdir}/slurmdbd.service
+sed -i -e "s@PIDFile=.*@PIDFile=%{_localstatedir}/run/slurm/slurmctld.pid@" \
+    -e "s@After=.*@After=network.target munge.service remote-fs.target@" \
+    %{buildroot}/%{_unitdir}/slurmctld.service
+sed -i -e "s@PIDFile=.*@PIDFile=%{_localstatedir}/run/slurm/slurmd.pid@" \
+    %{buildroot}/%{_unitdir}/slurmd.service
+sed -i -e "s@PIDFile=.*@PIDFile=%{_localstatedir}/run/slurm/slurmdbd.pid@" \
+    %{buildroot}/%{_unitdir}/slurmdbd.service
 %endif
 
 # Delete unpackaged files:
@@ -604,10 +613,16 @@ exit 0
 %_res_update slurmdbd.service
 %_rest slurmdbd
 
+%if 0%{?sle_version} > 120200 || 0%{?suse_version} > 1320
+%define my_license %license
+%else
+%define my_license %doc
+%endif
+
 %files
 %defattr(-,root,root)
 %doc AUTHORS NEWS RELEASE_NOTES DISCLAIMER
-%license COPYING
+%my_license COPYING
 %doc doc/html
 %{_bindir}/sacct
 %{_bindir}/sacctmgr
diff --git a/slurmctld-rerun-agent_init-when-backup-controller-takes-over.patch b/slurmctld-rerun-agent_init-when-backup-controller-takes-over.patch
new file mode 100644
index 0000000..9474643
--- /dev/null
+++ b/slurmctld-rerun-agent_init-when-backup-controller-takes-over.patch
@@ -0,0 +1,58 @@
+From: Egbert Eich
+Date: Tue Jul 31 17:31:15 2018 +0200
+Subject: slurmctld: rerun agent_init() when backup controller takes over
+Patch-mainline: Not yet
+Git-commit: 169d9522c89a10dcffbf1403c20b4e6249bac79b
+References:
+
+A slurmctld backup controller often fails to clean up jobs which have
+finished; the node appears in an 'IDLE+COMPLETING' state while squeue -l
+still shows the job in a completing state.
+This situation persists until the primary controller is restarted and
+cleans up all tasks in 'COMPLETING' state.
+This issue is caused by a race condition in the backup controller:
+When the backup controller detects that the primary controller is
+inaccessible, it will run through a restart cycle. To trigger the shutdown
+of some entities, it will set slurmctld_config.shutdown_time to a value
+!= 0. Before continuing as the controller in charge, it resets this
+variable to 0 again.
+The agent which handles the request queue - from a separate thread - +wakes up periodically (in a 2 sec interval) and checks for things to do. +If it finds slurmctld_config.shutdown_time set to a value != 0, it will +terminate. +If this wakeup occurs in the 'takeover window' between the variable +being set to !=0 and reset to 0, the agent goes away and will no longer +be available to handle queued requests as there is nothing at the end +of the 'takeover window' that would restart it. + +This fix adds a restart of the agent by calling agent_init() after +slurmctld_config.shutdown_time has been reset to 0. +Should an agent still be running (because it didn't wake up during the +'takeover window') it will be caught in agent_init(). + +Signed-off-by: Egbert Eich +--- + src/slurmctld/backup.c | 4 ++++ + 1 file changed, 4 insertions(+) +diff --git a/src/slurmctld/backup.c b/src/slurmctld/backup.c +index 24ddcde..cf3bb43 100644 +--- a/src/slurmctld/backup.c ++++ b/src/slurmctld/backup.c +@@ -65,6 +65,7 @@ + #include "src/slurmctld/read_config.h" + #include "src/slurmctld/slurmctld.h" + #include "src/slurmctld/trigger_mgr.h" ++#include "src/slurmctld/agent.h" + + #define SHUTDOWN_WAIT 2 /* Time to wait for primary server shutdown */ + +@@ -225,6 +226,9 @@ void run_backup(slurm_trigger_callbacks_t *callbacks) + abort(); + } + slurmctld_config.shutdown_time = (time_t) 0; ++ /* Reinit agent in case it has been terminated - agent_init() ++ will check itself */ ++ agent_init(); + unlock_slurmctld(config_write_lock); + select_g_select_nodeinfo_set_all(); +