commit 626fb47a3b8eb56c6f2aa5810bc9135293bdf2963a26a2eaa6e29faf78bed511 Author: Egbert Eich Date: Wed Jan 8 06:03:29 2025 +0000 - Update to version 24.11 * `slurmctld` - Reject arbitrary distribution jobs that do not specify a task count. * Fix backwards compatibility of the `RESPONSE_JOB_INFO RPC` (used by `squeue`, `scontrol show job`, etc.) with Slurm clients version 24.05 and below. This was a regression in 24.11.0rc1. * Do not let `slurmctld`/`slurmd` start if there are more nodes defined in `slurm.conf` than the maximum supported amount (64k nodes). * `slurmctld` - Set job's exit code to 1 when a job fails with state `JOB_NODE_FAIL`. This fixes `sbatch --wait` not being able to exit with error code when a job fails for this reason in some cases. * Fix certain reservation updates requested from 23.02 clients. * `slurmrestd` - Fix populating non-required object fields of objects as `{}` in JSON/YAML instead of `null` causing compiled OpenAPI clients to reject the response to `GET /slurm/v0.0.40/jobs` due to validation failure of `.jobs[].job_resources`. * Fix issue where older versions of Slurm talking to a 24.11 dbd could lose step accounting. * Fix minor memory leaks. * Fix bad memory reference when `xstrchr` fails to find char. * Remove duplicate checks for a data structure. * Fix race condition in `stepmgr` step completion handling. * `slurm.spec` - add ability to specify patches to apply on the command line. * `slurm.spec` - add ability to supply extra version information. * Fix 24.11 HA issues. 
* Fix requeued jobs keeping their priority until the decay thread OBS-URL: https://build.opensuse.org/package/show/network:cluster/slurm?expand=0&rev=302 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..9b03811 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,23 @@ +## Default LFS +*.7z filter=lfs diff=lfs merge=lfs -text +*.bsp filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.gem filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.jar filter=lfs diff=lfs merge=lfs -text +*.lz filter=lfs diff=lfs merge=lfs -text +*.lzma filter=lfs diff=lfs merge=lfs -text +*.obscpio filter=lfs diff=lfs merge=lfs -text +*.oxt filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.rpm filter=lfs diff=lfs merge=lfs -text +*.tbz filter=lfs diff=lfs merge=lfs -text +*.tbz2 filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.ttf filter=lfs diff=lfs merge=lfs -text +*.txz filter=lfs diff=lfs merge=lfs -text +*.whl filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..57affb6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.osc diff --git a/Fix-test-21.41.patch b/Fix-test-21.41.patch new file mode 100644 index 0000000..bdb0499 --- /dev/null +++ b/Fix-test-21.41.patch @@ -0,0 +1,65 @@ +From: Egbert Eich +Date: Wed Jun 22 14:39:10 2022 +0200 +Subject: Fix test 21.41 +Patch-mainline: Not yet +Git-repo: https://github.com/SchedMD/slurm +Git-commit: 21619ffa15d1d656ee11a477ebb8215a06387fdd +References: + +Since expect is not line oriented, the output is not matched line by line. 
+Thus the order in which results are returned by sacctmgr actually matters: +If the first test case matches what is returned first, this part will be +consumed. If the 2nd test case will then match what is left over, the +test will actually succeed. +If this is not the case, ie if the first test matches a part that is +actually sent later, the earlier parts will actually be forgotten and +won't match at all. +To make the test resilient to different order of results, the test has +been rewritten to only contain a single match line. + +Signed-off-by: Egbert Eich +Signed-off-by: Egbert Eich +--- + testsuite/expect/test21.41 | 30 +++++++++++++++--------------- + 1 file changed, 15 insertions(+), 15 deletions(-) +diff --git a/testsuite/expect/test21.41 b/testsuite/expect/test21.41 +index c0961522db..1fd921a48f 100755 +--- a/testsuite/expect/test21.41 ++++ b/testsuite/expect/test21.41 +@@ -372,21 +372,21 @@ expect { + -re "There was a problem" { + fail "There was a problem with the sacctmgr command" + } +- -re "$user1.$wckey1.($number)." { +- set user1wckey1 $expect_out(1,string) +- exp_continue +- } +- -re "$user2.$wckey1.($number)." { +- set user2wckey1 $expect_out(1,string) +- exp_continue +- } +- -re "$user1.$wckey2.($number)." { +- set user1wckey2 $expect_out(1,string) +- exp_continue +- } +- -re "$user2.$wckey2.($number)." { +- set user2wckey2 $expect_out(1,string) +- exp_continue ++ -re "($user1|$user2).($wckey1|$wckey2).($number)." 
{ ++ if { $expect_out(1,string) eq $user1 } { ++ if { $expect_out(2,string) eq $wckey1 } { ++ set user1wckey1 $expect_out(3,string) ++ } elseif { $expect_out(2,string) eq $wckey2 } { ++ set user1wckey2 $expect_out(3,string) ++ } ++ } elseif { $expect_out(1,string) eq $user2 } { ++ if { $expect_out(2,string) eq $wckey1 } { ++ set user2wckey1 $expect_out(3,string) ++ } elseif { $expect_out(2,string) eq $wckey2 } { ++ set user2wckey2 $expect_out(3,string) ++ } ++ } ++ exp_continue + } + timeout { + fail "sacctmgr wckeys not responding" diff --git a/Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch b/Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch new file mode 100644 index 0000000..f557bce --- /dev/null +++ b/Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch @@ -0,0 +1,26 @@ +From: Egbert Eich +Date: Sat Jul 2 11:25:11 2022 +0200 +Subject: Fix test7.2 to find libpmix under lib64 as well +Patch-mainline: Not yet +Git-repo: https://github.com/SchedMD/slurm +Git-commit: 4771b96995f90a64a828aac16a10bd56db61a711 +References: + +Signed-off-by: Egbert Eich +Signed-off-by: Egbert Eich +--- + testsuite/expect/test7.2 | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) +diff --git a/testsuite/expect/test7.2 b/testsuite/expect/test7.2 +index 9d1f1a2dee..f63ecd643e 100755 +--- a/testsuite/expect/test7.2 ++++ b/testsuite/expect/test7.2 +@@ -42,7 +42,7 @@ if {[get_config_param "SwitchType"] eq "switch/cray"} { + skip "This test is incompatible with Cray systems" + } + +-if { [file exists ${slurm_dir}/lib/libpmi.so] == 0 } { ++if { ![file exists ${slurm_dir}/lib/libpmi.so] && ![file exists ${slurm_dir}/lib64/libpmi.so]} { + skip "PMI library not compiled, can't perform pmi testing" + } + diff --git a/README_Testsuite.md b/README_Testsuite.md new file mode 100644 index 0000000..3acd3a4 --- /dev/null +++ b/README_Testsuite.md @@ -0,0 +1,125 @@ +# Running the Slurm 'expect' Testsuite + +The ```slurm-testsuite``` package contains the Slurm expect test suite. 
+This package is meant to be installed on a test setup only, it should +NEVER BE INSTALLED ON A REGULAR OR EVEN PRODUCTION SYSTEM. +SUSE uses this package to determine regressions and for quality assurance. +The results are monitored and evaluated regularly in house. +A specific configuration is required to run this test suite, this document +attempts to describe the steps needed. +A small subset of tests is currently failing. The reasons are yet to be +determined. + +Please do not file bug reports based on test results! + +The testsuite is preconfigured to work with 4 nodes: ```node01```,..., +```node04```. ```node01``` serves as control and compute node. The slurm +configuration, home, and the test suite are shared across the nodes. +The test suite should be mounted under /home (to make ```sgather``` work +correctly). + +For tests involving MPI this test suite currently uses OpenMPI version 4. + +## Install and set up the Base System + +1. Prepare image with a minimal text mode installation. +2. Install, enable and start sshd and make sure root is able to log in + without password across all nodes. + ``` + # zypper install openssh-server openssh-clients + # systemctl enable --now sshd + # ssh-keygen -t rsa -f .ssh/id_rsa -N + # cat .ssh/id_rsa.pub >> .ssh/authorized_keys + ``` +3. Create a test user 'auser' allow ssh from/to root: + ``` + # useradd -m auser + # cp -r /root/.ssh /home/auser + ``` +4. 
Set up a persistent network if to obtain the network address and + hostname thru DHCP: + ``` + # echo 'SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", '\ + 'ATTR{address}=="?*", ATTR{dev_id}=="0x0", ATTR{type}=="1",'\ + ' KERNEL=="?*", NAME="lan0" >> /etc/udev/rules.d/70-persistent-net.rules + # cat > /etc/sysconfig/network/ifcfg-lan0 <> /etc/exports <> /etc/fstab < +Date: Wed, 8 Jan 2020 20:56:25 +0100 +Subject: [PATCH] Remove rpath from build + +Signed-off-by: Egbert Eich +--- + contribs/perlapi/libslurm/perl/Makefile.PL.in | 4 ++-- + contribs/perlapi/libslurmdb/perl/Makefile.PL.in | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/contribs/perlapi/libslurm/perl/Makefile.PL.in b/contribs/perlapi/libslurm/perl/Makefile.PL.in +index e8f8aff54d..b51f53f412 100644 +--- a/contribs/perlapi/libslurm/perl/Makefile.PL.in ++++ b/contribs/perlapi/libslurm/perl/Makefile.PL.in +@@ -68,7 +68,7 @@ DESTDIR_BUG + # AIX has problems with not always having the correct + # flags so we have to add some :) + my $os = lc(`uname`); +-my $other_ld_flags = '-Wl,-rpath,@top_builddir@/src/api/.libs -Wl,-rpath,@libdir@'; ++my $other_ld_flags = "-L@top_builddir@/src/api/.libs -lslurm"; + $other_ld_flags = " -brtl -G -bnoentry -bgcbypass:1000 -bexpfull" + if $os =~ "aix"; + +@@ -79,7 +79,7 @@ WriteMakefile( + ($] >= 5.005 ? ## Add these new keywords supported since 5.005 + (ABSTRACT_FROM => 'lib/Slurm.pm', # retrieve abstract from module + AUTHOR => 'Hongjia Cao ') : ()), +- LIBS => ['-L@top_builddir@/src/api/.libs -L@libdir@ -lslurm'], # e.g., '-lm' ++ LIBS => ["-L@prefix@/lib -lslurm"], # e.g., '-lm' + DEFINE => '', # e.g., '-DHAVE_SOMETHING' + INC => "-I. 
-I@top_srcdir@ -I@top_srcdir@/contribs/perlapi/common -I@top_builddir@", + # Un-comment this if you add C files to link with later: +diff --git a/contribs/perlapi/libslurmdb/perl/Makefile.PL.in b/contribs/perlapi/libslurmdb/perl/Makefile.PL.in +index 4fb38b9725..148efa6e82 100644 +--- a/contribs/perlapi/libslurmdb/perl/Makefile.PL.in ++++ b/contribs/perlapi/libslurmdb/perl/Makefile.PL.in +@@ -68,7 +68,7 @@ DESTDIR_BUG + # AIX has problems with not always having the correct + # flags so we have to add some :) + my $os = lc(`uname`); +-my $other_ld_flags = '-Wl,-rpath,@top_builddir@/src/db_api/.libs -Wl,-rpath,@libdir@'; ++my $other_ld_flags = "-L@top_builddir@/src/api/.libs -lslurm"; + $other_ld_flags = " -brtl -G -bnoentry -bgcbypass:1000 -bexpfull" + if $os =~ "aix"; + +@@ -79,7 +79,7 @@ WriteMakefile( + ($] >= 5.005 ? ## Add these new keywords supported since 5.005 + (ABSTRACT_FROM => 'Slurmdb.pm', # retrieve abstract from module + AUTHOR => 'Don Lipari ') : ()), +- LIBS => ['-L@top_builddir@/src/api/.libs -L@libdir@ -lslurm'], # e.g., '-lm' ++ LIBS => ["-L@prefix@/lib -lslurm"], # e.g., '-lm' + DEFINE => '', # e.g., '-DHAVE_SOMETHING' + INC => "-I. -I@top_srcdir@ -I@top_srcdir@/contribs/perlapi/common -I@top_builddir@", + # Un-comment this if you add C files to link with later: +-- +2.42.1 + diff --git a/_service b/_service new file mode 100644 index 0000000..5d571b0 --- /dev/null +++ b/_service @@ -0,0 +1,5 @@ + + + yes + + diff --git a/pam_slurm-Initialize-arrays-and-pass-sizes.patch b/pam_slurm-Initialize-arrays-and-pass-sizes.patch new file mode 100644 index 0000000..ed3649c --- /dev/null +++ b/pam_slurm-Initialize-arrays-and-pass-sizes.patch @@ -0,0 +1,86 @@ +From d51d3e1db8b2ed650a042352eff041ae77e467f9 Mon Sep 17 00:00:00 2001 +From: Egbert Eich +Date: Mon, 20 Feb 2023 21:29:27 +0100 +Subject: [PATCH] pam_slurm: Initialize arrays and pass sizes + +PAM is security critical: +- clear arrays +- ensure strings are NULL-terminated. 
+ +Signed-off-by: Egbert Eich +Originally-from: Sebastian Krahmer +Signed-off-by: Egbert Eich +--- + contribs/pam/pam_slurm.c | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +diff --git a/contribs/pam/pam_slurm.c b/contribs/pam/pam_slurm.c +index a27e651548..eac9879c07 100644 +--- a/contribs/pam/pam_slurm.c ++++ b/contribs/pam/pam_slurm.c +@@ -279,9 +279,9 @@ static int + _gethostname_short (char *name, size_t len) + { + int error_code, name_len; +- char *dot_ptr, path_name[1024]; ++ char *dot_ptr, path_name[1024] = {0}; + +- error_code = gethostname(path_name, sizeof(path_name)); ++ error_code = gethostname(path_name, sizeof(path_name) - 1); + if (error_code) + return error_code; + +@@ -309,13 +309,13 @@ static int + _slurm_match_allocation(uid_t uid) + { + int authorized = 0, i; +- char hostname[HOST_NAME_MAX]; ++ char hostname[HOST_NAME_MAX] = {0}; + char *nodename = NULL; + job_info_msg_t * msg; + + slurm_init(NULL); + +- if (_gethostname_short(hostname, sizeof(hostname)) < 0) { ++ if (_gethostname_short(hostname, sizeof(hostname) - 1) < 0) { + _log_msg(LOG_ERR, "gethostname: %m"); + return 0; + } +@@ -438,7 +438,7 @@ _send_denial_msg(pam_handle_t *pamh, struct _options *opts, + */ + extern void libpam_slurm_init (void) + { +- char libslurmname[64]; ++ char libslurmname[64] = {0}; + + if (slurm_h) + return; +@@ -446,10 +446,10 @@ extern void libpam_slurm_init (void) + /* First try to use the same libslurm version ("libslurm.so.24.0.0"), + * Second try to match the major version number ("libslurm.so.24"), + * Otherwise use "libslurm.so" */ +- if (snprintf(libslurmname, sizeof(libslurmname), ++ if (snprintf(libslurmname, sizeof(libslurmname) - 1, + "libslurm.so.%d.%d.%d", SLURM_API_CURRENT, + SLURM_API_REVISION, SLURM_API_AGE) >= +- sizeof(libslurmname) ) { ++ sizeof(libslurmname) - 1) { + _log_msg (LOG_ERR, "Unable to write libslurmname\n"); + } else if ((slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + return; +@@ -458,8 
+458,10 @@ extern void libpam_slurm_init (void) + libslurmname, dlerror ()); + } + +- if (snprintf(libslurmname, sizeof(libslurmname), "libslurm.so.%d", +- SLURM_API_CURRENT) >= sizeof(libslurmname) ) { ++ memset(libslurmname, 0, sizeof(libslurmname)); ++ ++ if (snprintf(libslurmname, sizeof(libslurmname) - 1, "libslurm.so.%d", ++ SLURM_API_CURRENT) >= sizeof(libslurmname) - 1) { + _log_msg (LOG_ERR, "Unable to write libslurmname\n"); + } else if ((slurm_h = dlopen(libslurmname, RTLD_NOW|RTLD_GLOBAL))) { + return; +-- +2.42.1 + diff --git a/slurm-23.11.5.tar.bz2 b/slurm-23.11.5.tar.bz2 new file mode 100644 index 0000000..5cc1221 --- /dev/null +++ b/slurm-23.11.5.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8f4b1b46d3a8ec9a95066b04635c97f9095877f6189a8ff7388e5e74daeef3 +size 7365175 diff --git a/slurm-24.05.4.tar.bz2 b/slurm-24.05.4.tar.bz2 new file mode 100644 index 0000000..5a8f531 --- /dev/null +++ b/slurm-24.05.4.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:240a2105c8801bc0d222fa2bbcf46f71392ef94cce9253357e5f43f029adaf9b +size 7183430 diff --git a/slurm-24.11.0.tar.bz2 b/slurm-24.11.0.tar.bz2 new file mode 100644 index 0000000..4004c77 --- /dev/null +++ b/slurm-24.11.0.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ebeeeeb5d874e090b7f2629bd319bfe7c41510931ff2244f85e961bdc69056 +size 7254375 diff --git a/slurm-rpmlintrc b/slurm-rpmlintrc new file mode 100644 index 0000000..1003086 --- /dev/null +++ b/slurm-rpmlintrc @@ -0,0 +1,54 @@ +addFilter(".*obsolete-not-provided slurm-sched-wiki.*") +addFilter(".*obsolete-not-provided slurmdb-direct.*") + +# libslurm provides an ABI and a wire protocol. The wire protocol may change +# with any Slurm version in an incompatible way. The wire protocol is +# implemented in the library libslurm. +# Therefore, multiple versions of libslurm should not be installed on the +# same system. 
+# Thus, libraries depending on libslurm need to match the installed Slurm +# version - independent of their major versions. +# To host multiple Slurm versions in the same repository we add the version +# string to package names for none-base versions - even to library packages. +# This mainly applies to upgrade packages for Leap and SLE in the maintenance +# channel. +addFilter("libnss_slurm\d_\d{2}_\d{2}.*: E: shlib-policy-name-error.*") + +# Our logrotate file names are derived from the service name. +addFilter(".* (W|E): incoherent-logrotate-file /etc/logrotate.d/slurm.*\.conf") + +# libpmix2 is opened using dlopen() - no automatic dependency resolution possible +addFilter("slurm(|_.*)-plugins.x86_64: (W|E): explicit-lib-dependency libpmix2") + +# We need to build for more than one product. rpmlint tests differ between products +# Some may not trigger on all products. +addFilter("slurm.src: (W|E): unused-rpmlintrc-filter .*") + +# ? Why should we package the log dir? +addFilter(".*: E: logrotate-log-dir-not-packaged /var/log") +# these packages contain %service_del_postun_without_restart - +# which we define if it is not defined. This needs to be kept +# as long as we need to support the HPC module for SLE-12. +addFilter("slurm(|-node|-slurmdbd)\..*: systemd-service-without-service_del_postun .*") + +# Testsuite is not meant to be installed on a user system. +# The idea is to place the test suite into a package mostly unaltered +# to place it to a test rig without requiring to install the full Slurm +# sources. +# This is by intention - the test suite requires a test suite specific +# systemd unit file. +addFilter("slurm(|_.*)-testsuite.*: E: filelist-forbidden-systemd-userdirs") +# We want to give root a script to set up the test system +addFilter("slurm(|_.*)-testsuite.*: E: (suse-|)filelist-forbidden-fhs23 /root") +# Testsuite needs this as it builds test binaries. 
+addFilter("slurm(|_.*)-testsuite.*: devel-dependency libnuma-devel") +addFilter("slurm(|_.*)-testsuite.*: E: explicit-lib-dependency libnuma-devel") +# To reduce the amount of preparation, the test suite supplies all required +# settings. +addFilter("slurm(|_.*)-testsuite.*: sudoers-file-unauthorized .*") +# Testsuite needs to override default slurmd.service +addFilter("slurm(|_.*)-testsuite.x86_64: W: suse-filelist-forbidden-systemd-userdirs /etc/systemd/system/slurmd.service") +# ditto +addFilter("slurm(|_.*)-testsuite.x86_64: W: systemd-unit-in-etc /etc/systemd/system/slurmd.service") +# No lib dependency - test suite needs devel package to compile tests +addFilter("slurm-testsuite.x86_64: W: explicit-lib-dependency .*") diff --git a/slurm.changes b/slurm.changes new file mode 100644 index 0000000..3bda40c --- /dev/null +++ b/slurm.changes @@ -0,0 +1,5377 @@ +------------------------------------------------------------------- +Mon Jan 6 12:40:31 UTC 2025 - Egbert Eich + +- Update to version 24.11 + * `slurmctld` - Reject arbitrary distribution jobs that do not + specify a task count. + * Fix backwards compatibility of the `RESPONSE_JOB_INFO RPC` + (used by `squeue`, `scontrol show job`, etc.) with Slurm clients + version 24.05 and below. This was a regression in 24.11.0rc1. + * Do not let `slurmctld`/`slurmd` start if there are more nodes + defined in `slurm.conf` than the maximum supported amount + (64k nodes). + * `slurmctld` - Set job's exit code to 1 when a job fails with + state `JOB_NODE_FAIL`. This fixes `sbatch --wait` not being able + to exit with error code when a job fails for this reason in + some cases. + * Fix certain reservation updates requested from 23.02 clients. + * `slurmrestd` - Fix populating non-required object fields of + objects as `{}` in JSON/YAML instead of `null` causing compiled + OpenAPI clients to reject the response to + `GET /slurm/v0.0.40/jobs` due to validation failure of + `.jobs[].job_resources`. 
+ * Fix issue where older versions of Slurm talking to a 24.11 dbd + could lose step accounting. + * Fix minor memory leaks. + * Fix bad memory reference when `xstrchr` fails to find char. + * Remove duplicate checks for a data structure. + * Fix race condition in `stepmgr` step completion handling. + * `slurm.spec` - add ability to specify patches to apply on the + command line. + * `slurm.spec` - add ability to supply extra version information. + * Fix 24.11 HA issues. + * Fix requeued jobs keeping their priority until the decay thread + happens. + * Fix potential memory corruption in `select/cons_tres` plugin. + * Avoid cache coherency issue on non-x86 platforms that could + result in a POSIX signal being ignored or an abort(). + * `slurmctld` - Remove assertion in development builds that would + trigger if an outdated client attempted to connect. + * `slurmd` - Wait for `PrologEpilogTimeout` on reconfigure for + prologs to finish. This avoids a situation where the slurmd + never detects that the prolog completed. + * `job_container/tmpfs` - Setup x11 forwarding within the namespace. + * `slurmctld` - fix memory leak when sending a `DBD_JOB_START` + message. + * Fix issue with accounting rollup dealing with association tables. + * Fix minor memory leaks. + * Fix potential thread safety issues. + * Init mutex in burst_buffer plugins. + * `slurmdbd` - don't log errors when no changes occur from db + requests. + * `slurmctld`,`slurmd` - Avoid deadlock during reconfigure if too + many POSIX signals are received. + * Improve error type logged from partial or incomplete reading + from socket or pipe to avoid potentially logging an error from + a previous syscall. + * `slurmrestd` - Improve the handling of queries when unable to + connect to slurmdbd by providing responses when possible. + * `slurmrestd`,`sackd`,`scrun` - Avoid rare hangs related to I/O. + * `scrun` - Add support for the `--all` argument for kill subcommand. + * Remove `srun --cpu-bind=rank`. 
+ * Add `resource_spec/cpus` and `resource_spec/memory` entry + points in data_parser to print the `CpuSpecList` and + `MemSpecLimit` in `sinfo --json`. + * `sinfo` - Add `.sinfo[].resource_spec.cpus` and + `.sinfo[].resource_spec.memory` fields to print the `CpuSpecList` + and `MemSpecLimit` dumped by `sinfo --{json|yaml}`. + * Increase efficiency of sending logs to syslog. + * Switch to new official YAML mime type `application/yaml` in + compliance with RFC9512 as primary mime type for YAML formatting. + * `slurmrestd` - Removed deprecated fields from the following + endpoints: + `.result` from `POST /slurm/v0.0.42/job/submit`. + `.job_id`, `.step_id`, `.job_submit_user_msg` from `POST /slurm/v0.0.42/job/{job_id}`. + `.job.exclusive`, `.jobs[].exclusive` to `POST /slurm/v0.0.42/job/submit`. + `.jobs[].exclusive` from `GET /slurm/v0.0.42/job/{job_id}`. + `.jobs[].exclusive` from `GET /slurm/v0.0.42/jobs`. + `.job.oversubscribe`, `.jobs[].oversubscribe` to `POST /slurm/v0.0.42/job/submit`. + `.jobs[].oversubscribe` from `GET /slurm/v0.0.42/job/{job_id}`. + `.jobs[].oversubscribe` from `GET /slurm/v0.0.42/jobs`. + * `scontrol` - Removed deprecated fields `.jobs[].exclusive` and + `.jobs[].oversubscribe` from `scontrol show jobs --{json|yaml}`. + * `squeue` - Removed deprecated fields `.jobs[].exclusive` and + `.jobs[].oversubscribe` from `squeue --{json|yaml}`. + * Improve the way to run external commands and fork processes to + avoid non-async-signal safe calls between a fork and an exec. + We fork ourselves now and execute the commands in a safe + environment. This includes spank prolog/epilog executions. + * Improve `MaxMemPerCPU` enforcement when exclusive jobs request + per node memory and the partition has heterogeneous nodes. + * Remove a `TOCTOU` where multiple steps requesting an energy + reading at the same time could cause too frequent accesses + to the drivers. + * Limit `SwitchName` to `HOST_NAME_MAX` chars length. 
+ * For `scancel --ctld` and the following rest api endpoints: + `DELETE /slurm/v0.0.40/jobs` + `DELETE /slurm/v0.0.41/jobs` + `DELETE /slurm/v0.0.42/jobs` + Support array expressions in the responses to the client. + * `salloc` - Always output node names to the user when an + allocation is granted. + * `slurmrestd` - Removed all v0.0.39 endpoints. + * `select/linear` - Reject jobs asking for GRES per + `job|socket|task` or `cpus|mem` per GRES. + * Add `/nodes` POST endpoint to REST API, supports multiple + node update whereas previously only single nodes could be + updated through `/node/` endpoint: + `POST /slurm/v0.0.42/nodes` + * Do not allow changing or setting `PreemptMode=GANG` to a + partition as this is a cluster-wide option. + * Add `%b` as a file name pattern for the array task id modulo 10. + * Skip packing empty nodes when they are hidden during + `REQUEST_NODE_INFO RPC`. + * `accounting_storage/mysql` - Avoid a fatal condition when + the db server is not reachable. + * Always lay out steps cyclically on nodes in an allocation. + * `squeue` - add priority by partition + (`.jobs[].priority_by_partition`) to JSON and YAML output. + * `slurmrestd` - Add clarification to `failed to open slurmdbd + connection` error if the error was the result of an + authentication failure. + * Make it so `slurmctld` responds to RPCs that have authentication + errors with the `SLURM_PROTOCOL_AUTHENTICATION_ERROR` error + code. 
+ * `openapi/slurmctld` - Display the correct error code instead + of `Unspecified error` if querying the following endpoints + fails: + `GET /slurm/v0.0.40/diag/` + `GET /slurm/v0.0.41/diag/` + `GET /slurm/v0.0.42/diag/` + `GET /slurm/v0.0.40/licenses/` + `GET /slurm/v0.0.41/licenses/` + `GET /slurm/v0.0.42/licenses/` + `GET /slurm/v0.0.40/reconfigure` + `GET /slurm/v0.0.41/reconfigure` + `GET /slurm/v0.0.42/reconfigure` + * Fix how used CPUs are tracked in a job allocation to allow the + max number of concurrent steps to run at a time if threads per + core is greater than 1. + * In existing allocations SLURM_GPUS_PER_NODE environment + variable will be ignored by srun if `--gpus` is specified. + * When using `--get-user-env` explicitly or implicitly, check + if PID or mnt namespaces are disabled and fall back to old + logic that does not rely on them when they are not available. + * Removed non-functional option `SLURM_PROLOG_CPU_MASK` from + `TaskProlog` which was used to reset the affinity of a task + based on the mask given. + * `slurmrestd` - Support passing of `-d latest` to load latest + version of `data_parser` plugin. + * `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`,`sshare` + - Change response to `--json=list` or `--yaml=list` to send + list of plugins to stdout and descriptive header to stderr to + allow for easier parsing. + * `slurmrestd` - Change response to `-d list`, `-a list` or + `-s list` to send list of plugins to stdout and descriptive + header to stderr to allow for easier parsing. + * `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`, + `sshare`,`slurmrestd` - Avoid crash when loading `data_parser` + plugins fail due to NULL dereference. + * Add autodetected GPUs to the output of `slurmd -C` + * Remove `burst_buffer/lua` call `slurm.job_info_to_string()`. + * Add `SchedulerParameters=bf_allow_magnetic_slot` option. It + allows jobs in magnetic reservations to be planned by backfill + scheduler. 
+ * `slurmrestd` - Refuse to run as root, `SlurmUser`, and + `nobody(99)`. + * `openapi/slurmctld` - Revert regression that caused signaling + jobs to cancel entire job arrays instead of job array tasks: + `DELETE /slurm/v0.0.40/{job_id}` + `DELETE /slurm/v0.0.41/{job_id}` + `DELETE /slurm/v0.0.42/{job_id}` + * `openapi/slurmctld` - Support more formats for `{job_id}` + including job steps: + `DELETE /slurm/v0.0.40/{job_id}` + `DELETE /slurm/v0.0.41/{job_id}` + `DELETE /slurm/v0.0.42/{job_id}` + * Alter scheduling of jobs at submission time to consider job + submission time and job id. This makes it so that + interactive jobs aren't allocated resources before batch jobs + when they have the same priority at submit time. + * Fix multi-cluster submissions with differing Switch plugins. + * `slurmrestd` - Change `+prefer_refs` flag to default in + `data_parser/v0.0.42` plugin. Add `+minimize_refs` flag to + inline single referenced schemas in the OpenAPI schema. This + sets the default OpenAPI schema generation behavior of + `data_parser/v0.0.42` to match v0.0.41 `+prefer_refs` and + v0.0.40 (without flags). + * Fix `LaunchParameters=batch_step_set_cpu_freq`. + * Clearer `seff` warning message for running jobs. + * `data_parser/v0.0.42` - Rename `JOB_INFO` field + `minimum_switches` to `required_switches` to reflect the + actual behavior. + * `data_parser/v0.0.42` - Rename `ACCOUNT_CONDITION` field + `assocation` to `association` to fix typo. + * `cgroup/v2` - fix cgroup cleanup when running inside a + container without write permissions to `/sys/fs/cgroup`. + * `cgroup/v2` - fix accounting of swap events detection. + * Fix gathering MaxRSS for jobs that run shorter than two + `jobacctgather` intervals. Get the metrics from cgroups + `memory.peak` or `memory.max_usage_in_bytes` where available. 
+ * `openapi/slurmctld` - Set complex number support for the + following fields: + `.shares[][].fairshare.factor` + `.shares[][].fairshare.level` + for endpoints: + `GET /slurm/v0.0.42/shares` + and for commands: + `sshare --json` + `sshare --yaml` + * `data_parser/v0.0.42` - Avoid dumping `Infinity` for `NO_VAL` + tagged `number` fields. + * Add `TopologyParam=TopoMaxSizeUnroll=#` to allow + `--nodes=-` for `topology/block`. + * `sacct` - Respect `--noheader` for `--batch-script` and + `--env-vars`. + * `sacct` - Remove extra newline in output from `--batch-script` + and `--env-vars`. + * Add `sacctmgr ping` command to query status of `slurmdbd`. + * Generate an error message when a `NodeSet` name conflicts with + a `NodeName`, and prevent the controller from starting if such + a conflict exists. + * `slurmd` - properly detect slurmd restarts in the energy + gathering logic which caused bad numbers in accounting. + * `sackd` - retry fetching slurm configs indefinitely in + configless mode. + * `job_submit/lua` - Add `assoc_qos` attribute to `job_desc` + to display all potential QOS's for a job's association. + * `job_submit/lua` - Add `slurm.get_qos_priority()` function + to retrieve the given QOS's priority. + * `sbcast` - Add `--nodelist` option to specify where files are + transmitted to. + * `sbcast` - Add `--no-allocation` option to transmit files to + nodes outside of a job allocation + * Add `DataParserParameters` `slurm.conf` parameter to allow + setting default value for CLI `--json` and `--yaml` arguments. + * `seff` - improve step's max memory consumption report by using + `TresUsageInTot` and `TresUsageInAve` instead of overestimating + the values. + * Enable RPC queueing for `REQUEST_KILL_JOBS`, which is used when + `scancel` is executed with `--ctld` flag. + * `slurmdbd` - Add `-u` option. This is used to determine if + restarting the DBD will result in database conversion. + * Fix `srun` inside an `salloc` in a federated cluster when using + IPv6. 
+ * Calculate the forwarding timeouts according to tree depth + rather than node count / tree width for each level. Fixes race + conditions with same timeouts between two consecutive node + levels. + * Add ability to submit jobs with multiple QOS. + * Fix difference in behavior when swapping partition order in job + submission. + * Improve `PLANNED` state detection for mixed nodes and updating + state before yielding backfill locks. + * Always consider partition priority tiers when deciding to try + scheduling jobs on submit. + * Prevent starting jobs without reservations on submit when there + are pending jobs with reservations that have flags `FLEX` or + `ANY_NODES` that can be scheduled on overlapping nodes. + * Prevent jobs that request both high and low priority tier + partitions from starting on submit in lower priority tier + partitions if it could delay pending jobs in higher priority + tier partitions. + * `scontrol` - Wait for `slurmctld` to start reconfigure in + foreground mode before returning. + * Improve reconfigure handling on Linux to only close open file + descriptors to avoid long delays on systems with large + `RLIMIT_NOFILE` settings. + * `salloc` - Removed `--get-user-env` option. + * Removed the instant on feature from `switch/hpe_slingshot`. + * Hardware collectives in `switch/hpe_slingshot` now requires + `enable_stepmgr`. + * Allow backfill to plan jobs on nodes currently being used by + exclusive user or mcs jobs. + * Avoid miscaching IPv6 address to hostname lookups that could + have caused logs to have the incorrect hostname. + * `scontrol` - Add `--json`/`--yaml` support to `listpids` + * `scontrol` - Add `liststeps` + * `scontrol` - Add `listjobs` + * `slurmrestd` - Avoid connection to slurmdbd for the following + endpoints: + `GET /slurm/v0.0.42/jobs` + `GET /slurm/v0.0.42/job/{job_id}` + * `slurmctld` - Changed incoming RPC handling to dedicated thread + pool. 
+ * `job_container/tmpfs` - Add `EntireStepInNS` option that will + place the `slurmstepd` process within the constructed namespace + directly. + * `scontrol show topo` - Show aggregated block sizes when using + `topology/block`. + * `slurmrestd` - Add more descriptive HTTP status for + authentication failure and connectivity errors with controller. + * `slurmrestd` - Improve reporting errors from `slurmctld` for + job queries: + `GET /slurm/v0.0.41/{job_id}` + `GET /slurm/v0.0.41/jobs/` + * Avoid rejecting a step request that needs fewer GRES than nodes + in the job allocation. + * `slurmrestd` - Tag the never populated `.jobs[].pid` field as + deprecated for the following endpoints: + `GET /slurm/v0.0.42/{job_id}` + `GET /slurm/v0.0.42/jobs/` + * `scontrol`,`squeue` - Tag the never populated `.jobs[].pid` field + as deprecated for the following: + `scontrol show jobs --json` + `scontrol show jobs --yaml` + `scontrol show job ${JOB_ID} --json` + `scontrol show job ${JOB_ID} --yaml` + `squeue --json` + `squeue --yaml` + * `data_parser` v0.0.42 - fix timestamp parsing regression + introduced in v0.0.40 (eaf3b6631f), parsing of non-ISO 8601 + style timestamps. + * `cgroup/v2` will detect some special container and namespaced + setups and will work with it. + * Support IPv6 in configless mode. + * Add `SlurmctldParameters=ignore_constraint_validation` to ignore + `constraint/feature` validation at submission.
+ * `slurmrestd` - Set `.pings[].mode` field as deprecated in the + following endpoints: + `GET /slurm/v0.0.42/ping` + * `scontrol` - Set `.pings[].mode` field as deprecated in the + following commands: + `scontrol ping --json` + `scontrol ping --yaml` + * `slurmrestd` - Set `.pings[].pinged` field as deprecated in + the following endpoints: + `GET /slurm/v0.0.42/ping` + * `scontrol` - Set `.pings[].pinged` field as deprecated in the + following commands: + `scontrol ping --json` + `scontrol ping --yaml` + * `slurmrestd` - Add `.pings[].primary` field to the following + endpoints: + `GET /slurm/v0.0.42/ping` + * `scontrol` - Add `.pings[].primary` field to the following + commands: + `scontrol ping --json` + `scontrol ping --yaml` + * `slurmrestd` - Add `.pings[].responding` field to the following + endpoints: + `GET /slurm/v0.0.42/ping` + * `scontrol` - Add `.pings[].responding` field to the following + commands: + `scontrol ping --json` + `scontrol ping --yaml` + * Prevent jobs without reservations from delaying jobs in + reservations with flags `FLEX` or `ANY_NODES` in the main + scheduler. + * Fix allowing to ask for multiple different types of TRES + when one of them has a value of 0. + * `slurmctld` - Add a grace period to ensure the agent retry + queue is properly flushed during shutdown. + * Don't ship `src/slurmrestd/plugins/openapi/slurmdbd/openapi.json`; + `slurmrestd` should always be used to generate a new OpenAPI + schema (aka openapi.json or openapi.yaml). + * `mpi/pmix` - Fix potential deadlock and races with het jobs, + and fix potential memory and FDs leaks. + * Fix jobs with `--gpus` being rejected in some edge cases for + partitions where not all nodes have the same amount of GPUs + and CPUs configured. + * In an extra constraints expression in a job request, do not + allow an empty string for a key or value. + * In an extra constraints expression in a job request, fix + validation that requests are separated by boolean operators.
+ * Add `TaskPluginParam=OOMKillStep` to kill the step as a whole + when one task OOMs. + * Fix `scontrol` show conf not showing all `TaskPluginParam` + elements. + * `slurmrestd` - Add fields `.job.oom_kill_step` + `.jobs[].oom_kill_step` to `POST /slurm/v0.0.42/job/submit` + and `POST /slurm/v0.0.42/job/allocate`. + * Improve performance for `_will_run_test()`. + * Add `SchedulerParameters=bf_topopt_enable` option to enable + experimental hook to control backfill. + * If a step fails to launch under certain conditions, set the + step's state to `NODE_FAIL`. + * `sched/backfill` - Fix certain situations where a job would + not get a planned time, which could lead to it being delayed + by lower priority jobs. + * `slurmrestd` - Dump JSON `null` instead of `{}` (empty object) + for non-required fields in objects to avoid client + compatibility issues for v0.0.42 version tagged endpoints. + * `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`, + `sshare` - Dump `null` instead of `{}` (empty object) for + non-required fields in objects to avoid client compatibility + issues when run with `--json` or `--yaml`. + +------------------------------------------------------------------- +Fri Nov 1 12:50:27 UTC 2024 - Egbert Eich + +- Update to version 24.05.4 & fix for CVE-2024-48936. + * Fix generic int sort functions. + * Fix user look up using possible unrealized uid in the dbd. + * `slurmrestd` - Fix regressions that allowed `slurmrestd` to + be run as SlurmUser when `SlurmUser` was not root. + * mpi/pmix fix race conditions with het jobs at step start/end + which could make srun hang. + * Fix not showing some `SelectTypeParameters` in `scontrol show + config`. + * Avoid assert when dumping removed certain fields in JSON/YAML. + * Improve how shards are scheduled with affinity in mind. + * Fix `MaxJobsAccruePU` not being respected when `MaxJobsAccruePA` + is set in the same QOS.
+ * Prevent backfill from planning jobs that use overlapping + resources for the same time slot if the job's time limit is + less than `bf_resolution`. + * Fix memory leak when requesting typed gres and + `--[cpus|mem]-per-gpu`. + * Prevent backfill from breaking out due to "system state + changed" every 30 seconds if reservations use `REPLACE` or + `REPLACE_DOWN` flags. + * `slurmrestd` - Make sure that scheduler_unset parameter defaults + to true even when the following flags are also set: + `show_duplicates`, `skip_steps`, `disable_truncate_usage_time`, + `run_away_jobs`, `whole_hetjob`, `disable_whole_hetjob`, + `disable_wait_for_result`, `usage_time_as_submit_time`, + `show_batch_script`, and or `show_job_environment`. Additionally, + always make sure show_duplicates and + `disable_truncate_usage_time` default to true when the following + flags are also set: `scheduler_unset`, `scheduled_on_submit`, + `scheduled_by_main`, `scheduled_by_backfill`, and or `job_started`. + This affects the following endpoints: + `GET /slurmdb/v0.0.40/jobs` + `GET /slurmdb/v0.0.41/jobs` + * Ignore `--json` and `--yaml` options for `scontrol` show config + to prevent mixing output types. + * Fix not considering nodes in reservations with Maintenance or + Overlap flags when creating new reservations with `nodecnt` or + when they replace down nodes. + * Fix suspending/resuming steps running under a 23.02 `slurmstepd` + process. + * Fix options like `sprio --me` and `squeue --me` for users with + a uid greater than 2147483647. + * `fatal()` if `BlockSizes=0`. This value is invalid and would + otherwise cause the `slurmctld` to crash. + * `sacctmgr` - Fix issue where clearing out a preemption list using + `preempt=''` would cause the given qos to no longer be preempt-able + until set again. + * Fix `stepmgr` creating job steps concurrently. + * `data_parser/v0.0.40` - Avoid dumping "Infinity" for `NO_VAL` tagged + "number" fields.
+ * `data_parser/v0.0.41` - Avoid dumping "Infinity" for `NO_VAL` tagged + "number" fields. + * `slurmctld` - Fix a potential leak while updating a reservation. + * `slurmctld` - Fix state save with reservation flags when a update + fails. + * Fix reservation update issues with parameters Accounts and Users, when + using +/- signs. + * `slurmrestd` - Don't dump warning on empty wckeys in: + `GET /slurmdb/v0.0.40/config` + `GET /slurmdb/v0.0.41/config` + * Fix slurmd possibly leaving zombie processes on start up in configless + when the initial attempt to fetch the config fails. + * Fix crash when trying to drain a non-existing node (possibly deleted + before). + * `slurmctld` - fix segfault when calculating limit decay for jobs with + an invalid association. + * Fix IPMI energy gathering with multiple sensors. + * `data_parser/v0.0.39` - Remove xassert requiring errors and warnings + to have a source string. + * `slurmrestd` - Prevent potential segfault when there is an error + parsing an array field which could lead to a double xfree. This + applies to several endpoints in `data_parser` v0.0.39, v0.0.40 and + v0.0.41. + * `scancel` - Fix a regression from 23.11.6 where using both the + `--ctld` and `--sibling` options would cancel the federated job on + all clusters instead of only the cluster(s) specified by `--sibling`. + * `accounting_storage/mysql` - Fix bug when removing an association + specified with an empty partition. + * Fix setting multiple partition state restore on a job correctly. + * Fix difference in behavior when swapping partition order in job + submission. + * Fix security issue in stepmgr that could permit an attacker to + execute processes under other users' jobs. CVE-2024-48936. + +------------------------------------------------------------------- +Wed Oct 23 08:54:29 UTC 2024 - Egbert Eich + +- Add %{?sysusers_requires} to slurm-config. + This fixes issues when building against Slurm.
+ +------------------------------------------------------------------- +Mon Oct 14 10:40:10 UTC 2024 - Egbert Eich + +- Update to version 24.05.3 + * `data_parser/v0.0.40` - Added field descriptions. + * `slurmrestd` - Avoid creating new slurmdbd connection per request + to `* /slurm/slurmctld/*/*` endpoints. + * Fix compilation issue with `switch/hpe_slingshot` plugin. + * Fix gres per task allocation with threads-per-core. + * `data_parser/v0.0.41` - Added field descriptions. + * `slurmrestd` - Change back generated OpenAPI schema for + `DELETE /slurm/v0.0.40/jobs/` to `RequestBody` instead of using + parameters for request. `slurmrestd` will continue to accept endpoint + requests via `RequestBody` or HTTP query. + * `topology/tree` - Fix issues with switch distance optimization. + * Fix potential segfault of secondary `slurmctld` when falling back + to the primary when running with a `JobComp` plugin. + * Enable `--json`/`--yaml=v0.0.39` options on client commands to + dump data using data_parser/v0.0.39 instead of outputting nothing. + * `switch/hpe_slingshot` - Fix issue that could result in a 0 length + state file. + * Fix unnecessary message protocol downgrade for unregistered nodes. + * Fix unnecessarily packing alias addrs when terminating jobs with + a mix of non-cloud/dynamic nodes and powered down cloud/dynamic + nodes. + * `accounting_storage/mysql` - Fix issue when deleting a qos that + could remove too many commas from the qos and/or delta_qos fields + of the assoc table. + * `slurmctld` - Fix memory leak when using RestrictedCoresPerGPU. + * Fix allowing access to reservations without `MaxStartDelay` set. + * Fix regression introduced in 24.05.0rc1 breaking + `srun --send-libs` parsing. + * Fix slurmd vsize memory leak when using job submission/allocation + commands that implicitly or explicitly use --get-user-env. + * `slurmd` - Fix node going into invalid state when using + `CPUSpecList` and setting CPUs to the # of cores on a + multithreaded node.
+ * Fix reboot asap nodes being considered in backfill after a restart. + * Fix `--clusters`/`-M queries` for clusters outside of a + federation when `fed_display` is configured. + * Fix `scontrol` allowing updating job with bad `cpus-per-task` value. + * `sattach` - Fix regression from 24.05.2 security fix leading to + crash. + * `mpi/pmix` - Fix assertion when built under `--enable-debug`. +- Changes from Slurm 24.05.2 + * Fix energy gathering rpc counter underflow in + `_rpc_acct_gather_energy` when more than 10 threads try to get + energy at the same time. This prevented the possibility to get + energy from slurmd by any step until slurmd was restarted, + so losing energy accounting metrics in the node. + * `accounting_storage/mysql` - Fix issue where new user with `wckey` + did not have a default wckey sent to the slurmctld. + * `slurmrestd` - Prevent slurmrestd segfault when handling the + following endpoints when none of the optional parameters are + specified: + `DELETE /slurm/v0.0.40/jobs` + `DELETE /slurm/v0.0.41/jobs` + `GET /slurm/v0.0.40/shares` + `GET /slurm/v0.0.41/shares` + `GET /slurmdb/v0.0.40/instance` + `GET /slurmdb/v0.0.41/instance` + `GET /slurmdb/v0.0.40/instances` + `GET /slurmdb/v0.0.41/instances` + `POST /slurm/v0.0.40/job/{job_id}` + `POST /slurm/v0.0.41/job/{job_id}` + * Fix IPMI energy gathering when no IPMIPowerSensors are specified + in `acct_gather.conf`. This situation resulted in an accounted + energy of 0 for job steps. + * Fix a minor memory leak in slurmctld when updating a job dependency. + * `scontrol`,`squeue` - Fix regression that caused incorrect values + for multisocket nodes at `.jobs[].job_resources.nodes.allocation` + for `scontrol show jobs --(json|yaml)` and `squeue --(json|yaml)`. 
+ * `slurmrestd` - Fix regression that caused incorrect values for + multisocket nodes at `.jobs[].job_resources.nodes.allocation` to + be dumped with endpoints: + `GET /slurm/v0.0.41/job/{job_id}` + `GET /slurm/v0.0.41/jobs` + * `jobcomp/filetxt` - Fix truncation of job record lines > 1024 + characters. + * `switch/hpe_slingshot` - Drain node on failure to delete CXI + services. + * Fix a performance regression from 23.11.0 in CPU frequency + handling when no `CpuFreqDef` is defined. + * Fix one-task-per-sharing not working across multiple nodes. + * Fix inconsistent number of CPUs when creating a reservation + using the TRESPerNode option. + * `data_parser/v0.0.40+` - Fix job state parsing which could + break filtering. + * Prevent `cpus-per-task` to be modified in jobs where a `-c` + value has been explicitly specified and the requested memory + constraints implicitly increase the number of CPUs to allocate. + * `slurmrestd` - Fix regression where args `-s v0.0.39,dbv0.0.39` + and `-d v0.0.39` would result in `GET /openapi/v3` not + registering as a valid possible query resulting in 404 errors. + * `slurmrestd` - Fix memory leak for dbv0.0.39 jobs query which + occurred if the query parameters specified account, association, + cluster, constraints, format, groups, job_name, partition, qos, + reason, reservation, state, users, or wckey. This affects the + following endpoints: + `GET /slurmdb/v0.0.39/jobs` + * `slurmrestd` - In the case the slurmdbd does not respond to a + persistent connection init message, prevent the closed fd from + being used, and instead emit an error or warning depending on + if the connection was required. + * Fix 24.05.0 regression that caused the slurmdbd not to send back + an error message if there is an error initializing a persistent + connection. + * Reduce latency of forwarded x11 packets. + * Add `curr_dependency` (representing the current dependency of + the job). 
+ and `orig_dependency` (representing the original requested + dependency of the job) fields to the job record in + `job_submit.lua` (for job update) and `jobcomp.lua`. + * Fix potential segfault of slurmctld configured with + `SlurmctldParameters=enable_rpc_queue` from happening on + reconfigure. + * Fix potential segfault of slurmctld on its shutdown when rate + limiting is enabled. + * `slurmrestd` - Fix missing job environment for `SLURM_JOB_NAME`, + `SLURM_OPEN_MODE`, `SLURM_JOB_DEPENDENCY`, `SLURM_PROFILE`, + `SLURM_ACCTG_FREQ`, `SLURM_NETWORK` and `SLURM_CPU_FREQ_REQ` to + match sbatch. + * Fix GRES environment variable indices being incorrect when only + using a subset of all GPUs on a node and the + `--gres-flags=allow-task-sharing` option. + * Prevent `scontrol` from segfaulting when requesting scontrol + show reservation `--json` or `--yaml` if there is an error + retrieving reservations from the `slurmctld`. + * `switch/hpe_slingshot` - Fix security issue around managing VNI + access. CVE-2024-42511. + * `switch/nvidia_imex` - Fix security issue managing IMEX channel + access. CVE-2024-42511. + * `switch/nvidia_imex` - Allow for compatibility with + `job_container/tmpfs`. +- Changes in Slurm 24.05.1 + * Fix `slurmctld` and `slurmdbd` potentially stopping instead of + performing a logrotate when receiving `SIGUSR2` when using + `auth/slurm`. + * `switch/hpe_slingshot` - Fix slurmctld crash when upgrading + from 23.02. + * Fix "Could not find group" errors from `validate_group()` when + using `AllowGroups` with large `/etc/group` files. + * Add `AccountingStoreFlags=no_stdio` which allows to not record + the stdio paths of the job when set. + * `slurmrestd` - Prevent a slurmrestd segfault when parsing the + `crontab` field, which was never usable.
Now it explicitly + ignores the value and emits a warning if it is used for the + following endpoints: + `POST /slurm/v0.0.39/job/{job_id}` + `POST /slurm/v0.0.39/job/submit` + `POST /slurm/v0.0.40/job/{job_id}` + `POST /slurm/v0.0.40/job/submit` + `POST /slurm/v0.0.41/job/{job_id}` + `POST /slurm/v0.0.41/job/submit` + `POST /slurm/v0.0.41/job/allocate` + * `mpi/pmi2` - Fix communication issue leading to task launch + failure with "`invalid kvs seq from node`". + * Fix getting user environment when using sbatch with + `--get-user-env` or `--export=` when there is a user profile + script that reads `/proc`. + * Prevent slurmd from crashing if `acct_gather_energy/gpu` is + configured but `GresTypes` is not configured. + * Do not log the following errors when `AcctGatherEnergyType` + plugins are used but a node does not have or cannot find sensors: + "`error: _get_joules_task: can't get info from slurmd`" + "`error: slurm_get_node_energy: Zero Bytes were transmitted or + received`" + However, the following error will continue to be logged: + "`error: Can't get energy data. No power sensors are available. + Try later`" + * `sbatch`, `srun` - Set `SLURM_NETWORK` environment variable if + `--network` is set. + * Fix cloud nodes not being able to forward to nodes that restarted + with new IP addresses. + * Fix cwd not being set correctly when running a SPANK plugin with a + `spank_user_init()` hook and the new "`contain_spank`" option set. + * `slurmctld` - Avoid deadlock during shutdown when `auth/slurm` + is active. + * Fix segfault in `slurmctld` with `topology/block`. + * `sacct` - Fix printing of job group for job steps. + * `scrun` - Log when an invalid environment variable causes the + job submission to be rejected. + * `accounting_storage/mysql` - Fix problem where listing or + modifying an association when specifying a qos list could hang + or take a very long time. + * `gpu/nvml` - Fix `gpuutil/gpumem` only tracking last GPU in step. 
+ Now, `gpuutil/gpumem` will record sums of all GPUS in the step. + * Fix error in `scrontab` jobs when using + `slurm.conf:PropagatePrioProcess=1`. + * Fix `slurmctld` crash on a batch job submission with + `--nodes 0,...`. + * Fix dynamic IP address fanout forwarding when using `auth/slurm`. + * Restrict listening sockets in the `mpi/pmix` plugin and `sattach` + to the `SrunPortRange`. + * `slurmrestd` - Limit mime types returned from query to + `GET /openapi/v3` to only return one mime type per serializer + plugin to fix issues with OpenAPI client generators that are + unable to handle multiple mime type aliases. + * Fix many commands possibly reporting an "`Unexpected Message + Received`" when in reality the connection timed out. + * Prevent slurmctld from starting if there is not a json + serializer present and the `extra_constraints` feature is enabled. + * Fix heterogeneous job components not being signaled with + `scancel --ctld` and `DELETE slurm/v0.0.40/jobs` if the job ids + are not explicitly given, the heterogeneous job components match + the given filters, and the heterogeneous job leader does not + match the given filters. + * Fix regression from 23.02 impeding job licenses from being cleared. + * Move error to `log_flag` which made `_get_joules_task` error to + be logged to the user when too many rpcs were queued in slurmd + for gathering energy. + * For `scancel --ctld` and the associated rest api endpoints: + `DELETE /slurm/v0.0.40/jobs` + `DELETE /slurm/v0.0.41/jobs` + Fix canceling the final array task in a job array when the task + is pending and all array tasks have been split into separate job + records. Previously this task was not canceled. + * Fix `power_save operation` after recovering from a failed + reconfigure. + * `slurmctld` - Skip removing the pidfile when running under + systemd. In that situation it is never created in the first place. 
+ * Fix issue where altering the flags on a Slurm account + (`UsersAreCoords`) several limits on the account's association + would be set to 0 in Slurm's internal cache. + * Fix memory leak in the controller when relaying `stepmgr` step + accounting to the dbd. + * Fix segfault when submitting stepmgr jobs within an existing + allocation. + * Added `disable_slurm_hydra_bootstrap` as a possible `MpiParams` + parameter in `slurm.conf`. Using this will disable env variable + injection to allocations for the following variables: + `I_MPI_HYDRA_BOOTSTRAP,` `I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS`, + `HYDRA_BOOTSTRAP`, `HYDRA_LAUNCHER_EXTRA_ARGS`. + * `scrun` - Delay shutdown until after start requested. + This caused `scrun` to never start or shutdown and hung forever + when using `--tty`. + * Fix backup `slurmctld` potentially not running the agent when + taking over as the primary controller. + * Fix primary controller not running the agent when a reconfigure + of the `slurmctld` fails. + * `slurmd` - fix premature timeout waiting for + `REQUEST_LAUNCH_PROLOG` with large array jobs causing node to + drain. + * `jobcomp/{elasticsearch,kafka}` - Avoid sending fields with + invalid date/time. + * `jobcomp/elasticsearch` - Fix `slurmctld` memory leak from + curl usage. + * `acct_gather_profile/influxdb` - Fix slurmstepd memory leak from + curl usage + * Fix 24.05.0 regression not deleting job hash dirs after + `MinJobAge`. + * Fix filtering arguments being ignored when using squeue `--json`. + * `switch/nvidia_imex` - Move setup call after `spank_init()` to + allow namespace manipulation within the SPANK plugin. + * `switch/nvidia_imex` - Skip plugin operation if + `nvidia-caps-imex-channels` device is not present rather than + preventing slurmd from starting. + * `switch/nvidia_imex` - Skip plugin operation if + `job_container/tmpfs` is configured due to incompatibility. + * `switch/nvidia_imex` - Remove any pre-existing channels when + `slurmd` starts. 
+ * `rpc_queue` - Add support for an optional `rpc_queue.yaml` + configuration file. + * `slurmrestd` - Add new +prefer_refs flag to `data_parser/v0.0.41` + plugin. This flag will avoid inlining single referenced schemas + in the OpenAPI schema. + +------------------------------------------------------------------- +Tue Jun 4 09:36:54 UTC 2024 - Christian Goll + +- Updated to new release 24.05.0 with following major changes + * Important Notes: + If using the slurmdbd (Slurm DataBase Daemon) you must update + this first. NOTE: If using a backup DBD you must start the + primary first to do any database conversion, the backup will not + start until this has happened. The 24.05 slurmdbd will work + with Slurm daemons of version 23.02 and above. You will not + need to update all clusters at the same time, but it is very + important to update slurmdbd first and having it running before + updating any other clusters making use of it. + * Highlights + + Federation - allow client command operation when slurmdbd is + unavailable. + + `burst_buffer/lua` - Added two new hooks: `slurm_bb_test_data_in` + and `slurm_bb_test_data_out`. The syntax and use of the new hooks + are documented in `etc/burst_buffer.lua.example`. These are + required to exist. slurmctld now checks on startup if the + `burst_buffer.lua` script loads and contains all required hooks; + `slurmctld` will exit with a fatal error if this is not + successful. Added `PollInterval` to `burst_buffer.conf`. Removed + the arbitrary limit of 512 copies of the script running + simultaneously. + + Add QOS limit `MaxTRESRunMinsPerAccount`. + + Add QOS limit `MaxTRESRunMinsPerUser`. + + Add `ELIGIBLE` environment variable to `jobcomp/script` plugin. + + Always use the QOS name for `SLURM_JOB_QOS` environment variables. + Previously the batch environment would use the description field, + which was usually equivalent to the name. + + `cgroup/v2` - Require dbus-1 version >= 1.11.16. 
+ + Allow `NodeSet` names to be used in SuspendExcNodes. + + `SuspendExcNodes=<nodes>:N` now counts allocated nodes in `N`. + The first `N` powered up nodes in `<nodes>` are protected from + being suspended. + + Store job output, input and error paths in `SlurmDBD`. + + Add `USER_DELETE` reservation flag to allow users with access + to a reservation to delete it. + + Add `SlurmctldParameters=enable_stepmgr` to enable step + management through the `slurmstepd` instead of the controller. + + Added `PrologFlags=RunInJob` to make prolog and epilog run + inside the job extern step to include it in the job's cgroup. + + Add ability to reserve MPI ports at the job level for stepmgr + jobs and subdivide them at the step level. + + `slurmrestd` - Add `--generate-openapi-spec` argument. + * Configuration File Changes (see appropriate man page for details) + + `CoreSpecPlugin` has been removed. + + Removed `TopologyPlugin` tree and dragonfly support from + `select/linear`. If those topology plugins are desired please + switch to `select/cons_tres`. + + Changed the default value for `UnkillableStepTimeout` to 60 + seconds or five times the value of `MessageTimeout`, whichever + is greater. + + An error log has been added if `JobAcctGatherParams` '`UsePss`' + or '`NoShare`' are configured with a plugin other than + `jobacct_gather/linux`. In such case these parameters are ignored. + + `helpers.conf` - Added `Flags=rebootless` parameter allowing + feature changes without rebooting compute nodes. + + `topology/block` - Replaced the `BlockLevels` with `BlockSizes` + in `topology.conf`. + + Add `contain_spank` option to `SlurmdParameters`. When set, + `spank_user_init()`, `spank_task_post_fork()`, and + `spank_task_exit()` will execute within the + `job_container/tmpfs` plugin namespace. + + Add `SlurmctldParameters=max_powered_nodes=N`, which prevents + powering up nodes after the max is reached. + + Add `ExclusiveTopo` to a partition definition in `slurm.conf`.
+ + Add `AccountingStorageParameters=max_step_records` to limit how + many steps are recorded in the database for each job - excluding + batch. + * Command Changes (see man pages for details) + + Add support for "elevenses" as an additional time specification. + + Add support for `sbcast --preserve` when `job_container/tmpfs` + configured (previously documented as unsupported). + + `scontrol` - Add new subcommand `power` for node power control. + + `squeue` - Adjust `StdErr`, `StdOut`, and `StdIn` output formats. + These will now consistently print "`(null)`" if a value is + unavailable. `StdErr` will no longer display `StdOut` if it is + not distinctly set. `StdOut` will now correctly display the + default filename pattern for job arrays, and no longer show it + for non-batch jobs. However, the expansion patterns will + no longer be substituted by default. + + Add `--segment` to job allocation to be used in topology/block. + + Add `--exclusive=topo` for use with topology/block. + + `squeue` - Add `--expand-patterns` option to expand `StdErr`, + `StdOut`, `StdIn` filename patterns as best as possible. + + `sacct` - Add `--expand-patterns` option to expand `StdErr`, + `StdOut`, `StdIn` filename patterns as best as possible. + + `sreport` - Requesting `format=Planned` will now return the + expected `Planned` time as documented, instead of `PlannedDown`. + To request `Planned Down`, one must use now `format=PLNDDown` + or `format=PlannedDown` explicitly. The abbreviations + "`Pl`" or "`Pla`" will now make reference to Planned instead + of `PlannedDown`. + * API Changes + + Removed `ListIterator` type from `<slurm/slurm.h>`. + + Removed `slurm_xlate_job_id()` from `<slurm/slurm.h>` + * SLURMRESTD Changes + + `openapi/dbv0.0.38` and `openapi/v0.0.38` plugins have been + removed. + + `openapi/dbv0.0.39` and `openapi/v0.0.39` plugins have been + tagged as deprecated to warn of their removal in the next release. + + Changed `slurmrestd.service` to only listen on TCP socket by + default.
Environments with existing drop-in units for the + service may need further adjustments to work after upgrading. + + `slurmrestd` - Tagged `script` field as deprecated in + `POST /slurm/v0.0.41/job/submit` in anticipation of removal in + future OpenAPI plugin versions. Job submissions should set the + `job.script` (or `jobs[0].script` for HetJobs) fields instead. + + `slurmrestd` - Attempt to automatically convert enumerated + string arrays with incoming non-string values into strings. + Add warning when incoming value for enumerated string arrays + can not be converted to string and silently ignore instead of + rejecting entire request. This change affects any endpoint that + uses an enumerated string as given in the OpenAPI specification. + An example of this conversion would be to + `POST /slurm/v0.0.41/job/submit` with `.job.exclusive = true`. + While the JSON (boolean) true value matches a possible + enumeration, it is not the expected "true" string. This change + automatically converts the (boolean) `true` to (string) "`true`" + avoiding a parsing failure. + + `slurmrestd` - Add `POST /slurm/v0.0.41/job/allocate` endpoint. + This endpoint will create a new job allocation without any steps. + The allocation will need to be ended via signaling the job or + it will run to the timelimit. + + `slurmrestd` - Allow startup when `slurmdbd` is not configured + and avoid loading `slurmdbd` specific plugins. + * MPI/PMI2 Changes + + Jobs submitted with the `SLURM_HOSTFILE` environment variable + set implies using an arbitrary distribution. Nevertheless, the + logic used in PMI2 when generating their associated + `PMI_process_mapping` values has been changed and will now be + the same used for the plane distribution, as if `-m plane` were + used.
This has been changed because the original arbitrary + distribution implementation did not account for multiple + instances of the same host being present in `SLURM_HOSTFILE`, + providing an incorrect process mapping in such case. This + change also enables distributing tasks in blocks when using + arbitrary distribution, which was not the case before. This + only affects `mpi`/`pmi2` plugin. +- Removed Fix-test-21.41.patch as upstream test changed. +- Dropped package plugin-ext-sensors-rrd as the plugin module no + longer exists. + +------------------------------------------------------------------- +Mon Mar 25 15:16:44 UTC 2024 - Christian Goll + +- removed Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch + as incorporated upstream +- Changes in Slurm 23.02.5 + * Add the `JobId` to `debug()` messages indicating when + `cpus_per_task/mem_per_cpu` or `pn_min_cpus` are being + automatically adjusted. + * Fix regression in 23.02.2 that caused `slurmctld -R` to crash on + startup if a node features plugin is configured. + * Fix and prevent reoccurring reservations from overlapping. + * `job_container/tmpfs` - Avoid attempts to share `BasePath` + between nodes. + * Change the log message warning for rate limited users from + verbose to info. + * With `CR_Cpu_Memory`, fix node selection for jobs that request + gres and `--mem-per-cpu`. + * Fix a regression from 22.05.7 in which some jobs were allocated + too few nodes, thus overcommitting cpus to some tasks. + * Fix a job being stuck in the completing state if the job ends + while the primary controller is down or unresponsive and the + backup controller has not yet taken over. + * Fix `slurmctld` segfault when a node registers with a configured + `CpuSpecList` while slurmctld configuration has the node without + `CpuSpecList`. + * Fix cloud nodes getting stuck in `POWERED_DOWN+NO_RESPOND` state + after not registering by `ResumeTimeout`.
+ * `slurmstepd` - Avoid cleanup of `config.json`-less containers + spooldir getting skipped. + * `slurmstepd` - Cleanup per task generated environment for + containers in spooldir. + * Fix `scontrol segfault` when 'completing' command requested + repeatedly in interactive mode. + * Properly handle a race condition between `bind()` and `listen()` + calls in the network stack when running with `SrunPortRange` set. + * Federation - Fix revoked jobs being returned regardless of the + `-a`/`--all` option for privileged users. + * Federation - Fix canceling pending federated jobs from non-origin + clusters which could leave federated jobs orphaned from the origin + cluster. + * Fix sinfo segfault when printing multiple clusters with + `--noheader` option. + * Federation - fix clusters not syncing if clusters are added to + a federation before they have registered with the dbd. + * Change `pmi2` plugin to honor the `SrunPortRange` option. This + matches the new behavior of the pmix plugin in 23.02.0. Note that + neither of these plugins makes use of the "`MpiParams=ports=`" + option, and previously were only limited by the systems ephemeral + port range. + * `node_features/helpers` - Fix node selection for jobs requesting + changeable features with the '`|`' operator, which could prevent + jobs from running on some valid nodes. + * `node_features/helpers` - Fix inconsistent handling of '`&`' and + '`|`', where an AND'd feature was sometimes AND'd to all sets of + features instead of just the current set. E.g. "`foo|bar&baz`" was + interpreted as `{foo,baz}` or `{bar,baz}` instead of how it is + documented: "`{foo} or {bar,baz}`". + * Fix job accounting so that when a job is requeued its allocated + node count is cleared. After the requeue, sacct will correctly + show that the job has 0 `AllocNodes` while it is pending or if + it is canceled before restarting. 
+ * `sacct` - `AllocCPUS` now correctly shows 0 if a job has not yet + received an allocation or if the job was canceled before getting + one. + * Fix intel oneapi autodetect: detect the `/dev/dri/renderD[0-9]+` + gpus, and do not detect `/dev/dri/card[0-9]+`. + * Format batch, extern, interactive, and pending step ids into + strings that are human readable. + * Fix node selection for jobs that request `--gpus` and a number + of tasks fewer than gpus, which resulted in incorrectly rejecting + these jobs. + * Remove `MYSQL_OPT_RECONNECT` completely. + * Fix cloud nodes in `POWERING_UP` state disappearing (getting set + to `FUTURE`) when an `scontrol reconfigure` happens. + * `openapi/dbv0.0.39` - Avoid assert / segfault on missing + coordinators list. + * `slurmrestd` - Correct memory leak while parsing OpenAPI + specification templates with server overrides. + * `slurmrestd` - Reduce memory usage when printing out job CPU + frequency. + * Fix overwriting user node reason with system message. + * Remove `--uid` / `--gid` options from salloc and srun commands. + * Prevent deadlock when rpc_queue is enabled. + * `slurmrestd` - Correct OpenAPI specification generation bug where + fields with overlapping parent paths would not get generated. + * Fix memory leak as a result of a partition info query. + * Fix memory leak as a result of a job info query. + * slurmrestd - For `GET /slurm/v0.0.39/node[s]`, change format of + node's energy field `current_watts` to a dictionary to account + for unset value instead of dumping `4294967294`. + * `slurmrestd` - For `GET /slurm/v0.0.39/qos`, change format of + QOS's field `priority` to a dictionary to account for unset + value instead of dumping `4294967294`. + * `slurmrestd` - For `GET /slurm/v0.0.39/job[s]`, the `return code` + code field in `v0.0.39_job_exit_code` will be set to 127 instead + of being left unset where job does not have a relevant return code. 
+ * `data_parser/v0.0.39` - Add `required/memory_per_cpu` and + required/memory_per_node to `sacct --json` and `sacct --yaml` and + `GET /slurmdb/v0.0.39/jobs` from `slurmrestd`. + * For step allocations, fix `--gres=none` sometimes not ignoring + gres from the job. + * Fix `--exclusive` jobs incorrectly gang-scheduling where they + shouldn't. + * Fix allocations with `CR_SOCKET`, gres not assigned to a specific + socket, and block core distribution potentially allocating more + sockets than required. + * `gpu/oneapi` - Store cores correctly so CPU affinity is tracked. + * Revert a change in 23.02.3 where Slurm would kill a script's + process group as soon as the script ended instead of waiting as + long as any process in + that process group held the stdout/stderr file descriptors open. + That change broke some scripts that relied on the previous + behavior. Setting time limits for scripts (such as + `PrologEpilogTimeout`) is strongly encouraged to avoid Slurm + waiting indefinitely for scripts to finish. + * Allow slurmdbd -R to work if the root assoc id is not 1. + * Fix `slurmdbd -R` not returning an error under certain conditions. + * `slurmdbd` - Avoid potential NULL pointer dereference in the + mysql plugin. + * Revert a change in 23.02 where `SLURM_NTASKS` was no longer + set in the job's environment when `--ntasks-per-node` was + requested. + * Limit periodic node registrations to 50 instead of the full + `TreeWidth`. + Since unresolvable `cloud/dynamic` nodes must disable fanout by + setting `TreeWidth` to a large number, this would cause all nodes + to register at once. + * Fix regression in 23.02.3 which broke x11 forwarding for hosts + when `MUNGE` sends a localhost address in the encode host field. + This is caused when the node hostname is mapped to 127.0.0.1 + (or similar) in `/etc/hosts`. + * `openapi/[db]v0.0.39` - fix memory leak on parsing error. + * `data_parser/v0.0.39` - fix updating qos for associations.
+ * `openapi/dbv0.0.39` - fix updating values for associations with + null users. + * Fix minor memory leak with `--tres-per-task` and licenses. + * Fix cyclic socket cpu distribution for tasks in a step where + `--cpus-per-task` < usable threads per core. +- Changes in Slurm 23.02.4 + * Fix `sbatch` return code when --wait is requested on a job array. + * `switch/hpe_slingshot` - avoid segfault when running with old + libcxi. + * Avoid slurmctld segfault when specifying + `AccountingStorageExternalHost`. + * Fix collected `GPUUtilization` values for `acct_gather_profile` + plugins. + * Fix slurmrestd handling of job hold/release operations. + * Make spank `S_JOB_ARGV` item value hold the requested command + argv instead of the srun `--bcast` value when `--bcast` requested + (only in local context). + * Fix step running indefinitely when slurmctld takes more than + `MessageTimeout` to respond. Now, `slurmctld` will cancel the + step when detected, preventing following steps from getting stuck + waiting for resources to be released. + * Fix regression to make job_desc.min_cpus accurate again in + job_submit when requesting a job with `--ntasks-per-node`. + * `scontrol` - Permit changes to `StdErr` and `StdIn` for pending + jobs. + * `scontrol` - Reset std{err,in,out} when set to empty string. + * `slurmrestd` - mark environment as a required field for job + submission descriptions. + * `slurmrestd` - avoid dumping null in OpenAPI schema required + fields. + `data_parser/v0.0.39` - avoid rejecting valid `memory_per_node` + formatted as dictionary provided with a job description. + * `data_parser/v0.0.39` - avoid rejecting valid `memory_per_cpu` + formatted as dictionary provided with a job description. + * `slurmrestd` - Return HTTP error code 404 when job query fails. + * `slurmrestd` - Add return schema to error response to job and + license query. + * Fix handling of ArrayTaskThrottle in backfill. 
+ * Fix regression in 23.02.2 when checking gres state on `slurmctld` + startup or reconfigure. Gres changes in the configuration were + not updated on `slurmctld` startup. On startup or reconfigure, + these messages were present in the log: + "`error: Attempt to change gres/gpu Count`". + * Fix potential double count of gres when dealing with limits. + * `switch/hpe_slingshot` - support alternate traffic class names + with "`TC_`" prefix. + * `scrontab` - Fix cutting off the final character of quoted + variables. + * Fix `slurmstepd` segfault when `ContainerPath` is not set in + `oci.conf`. + * Change the log message warning for rate limited users from + debug to verbose. + * Fixed an issue where jobs requesting licenses were incorrectly + rejected. + * `smail` - Fix issues where emails at job completion were not + being sent. + * `scontrol/slurmctld` - fix comma parsing when updating a + reservation's nodes. + * `cgroup/v2` - Avoid capturing log output for ebpf when + constraining devices, as this can lead to inadvertent failure + if the log buffer is too small. + * Fix --gpu-bind=single binding tasks to wrong gpus, leading to + some gpus having more tasks than they should and other gpus being + unused. + * Fix main scheduler loop not starting after failover to backup + controller. + * Added error message when attempting to use sattach on batch or + extern steps. + * Fix regression in 23.02 that causes slurmstepd to crash when + `srun` requests more than `TreeWidth` nodes in a step and uses + the `pmi2` or `pmix` plugin. + * Reject job `ArrayTaskThrottle` update requests from unprivileged + users. + * `data_parser/v0.0.39` - populate description fields of property + objects in generated OpenAPI specifications where defined. + * `slurmstepd` - Avoid segfault caused by ContainerPath not being + terminated by '`/`' in `oci.conf`. 
+ * `data_parser/v0.0.39` - Change `v0.0.39_job_info` response to tag + `exit_code` field as being complex instead of only an unsigned + integer. + * `job_container/tmpfs` - Fix %h and %n substitution in `BasePath` + where `%h` was substituted as the `NodeName` instead of the + hostname, and `%n` was substituted as an empty string. + * Fix regression where --cpu-bind=verbose would override + `TaskPluginParam`. + * `scancel` - Fix `--clusters`/`-M` for federations. Only filtered + jobs (e.g. -A, -u, -p, etc.) from the specified clusters will be + canceled, rather than all jobs in the federation. + Specific jobids will still be routed to the origin cluster + for cancellation. + +------------------------------------------------------------------- +Mon Jan 29 13:47:55 UTC 2024 - Egbert Eich + +- Update to version 23.11.03 + * slurmrestd - Reject single http query with multiple path + requests. + * Fix launching Singularity v4.x containers with + `srun --container` by setting .process.terminal to true in + generated `config.json` when step has pseudoterminal (`--pty`) + requested. + * Fix loading in `dynamic/cloud` node jobs after `net_cred` + expired. + * Fix cgroup null path error on `slurmd/slurmstepd` tear down. + * `data_parser/v0.0.40` - Prevent failure if accounting is + disabled, instead issue a warning if needed data from the + database can not be retrieved. + * `openapi/slurmctld` - Prevent failure if accounting is disabled. + * Prevent `slurmscriptd` processing delays from blocking other + threads in `slurmctld` while trying to launch various scripts. + This is additional work for a fix in 23.02.6. + * Fix memory leak when receiving alias addrs from controller. + * `scontrol` - Accept `scontrol token lifespan=infinite` to + create tokens that effectively do not expire. + * Avoid errors when Slurmdb accounting disabled when `--json` or + `--yaml` is invoked with CLI commands and `slurmrestd`.
Add + warnings when query would have populated data from Slurmdb + instead of errors. + * Fix `slurmctld` memory leak when running job with + `--tres-per-task=gres:shard:#` + * Fix backfill trying to start jobs outside of backfill window. + * Fix oversubscription on partitions with `PreemptMode=OFF`. + * Preserve node reason on power up if the node is downed + or drained. + * `data_parser/v0.0.40` - Avoid aborting when invoking a not + implemented parser. + * `data_parser/v0.0.40` - Fix how nice values are parsed for job + submissions. + * `data_parser/v0.0.40` - Fix regression where parsing error did + not result in invalid request being rejected. + * Fix segfault in front-end node registration. + * Prevent jobs using none typed gpus from being killed by the + controller after a reconfig or restart. + * Fix deadlock situation in the dbd when adding associations. + * Update default values of text/blob columns when updating from + old mysql versions in more situations. This improves a + previous fix to handle an uncommon case when upgrading + mysql/mariadb. + * Fix rpmbuild in openSUSE/SLES due to incorrect mariadb + dependency. + * When upgrading the slurmdbd to 23.11, avoid generating a query + to update the association table that is larger than + `max_allowed_packet` which would result in an upgrade failure. + * Fix rare deadlock when a dynamic node registers at the same + time that a once per minute background task occurs. + * `data_parser/v0.0.40` - Fix enumerated strings in OpenAPI + specification not have type field specified. + * Improve `scontrol show job -d` information of used shared + gres (`shard/mps`) topology. + * accounting_storage/mysql - Fix usage query to use new lineage + column instead of lft/rgt. + * `slurmrestd` - Improve handling of missing parsers when + content plugins expect parsers not loaded. + * `slurmrestd` - Correct parsing of StepIds when querying jobs. + * `slurmrestd` - Improve error from parsing failures of lists. 
+ * `slurmrestd` - Improve parsing of singular values for lists. + * `accounting_storage/mysql` - Fix `PrivateData=User` when + listing associations. + * Disable sorting of dynamic nodes to avoid issues when + restarting with heterogeneous jobs that cause jobs to abort on + restart. + * Don't allow deletion of non-dynamic nodes. + * `accounting_storage/mysql` - Fix issue adding partition based + associations. + * Respect non-"slurm" settings for `I_MPI_HYDRA_BOOTSTRAP` and + `HYDRA_BOOTSTRAP` and avoid injecting the `--external-launcher` + option which will cause `mpirun/mpiexec` to fail with an + unexpected argument error. + * Fix bug where scontrol hold would change node count for jobs + with implicitly defined node counts. + * `data_parser/v0.0.40` - Fix regression of support for "hold" + in job description. + * Avoid sending KILL RPCs to unresolvable `POWERING_UP` and + `POWERED_DOWN` nodes. + * `data_parser/v0.0.38` - Fix several potential NULL + dereferences that could cause slurmrestd to crash. + * Add `--gres-flags=one-task-per-sharing`. Do not allow different + tasks to be allocated shared gres from the same sharing gres. + * Add `SelectTypeParameters=ENFORCE_BINDING_GRES` and + `ONE_TASK_PER_SHARING_GRES`. + This gives default behavior for a job's `--gres-flags`. + * Alter the networking code to try connecting to the backup + controllers if the DNS lookup for the primary `SlurmctldHost` + fails. + * Alter the name resolution to only log at `verbose()` in client + commands on failures. This allows for HA setups where the DNS + entries are withdrawn for some `SlurmctldHost` entries without + flooding the user with errors. + * Prevent `slurmscriptd` PID leaks when running `slurmctld` in + foreground mode. + * Open all `slurmctld` listening ports at startup, and persist + throughout. + This also changes the backup `slurmctld` process to open the + `SlurmctldPort` range, instead of only the first.
+ * Fix backup `slurmctld` shutting down instead of resuming + standby duty if it took control. + * Fix race condition that delayed the primary `slurmctld` + resuming when taking control from a backup controller. + * `srun` - Ensure processed messages are meant for this job in + case of a rapidly-reused TCP port. + * `srun` - Prevent step launch failure while waiting for step + allocation if a stray message is received. + * Fix backup `slurmctld` to be able to respond to configless + config file requests correctly. + * Fix `slurmctld` crashing when recovering from a failed + reconfigure. + * Fix `slurmscriptd` operation after recovering from a failed + reconfigure. +- Make sure `-std=gnu99` is added to CFLAGS on SLE-12. +- Use %%autopatch. + +------------------------------------------------------------------- +Fri Jan 12 11:08:01 UTC 2024 - Christian Goll + +- Update to 23.11.1 with following major improvements and fixing + CVE-2023-49933, CVE-2023-49934, CVE-2023-49935, CVE-2023-49936 + and CVE-2023-49937 + * Substantially overhauled the SlurmDBD association management + code. For clusters updated to 23.11, account and user + additions or removals are significantly faster than in prior + releases. + * Overhauled `scontrol reconfigure` to prevent configuration + mistakes from disabling slurmctld and slurmd. Instead, an + error will be returned, and the running configuration will + persist. This does require updates to the systemd service + files to use the `--systemd` option to `slurmctld` and `slurmd`. + * Added a new internal `auth/cred` plugin - `auth/slurm`. This + builds off the prior `auth/jwt` model, and permits operation + of the `slurmdbd` and `slurmctld` without access to full + directory information with a suitable configuration. 
+ * Added a new `--external-launcher` option to `srun`, which is + automatically set by common MPI launcher implementations and + ensures processes using those non-srun launchers have full + access to all resources allocated on each node. + * Reworked the dynamic/cloud modes of operation to allow for + "fanout" - where Slurm communication can be automatically + offloaded to compute nodes for increased cluster scalability. + * Overhauled and extended the Reservation subsystem to allow + for most of the same resource requirements as are placed on + the job. Notably, this permits reservations to now reserve + GRES directly. +- Details of changes: + * Fix `scontrol update job=... TimeLimit+=/-=` when used with a + raw JobId of job array element. + * Reject `TimeLimit` increment/decrement when called on job with + `TimeLimit=UNLIMITED`. + * Fix issue with requesting a job with `*licenses` as well as + `*tres-per-task=license`. + * `slurmctld` - Prevent segfault in `getopt_long()` with an + invalid long option. + * slurmrestd - Added `/meta/slurm/cluster` field to responses. + * Adjust systemd service files to start daemons after + `remote-fs.target`. + * Fix `task/cgroup` indexing tasks in cgroup plugins, which + caused `jobacct/gather` to match the gathered stats with the + wrong task id. + * `select/linear` - Fix regression in 23.11 in which jobs that + requested `*cpus-per-task` were rejected. + * `data_parser/v0.0.40` - Fix the parsing for + `/slurmdb/v0.0.40/jobs` exit_code query parameter. + * If a job requests more shards which would allocate more than + one sharing GRES (gpu) per node refuse it unless + `SelectTypeparameters` has `MULTIPLE_SHARING_GRES_PJ`. + * Trigger fatal exit when Slurm API function is called before + `slurm_init()` is called. + * `slurmd` - Fix issue with `scontrol reconfigure` when started + with `-c`. 
+ * `slurmrestd` - Job submissions that result in the following + error codes will be considered as successfully submitted (with + a warning), instead of returning an HTTP 500 error back: + `ESLURM_NODES_BUSY`, `ESLURM_RESERVATION_BUSY`, `ESLURM_JOB_HELD`, + `ESLURM_NODE_NOT_AVAIL`, `ESLURM_QOS_THRES`, + `ESLURM_ACCOUNTING_POLICY`, `ESLURM_RESERVATION_NOT_USABLE`, + `ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE`, + `ESLURM_BURST_BUFFER_WAIT`, `ESLURM_PARTITION_DOWN`, + `ESLURM_LICENSES_UNAVAILABLE`. + * Fix a `slurmctld` fatal error when upgrading to 23.11 and + changing from `select/cons_res` to `select/cons_tres` at the + same time. + * `slurmctld` - Reject arbitrary distribution jobs that have a + minimum node count that differs from the number of unique + nodes in the hostlist. + * Prevent `slurmdbd` errors when updating reservations with names + containing apostrophes. + * Prevent message extension attacks that could bypass the + message hash. CVE-2023-49933. + * Prevent SQL injection attacks in `slurmdbd`. CVE-2023-49934. + * Prevent message hash bypass in slurmd which can allow an + attacker to reuse root-level MUNGE tokens and escalate + permissions. CVE-2023-49935. + * Prevent NULL pointer dereference on size_valp overflow. + CVE-2023-49936. + * Prevent double-xfree() on error in `_unpack_node_reg_resp()`. + CVE-2023-49937. + * For jobs that request `*cpus-per-gpu`, ensure that the + `*cpus-per-gpu` request is honored on every node in the job and + not just for the job as a whole. + * Fix listing available `data_parser` plugins for json and yaml + when giving no commands to `scontrol` or `sacctmgr`. + * `slurmctld` - Rework `scontrol reconfigure` to avoid race + conditions that can result in stray jobs. + * `slurmctld` - Shave ~1 second off average reconfigure time by + terminating internal processing threads faster. + * Skip running `slurmdbd -R` if the connected cluster is 23.11 + or newer. This operation is no longer relevant for 23.11.
+ * Ensure `slurmscriptd` shuts down before `slurmctld` is stopped + or reconfigured. + * Improve error handling and error messages in `slurmctld` to + `slurmscriptd` communications. This includes avoiding + potential deadlock in `slurmctld` if slurmscript dies + unexpectedly. + * Do not hold batch jobs whose extra constraints cannot be + immediately satisfied, and set the state reason to + `Constraints` instead of `BadConstraints`. + * Fix verbose log message printing a hex number instead of a job + id. + * Upgrade rate limit parameters message from debug to info. + * For `SchedulerParameters=extra_constraints`, prevent `slurmctld` + segfault when starting a `slurmd` with `*extra` for a node + that did not previously set this. + This also ensures the extra constraints model works off the + current node state, not the prior state. + * Fix `*tres-per-task` assertion. + * Fix a few issues when creating reservations. + * Add `SchedulerParameters=time_min_as_soft_limit` option. + * Remove `SLURM_WORKING_CLUSTER` env from batch and srun + environments. + * `cli_filter/lua` - return nil for unset time options rather + than the string `2982616-04:14:00` (which is the internal + macro `NO_VAL` represented as time string). + * Remove 'none' plugins for all but auth and cred. scontrol show + config will report (null) now. + * Removed `select/cons_res`. Please update your configuration to + `select/cons_tres`. + * `mpi/pmix` - When aborted with status 0, avoid marking + job/step as failed. + * Fixed typo on `initialized` for the description of + `ESLURM_PLUGIN_NOT_LOADED`. + * `cgroup.conf` - Removed deprecated parameters `AllowedKmemSpace`, + `ConstrainKmemSpace`, `MaxKmemPercent`, and `MinKmemSpace`. + * `proctrack/cgroup` - Add `SignalChildrenProcesses=` + option to `cgroup.conf`. This allows signals for cancelling, + suspending, resuming, etc. to be sent to children processes in + a `step/job` rather than just the parent. 
+ * Add `PreemptParameters=suspend_grace_time` parameter to + control amount of time between `SIGTSTP` and `SIGSTOP` signals + when suspending jobs. + * `job_submit/throttle` - improve reset of submitted job counts + per user in order to better honor + `SchedulerParameters=jobs_per_user_per_hour=#`. + * Load the user environment into a private pid namespace to + avoid user scripts leaving background processes on a node. + * `scontrol` show `assoc_mgr` will display Lineage instead of Lft + for associations. + * Add `SlurmctldParameters=no_quick_restart` to avoid a new + `slurmctld` taking over the old `slurmctld` by accident. + * Fix `--cpus-per-gpu` for step allocations, which was + previously ignored for job steps. `*cpus-per-gpu` implies + `--exact`. + * Fix mutual exclusivity of `--cpus-per-gpu` and + `--cpus-per-task`: fatal if both options are requested in the + commandline or both are requested in the environment. If one + option is requested in the command line, it will override the + other option in the environment. + * `slurmrestd` - `openapi/dbv0.0.37` and `openapi/v0.0.37` + plugins have been removed. + * `slurmrestd` - `openapi/dbv0.0.38` and `openapi/v0.0.38` + plugins have been tagged as deprecated. + * `slurmrestd` - added auto population of `info/version` field. + * `sdiag` - add `--yaml` and `--json` arg support to specify + data_parser plugin. + * `sacct` - add `--yaml` and `--json` arg support to specify + `data_parser` plugin. + * `scontrol` - add `--yaml` and `--json` arg support to specify + `data_parser` plugin. + * `sinfo` - add `--yaml` and `--json` arg support to specify + `data_parser` plugin. + * `squeue` - add `--yaml` and `--json` arg support to specify + `data_parser` plugin. + * Changed the default `SelectType` to `select/cons_tres` (from + `select/linear`). + * Allow `SlurmUser`/`root` to use reservations without specific + permissions. + * Fix sending step signals to nodes not allocated by the step. 
+ * Remove `CgroupAutomount=` option from `cgroup.conf`. + * Add `TopologyRoute=RoutePart` to route communications based + on partition node lists. + * Added ability for configless to push Prolog and Epilog + scripts to `slurmd`s. + * Prolog and Epilog do not have to be fully qualified pathnames. + * Changed default value of `PriorityType` from `priority/basic` + to `priority/multifactor`. + * `torque/mpiexec` - Propagate exit code from `launched` process. + * `slurmrestd` - Add new rlimits fields for job submission. + * Define SPANK options environment variables when + `--export=[NIL|NONE]` is specified. + * `slurmrestd` - Numeric input fields provided with a null + formatted value will now convert to zero (0) where it can be + a valid value. This is expected to only be notable with job + submission against v0.0.38 versioned endpoints with job + requests with fields provided with null values. These fields + were already rejected by v0.0.39+ endpoints, unless `+complex` + parser value is provided to v0.0.40+ endpoints. + * `slurmrestd` - Improve parsing of integers and floating point + numbers when handling incoming user provided numeric fields. + Fields that would have not rejected a number for a numeric + field followed by other non-numeric characters will now get + rejected. This is expected to only be notable with job + submission against v0.0.38 versioned endpoints with malformed + job requests. + * Reject reservation update if it will result in previously + submitted jobs losing access to the reservation. + * `data_parser/v0.0.40` - output partition state when dumping + partitions. + * Allow for a shared suffix to be used with the hostlist format. + E.g., `node[0001-0010]-int`. + * Replace `SRUN_CPUS_PER_TASK` with `SLURM_CPUS_PER_TASK` and + get back the previous behavior before Slurm 22.05 since now we + have the new external launcher step.
+ * `job_container/tmpfs` - Add `BasePath=none` option to disable + plugin on node subsets when there is a global setting. + * Add QOS flag `Relative`. If set the QOS limits will be treated + as percentages of a cluster/partition instead of absolutes. + * Remove `FIRST_CORES` flag from reservations. + * Add cloud instance id and instance type to node records. + Can be viewed/updated with `scontrol`. + * `slurmd` - add `instance-id`, `instance-type`, and `extra` + options to allow them to be set on startup. + * Add cloud instance accounting to database that can be viewed + with `sacctmgr show instance`. + * `select/linear` - fix task launch failure that sometimes + occurred when requesting `*threads-per-core` or + `--hint=nomultithread`. This also fixes memory calculation + with one of these options and `*mem-per-cpu`: + Previously, memory = mem-per-cpu * all cpus including unusable + threads. + Now, memory = mem-per-cpu * only usable threads. This + behavior matches the documentation and select/cons_tres. + * `gpu/nvml` - Reduce chances of `NVML_ERROR_INSUFFICIENT_SIZE` + error when getting gpu memory information. + * `slurmrestd` - Convert to generating `OperationIDs` based on + path for all v0.0.40 tagged paths. + * `slurmrestd` - Reduce memory used while dumping a job's stdio + paths. + * `slurmrestd` - Jobs queried from `data_parser/v0.0.40` from + `slurmdb` will have `step/id` field given as a string to match + CLI formatting instead of an object. + * `sacct` - Output in JSON or YAML output will have the + `step/id` field given as a string instead of an object. + * `scontrol`/`squeue` - Step output in JSON or YAML output will + have the `id` field given as a string instead of an + object. + * `slurmrestd` - For `GET /slurmdb/v0.0.40/jobs` mimic default + behavior for handling of job start and end times as `sacct` + when one or both fields are not provided as a query parameter.
+ * `openapi/slurmctld` - Add `GET /slurm/v0.0.40/shares` endpoint + to dump same output as `sshare`. + * `sshare` - add JSON/YAML support. + * `data_parser/v0.0.40` - Remove `required/memory` output in + json. It is replaced by `required/memory_per_cpu` and + `required/memory_per_node`. + * `slurmrestd` - Add numeric id to all association identifiers + to allow unique identification where association has been + deleted but is still referenced by accounting record. + * `slurmrestd` - Add accounting, id, and comment fields to + association dumps. + * Use `memory.current` in cgroup/v2 instead of manually + calculating RSS. This makes accounting consistent with + OOM Killer. + * `sreport` - cluster Utilization `PlannedDown` field now + includes the time that all nodes were in the `POWERED_DOWN` + state instead of just cloud nodes. + * `scontrol` update partition now allows `Nodes+=` and + `Nodes-=` to add/delete nodes from the existing + partition node list. `Nodes=+host1,-host2` is also allowed. + * `sacctmgr` - add `--yaml` and `--json` arg support to specify + `data_parser` plugin. + * `sacctmgr` can now modify QOS's RawUsage to zero or a positive + value. + * `sdiag` - Added statistics on why the main and backfill + schedulers have stopped evaluation on each scheduling cycle. + the number of `RPC limit exceeded...` messages that are logged. + * Rename `sbcast --fanout` to `--treewidth`. + * Remove `SLURM_NODE_ALIASES` env variable. + * Enable fanout for dynamic and unaddressable cloud nodes. + * Fix how steps are deallocated in an allocation if the last step + of an srun never completes due to a node failure. + * Remove redundant database indexes. + * Add database index to suspend table to speed up archive/purges. + * When requesting `--tres-per-task` alter incorrect request for + TRES, it should be `TRESType/TRESName` not `TRESType:TRESName`. + * Make it so reservations can reserve GRES.
+ * `sbcast` - use the specified `--fanout` value on all hops in + message forwarding; previously the specified fanout was only + used on the first hop, and additional hops used `TreeWidth` in + `slurm.conf`. + * `slurmrestd`- remove logger prefix from `-s/-a list` options + outputs. + * `switch/hpe_slingshot` - Add support for collectives. + * Nodes with suspended jobs can now be displayed as `MIXED`. + * Fix inconsistent handling of using cli and/or environment + options for `tres_per_task=cpu:#` and `cpus_per_gpu`. + * Requesting `--cpus-per-task` will now set + `SLURM_TRES_PER_TASK=cpu:#` in the environment. + * For some tres related environment variables such as + `SLURM_TRES_PER_TASK`, when `srun` requests a different value + for that option, set these environment variables to the value + requested by `srun`. Previously these environment variables + were unchanged from the job allocation. This bug only affected + the output environment variables, not the actual step resource + allocation. + * `RoutePlugin=route/topology` has been replaced with + `TopologyParam=RouteTree`. + * If `ThreadsPerCore` in `slurm.conf` is configured with less + than the number of hardware threads, fix a bug where the task + plugins used fewer cores instead of using fewer threads per core. + * Fix arbitrary distribution allowing it to be used with `salloc` + and `sbatch` and fix how cpus are allocated to nodes. + * Allow nodes to reboot while node is drained or in a + maintenance state. + * Allow `scontrol` reboot to use nodesets to filter nodes to reboot. + * Fix how the topology of typed gres gets updated. + * Changes to the Type option in gres.conf now can be applied with + `scontrol` reconfig. + * Allow for jobs that request a newly configured gres type to be + queued even when the needed `slurmd`s have not yet registered. + * Kill recovered jobs that require unconfigured gres types. + * If keepalives are configured, enable them on all persistent + connections. 
+ * Configless - Also send Includes from configuration files not + parsed by the controller (i.e. from `plugstack.conf`). + * Add `gpu/nrt` plugin for nodes using Trainium/Inferentia + devices. + * `data_parser/v0.0.40` - Add `START_RECEIVED` to job flags in + dumped output. + * SPANK - Failures from most spank functions (not epilog or + exit) will now cause the step to be marked as failed and the + command (`srun`, `salloc`, `sbatch *wait`) to return 1. + +------------------------------------------------------------------- +Wed Jan 3 10:45:48 UTC 2024 - Egbert Eich + +- Update to 23.02.6 to fix (CVE-2023-49933 - bsc#1218046, CVE-2023-49935 - + bsc#1218049, CVE-2023-49936 - bsc#1218050, CVE-2023-49937 - bsc#1218051, + CVE-2023-49938 - bsc#1218053) + * Security Fixes: + + Add `JobAcctGatherParams=DisableGPUAcct` to disable gpu accounting. + + `acct_gather_energy/ipmi` - Improve logging of DCMI issues. + + `gpu/oneapi` - Add support for new env vars `ZE_FLAT_DEVICE_HIERARCHY` + and `ZE_ENABLE_PCI_ID_DEVICE_ORDER`. + + `data_parser/v0.0.39` - skip empty string when parsing QOS ids. + + Remove error message from `assoc_mgr_update_assocs` when purposefully + resetting the default QOS. + * Bug Fixes: + + `libslurm_nss` - Avoid causing glibc to assert due to an unexpected + return from slurm_nss due to an error during lookup. + + Fix job requests with `--tres-per-task` sometimes resulting in bad + allocations that cannot run subsequent job steps. + + Fix issue with `slurmd` where `srun` fails to be warned when a node + prolog script runs beyond `MsgTimeout` set in `slurm.conf`. + + `gres/shard` - Fix plugin functions to have matching parameter orders. + + `gpu/nvml` - Fix issue that resulted in the wrong MIG devices being + constrained to a job + + `gpu/nvml` - Fix linking issue with MIGs that prevented multiple MIGs + being used in a single job for certain MIG configurations + + Fix file descriptor leak in slurmd when using `acct_gather_energy/ipmi` + with DCMI devices. 
+ + `sview` - avoid crash when job has a node list string > 49 characters. + + Prevent `slurmctld` crash during reconfigure when packing job start + messages. + + Preserve reason uid on reconfig. + + Update node reason with updated `INVAL` state reason if different from + last registration. + + `conmgr` - Avoid NULL dereference when using `auth/none`. + + `data_parser/v0.0.39` - Fixed how deleted QOS and associations for jobs + are dumped. + + `burst_buffer/lua` - fix stage in counter not decrementing when a job is + cancelled during stage in. This counter is used to enforce the limit of + 128 scripts per stage. + + `data_parser/v0.0.39` - Fix how the `INVALID` nodes state is dumped. + + `data_parser/v0.0.39` - Fix parsing of flag arrays to allow muliple + flags to be set. + + Avoid leaking sockets when an x11 application is closed in an allocation. + + Fix missing mutex unlock in group cache code which could cause slurmctld + to freeze. + + Fix scrontab monthly jobs possibly skipping a month if added near the + end of the month. + + Fix loading of the gpu account gather energy plugin. + + Fix `slurmctld` segfault when reconfiguring after a job resize. + + Fix crash in slurmstepd that can occur when launching tasks via mpi using + the `pmi2` plugin and using the `route/topology` plugin. + + Fix `qos doesn't exist` error message in `assoc_mgr_update_assocs` + to print the attempted new default qos, rather than the current default + qos. + + `data_parser/v0.0.39` - Fix segfault when POSTing data with association + usage. + * Other Changes and Improvements: + + Prevent message extension attacks that could bypass the message hash. + CVE-2023-49933. + + Prevent message hash bypass in slurmd which can allow an attacker to + reuse root-level MUNGE tokens and escalate permissions. CVE-2023-49935. + + Prevent NULL pointer dereference on `size_valp` overflow. CVE-2023-49936. + + Prevent double-xfree() on error in `_unpack_node_reg_resp()`. + CVE-2023-49937. 
+ + Prevent modified `sbcast` RPCs from opening a file with the wrong group + permissions. CVE-2023-49938. +- Fix %do_obsoletes macro expansion to work with SLE-12. + +------------------------------------------------------------------- +Thu Nov 30 18:52:44 UTC 2023 - Egbert Eich + +- Add missing service file for slurmrestd (boo#1217711). + +------------------------------------------------------------------- +Tue Nov 28 14:14:28 UTC 2023 - Egbert Eich + +- Explicitly create an Obsoletes: entry for each package version + that is obsoleted by the present version. These are all published + versions of the last two major releases as well as all minor + versions of the present release lower than the current one + (bsc#1216869 2nd part). + This prevents the current version to upgrade a old Slurm version + for which no upgrade path exists. + +------------------------------------------------------------------- +Mon Nov 20 15:29:55 UTC 2023 - Egbert Eich + +- On SLE-12 exclude build for s390x. + +------------------------------------------------------------------- +Mon Nov 6 07:38:00 UTC 2023 - Egbert Eich + +- Add missing dependencies to slurm-config to plugins package. + These should help to tie down the slurm version and help to avoid + a package mix (bsc#1216869). + +------------------------------------------------------------------- +Thu Oct 12 08:23:20 UTC 2023 - Christian Goll + +- update to 23.02.6 to fix (CVE-2023-41914, bsc#1216207) + * Removed Fix-test-32.8.patch as fixed upstream + * Bug Fixes: + + Fix `CpusPerTres=` not upgreadable with scontrol update + + Fix unintentional gres removal when validating the gres job state. + + Fix `--without-hpe-slingshot` configure option. + + Fix cgroup v2 memory calculations when transparent huge pages are used. + + Fix parsing of `sgather --timeout` option. + + Fix regression from 22.05.0 that caused `srun --cpu-bind "=verbose"` + and `"=v"` options give different CPU bind masks. 
+ + Fix "_find_node_record: lookup failure for node" error message appearing + for all dynamic nodes during reconfigure. + + Avoid segfault if loading serializer plugin fails. + + `slurmrestd` - Correct OpenAPI format for `GET /slurm/v0.0.39/licenses`. + + `slurmrestd` - Correct OpenAPI format for + `GET /slurm/v0.0.39/job/{job_id}`. + + `slurmrestd` - Change format to multiple fields in + `GET /slurmdb/v0.0.39/assocations` and `GET /slurmdb/v0.0.39/qos` to + handle infinite and unset states. + + When a node fails in a job with `--no-kill`, preserve the extern step on the + remaining nodes to avoid breaking features that rely on the extern step + such as `pam_slurm_adopt`, `x11`, and `job_container/tmpfs`. + + `auth/jwt` - Ignore `x5c` field in JWKS files. + + `auth/jwt` - Treat 'alg' field as optional in JWKS files. + + Allow job_desc.selinux_context to be read from the job_submit.lua script. + + Skip check in slurmstepd that causes a large number of errors in the + munge log: "Unauthorized credential for client UID=0 GID=0". + This error will still appear on `slurmd`/`slurmctld`/`slurmdbd` start up + and is not a cause for concern. + + `slurmctld` - Allow startup with zero partitions. + + Fix some mig profile names in slurm not matching nvidia mig profiles. + + Prevent `slurmscriptd` processing delays from blocking other threads in + `slurmctld` while trying to launch `{Prolog|Epilog}Slurmctld`. + + Fix sacct printing ReqMem field when memory doesn't exist in requested + TRES. + + Fix how heterogenous steps in an allocation with `CR_PACK_NODE` or + `-mpack` are created. + + Fix `slurmctld` crash from race condition within `job_submit_throttle` + plugin. + + Fix `--with-systemdsystemunitdir` when requesting a default location. + + Fix not being able to cancel an array task by the jobid (i.e. not + `_`) through scancel, job launch failure or prolog + failure. 
+ + Fix cancelling the whole array job when the array task is the meta job + and it fails job or prolog launch and is not requeable. Cancel only the + specific task instead. + + Fix regression in 21.08.2 where MailProg did not run for `mail-type=end` + for jobs with non+zero exit codes. + + Fix incorrect setting of memory.swap.max in cgroup/v2. + + Fix `jobacctgather/cgroup` collection of disk/io, gpumem, gpuutil TRES + values. + + Fix -d singleton for heterogeneous jobs. + + Downgrade info logs about a job meeting a "maximum node limit" in the + select plugin to `DebugFlags=SelectType`. These info logs could spam the + slurmctld log file under certain circumstances. + + `prep/script` - Fix `[Srun|Task]` missing + `SLURM_JOB_NODELIST`. + + gres - Rebuild GRES core bitmap for nodes at startup. This fixes error: + "Core bitmaps size mismatch on node [HOSTNAME]", which causes jobs to + enter state "Requested node configuration is not available". + + `slurmctd` - Allow startup with zero nodes. + + Fix filesystem handling race conditions that could lead to an attacker + taking control of an arbitrary file, or removing entire directories' + contents. CVE-2023-41914. + +------------------------------------------------------------------- +Mon Sep 18 05:23:19 UTC 2023 - Egbert Eich + +- Updated to version 23.02.5 with the following changes: + * Bug Fixes: + + Revert a change in 23.02 where `SLURM_NTASKS` was no longer set in the + job's environment when `--ntasks-per-node` was requested. + The method that is is being set, however, is different and should be more + accurate in more situations. + + Change pmi2 plugin to honor the `SrunPortRange` option. This matches the + new behavior of the pmix plugin in 23.02.0. Note that neither of these + plugins makes use of the `MpiParams=ports=` option, and previously + were only limited by the systems ephemeral port range. 
+ + Fix regression in 23.02.2 that caused slurmctld -R to crash on startup if + a node features plugin is configured. + + Fix and prevent reoccurring reservations from overlapping. + + `job_container/tmpfs` - Avoid attempts to share BasePath between nodes. + + With `CR_Cpu_Memory`, fix node selection for jobs that request gres and + `--mem-per-cpu`. + + Fix a regression from 22.05.7 in which some jobs were allocated too few + nodes, thus overcommitting cpus to some tasks. + + Fix a job being stuck in the completing state if the job ends while the + primary controller is down or unresponsive and the backup controller has + not yet taken over. + + Fix `slurmctld` segfault when a node registers with a configured + `CpuSpecList` while `slurmctld` configuration has the node without + `CpuSpecList`. + + Fix cloud nodes getting stuck in `POWERED_DOWN+NO_RESPOND` state after + not registering by `ResumeTimeout`. + + `slurmstepd` - Avoid cleanup of `config.json-less` containers spooldir + getting skipped. + + Fix scontrol segfault when 'completing' command requested repeatedly in + interactive mode. + + Properly handle a race condition between `bind()` and `listen()` calls + in the network stack when running with SrunPortRange set. + + Federation - Fix revoked jobs being returned regardless of the + `-a`/`--all` option for privileged users. + + Federation - Fix canceling pending federated jobs from non-origin + clusters which could leave federated jobs orphaned from the origin + cluster. + + Fix sinfo segfault when printing multiple clusters with `--noheader` + option. + + Federation - fix clusters not syncing if clusters are added to a + federation before they have registered with the dbd. + + `node_features/helpers` - Fix node selection for jobs requesting + changeable. + features with the `|` operator, which could prevent jobs from + running on some valid nodes. 
+ + `node_features/helpers` - Fix inconsistent handling of `&` and `|`, + where an AND'd feature was sometimes AND'd to all sets of features + instead of just the current set. E.g. `foo|bar&baz` was interpreted + as `{foo,baz}` or `{bar,baz}` instead of how it is documented: + `{foo} or {bar,baz}`. + + Fix job accounting so that when a job is requeued its allocated node + count is cleared. After the requeue, sacct will correctly show that + the job has 0 `AllocNodes` while it is pending or if it is canceled + before restarting. + + `sacct` - `AllocCPUS` now correctly shows 0 if a job has not yet + received an allocation or if the job was canceled before getting one. + + Fix intel OneAPI autodetect: detect the `/dev/dri/renderD[0-9]+` GPUs, + and do not detect `/dev/dri/card[0-9]+`. + + Fix node selection for jobs that request `--gpus` and a number of + tasks fewer than GPUs, which resulted in incorrectly rejecting these + jobs. + + Remove `MYSQL_OPT_RECONNECT` completely. + + Fix cloud nodes in `POWERING_UP` state disappearing (getting set + to `FUTURE`) + when an `scontrol reconfigure` happens. + + `openapi/dbv0.0.39` - Avoid assert / segfault on missing coordinators + list. + + `slurmrestd` - Correct memory leak while parsing OpenAPI specification + templates with server overrides. + + Fix overwriting user node reason with system message. + + Prevent deadlock when `rpc_queue` is enabled. + + `slurmrestd` - Correct OpenAPI specification generation bug where + fields with overlapping parent paths would not get generated. + + Fix memory leak as a result of a partition info query. + + Fix memory leak as a result of a job info query. + + For step allocations, fix `--gres=none` sometimes not ignoring gres + from the job. + + Fix `--exclusive` jobs incorrectly gang-scheduling where they shouldn't. + + Fix allocations with `CR_SOCKET`, gres not assigned to a specific + socket, and block core distribion potentially allocating more sockets + than required. 
+ + Revert a change in 23.02.3 where Slurm would kill a script's process + group as soon as the script ended instead of waiting as long as any + process in that process group held the stdout/stderr file descriptors + open. That change broke some scripts that relied on the previous + behavior. Setting time limits for scripts (such as + `PrologEpilogTimeout`) is strongly encouraged to avoid Slurm waiting + indefinitely for scripts to finish. + + Fix `slurmdbd -R` not returning an error under certain conditions. + + `slurmdbd` - Avoid potential NULL pointer dereference in the mysql + plugin. + + Fix regression in 23.02.3 which broken X11 forwarding for hosts when + MUNGE sends a localhost address in the encode host field. This is caused + when the node hostname is mapped to 127.0.0.1 (or similar) in + `/etc/hosts`. + + `openapi/[db]v0.0.39` - fix memory leak on parsing error. + + `data_parser/v0.0.39` - fix updating qos for associations. + + `openapi/dbv0.0.39` - fix updating values for associations with null + users. + + Fix minor memory leak with `--tres-per-task` and licenses. + + Fix cyclic socket cpu distribution for tasks in a step where + `--cpus-per-task` < usable threads per core. + + `slurmrestd` - For `GET /slurm/v0.0.39/node[s]`, change format of + node's energy field `current_watts` to a dictionary to account for + unset value instead of dumping 4294967294. + + `slurmrestd` - For `GET /slurm/v0.0.39/qos`, change format of QOS's + field "priority" to a dictionary to account for unset value instead of + dumping 4294967294. + + slurmrestd - For `GET /slurm/v0.0.39/job[s]`, the 'return code' + code field in `v0.0.39_job_exit`_code will be set to -127 instead of + being left unset where job does not have a relevant return code. + * Other Changes: + + Remove --uid / --gid options from salloc and srun commands. These options + did not work correctly since the CVE-2022-29500 fix in combination with + some changes made in 23.02.0. 
+ + Add the `JobId` to `debug()` messages indicating when + `cpus_per_task/mem_per_cpu` or `pn_min_cpus` are being automatically + adjusted. + + Change the log message warning for rate limited users from verbose to + info. + + `slurmstepd` - Cleanup per task generated environment for containers in + spooldir. + + Format batch, extern, interactive, and pending step ids into strings that + are human readable. + + `slurmrestd` - Reduce memory usage when printing out job CPU frequency. + + `data_parser/v0.0.39` - Add `required/memory_per_cpu` and + `required/memory_per_node` to `sacct --json` and `sacct --yaml` and + `GET /slurmdb/v0.0.39/jobs` from slurmrestd. + + `gpu/oneapi` - Store cores correctly so CPU affinity is tracked. + + Allow `slurmdbd -R` to work if the root assoc id is not 1. + + Limit periodic node registrations to 50 instead of the full `TreeWidth`. + Since unresolvable `cloud/dynamic` nodes must disable fanout by setting + `TreeWidth` to a large number, this would cause all nodes to register at + once. + +------------------------------------------------------------------- +Mon Aug 21 09:43:08 UTC 2023 - Christian Goll + +- Updated to 23.02.4 with the following changes: + * Bug Fixes: + + Fix main scheduler loop not starting after a failover to backup + controller. Avoid slurmctld segfault when specifying + `AccountingStorageExternalHost` (bsc#1214983). + + Fix sbatch return code when `--wait` is requested on a job array. + + Fix collected `GPUUtilization` values for `acct_gather_profile` plugins. + + Fix `slurmrestd` handling of job hold/release operations. + + Fix step running indefinitely when slurmctld takes more than + `MessageTimeout` to respond. Now, `slurmctld` will cancel the step when + detected, preventing following steps from getting stuck waiting for + resources to be released. + + Fix regression to make `job_desc.min_cpus` accurate again in `job_submit` + when requesting a job with `--ntasks-per-node`. 
+ + Fix handling of `ArrayTaskThrottle` in backfill. + + Fix regression in 23.02.2 when checking gres state on `slurmctld` + startup or reconfigure. Gres changes in the configuration were not + updated on slurmctld startup. On startup or reconfigure, these messages + were present in the log: `error: Attempt to change gres/gpu Count`. + + Fix potential double count of gres when dealing with limits. + + Fix `slurmstepd` segfault when `ContainerPath` is not set in `oci.conf` + + Fixed an issue where jobs requesting licenses were incorrectly rejected. + + `scrontab` - Fix cutting off the final character of quoted variables. + + `smail` - Fix issues where e-mails at job completion were not being sent. + + `scontrol/slurmctld` - fix comma parsing when updating a reservation's + nodes. + + Fix `--gpu-bind=single binding` tasks to wrong gpus, leading to some gpus + having more tasks than they should and other gpus being unused. + + Fix regression in 23.02 that causes slurmstepd to crash when `srun` + requests more than `TreeWidth` nodes in a step and uses the pmi2 or + pmix plugin. + + `job_container/tmpfs` - Fix `%h` and `%n` substitution in `BasePath` + where `%h` was substituted as the NodeName instead of the hostname, + and %n was substituted as an empty string. + + Fix regression where `--cpu-bind=verbose` would override + `TaskPluginParam`. + + `scancel` - Fix `--clusters/-M` for federations. Only filtered jobs + (e.g. `-A`, `-u`, `-p`, etc.) from the specified clusters will be + canceled, rather than all jobs in the federation. Specific jobids + will still be routed to the origin cluster for cancellation. + * Other changes: + + Make spank `S_JOB_ARGV` item value hold the requested command `argv` + instead of the `srun --bcast` value when `--bcast` requested (only in + local context). + + `scontrol` - Permit changes to StdErr and StdIn for pending jobs. + + `scontrol` - Reset `std`{`err`,`in`,`out`} when set to empty string. 
+ + `slurmrestd` - mark environment as a required field for job submission + descriptions. + + `slurmrestd` - avoid dumping null in OpenAPI schema required fields. + + `data_parser/v0.0.39` - avoid rejecting valid `memory_per_node` formatted + as dictionary provided with a job description. + + `data_parser/v0.0.39` - avoid rejecting valid `memory_per_cpu` formatted + as dictionary provided with a job description. + + `slurmrestd` - Return HTTP error code 404 when job query fails. + + `slurmrestd` - Add return schema to error response to job and license + query. + + Change the log message warning for rate limited users from debug to + verbose. + + `cgroup/v2` - Avoid capturing log output for ebpf when constraining + devices, + as this can lead to inadvertent failure if the log buffer is too small. + + Added error message when attempting to use sattach on batch or extern + steps. + + Reject job `ArrayTaskThrottle` update requests from unprivileged users. + + `data_parser/v0.0.39` - populate description fields of property objects + in generated OpenAPI specifications where defined. + + `slurmstepd` - Avoid segfault caused by `ContainerPath` not being + terminated by `/` in `oci.conf`. + + `data_parser/v0.0.39` - Change `v0.0.39_job_info` response to tag + `exit_code` field as being complex instead of only an unsigned integer. +- Updated to 23.02.3 with the following changes: + * Bug Fixes: + + `slurmctld` - Fix backup slurmctld crash when it takes control + multiple times. + + Fix regression in 23.02.2 that ignored the partition `DefCpuPerGPU` + setting on the first pass of scheduling a job requesting + `--gpus --ntasks`. + + `srun` - fix issue creating regular and interactive steps because + environment variables were incorrectly set on non-HetSteps. + + Fix dynamic nodes getting stuck in allocated states when reconfiguring. 
+ + Fix regression in 23.02.2 that set the `SLURM_NTASKS` environment + variable in sbatch jobs from `--ntasks-per-node` when `--ntasks` was not + requested. + + Fix regression in 23.02 that caused sbatch jobs to set the wrong number + of tasks when requesting `--ntasks-per-node` without `--ntasks`, and also + requesting one of the following options: `--sockets-per-node`, + `--cores-per-socket`, `--threads-per-core` (or `--hint=nomultithread`), + or `-B,--extra-node-info`. + + Fix double counting suspended job counts on nodes when reconfiguring, + which prevented nodes with suspended jobs from being powered down or + rebooted once the jobs completed. + + Fix backfill not scheduling jobs submitted with `--prefer` and + `--constraint` properly. + + mpi/pmix - fix regression introduced in 23.02.2 which caused PMIx shmem + backed files permissions to be incorrect. + + api/submit - fix memory leaks when submission of batch regular jobs + or batch HetJobs fails (response data is a return code). + + Fix regression in 23.02 leading to error() messages being sent at `INFO` + instead of `ERR` in syslog. + + Fix `TresUsageIn[Tot|Ave]` calculation for `gres/gpumem` and + `gres/gpuutil`. + + Fix issue in the gpu plugins where gpu frequencies would only be set if + both gpu memory and gpu frequencies were set, while one or the other + suffices. + + Fix reservations group ACL's not working with the root group. + + Fix updating a job with a ReqNodeList greater than the job's node count. + + Fix inadvertent permission denied error for `--task-prolog` and + `--task-epilog` with filesystems mounted with `root_squash`. + + Fix missing detailed cpu and gres information in json/yaml output from + `scontrol`, `squeue` and `sinfo`. + + Fix regression in 23.02 that causes a failure to allocate job steps that + request `--cpus-per-gpu` and gpus with types. 
+ + Fix potentially waiting indefinitely for a defunct process to finish, + which affects various scripts including `Prolog` and `Epilog`. This could + have various symptoms, such as jobs getting stuck in a completing state. + + Fix losing list of reservations on job when updating job with list of + reservations and restarting the controller. + + Fix nodes resuming after down and drain state update requests from + clients older than 23.02. + + Fix advanced reservation creation/update when an association that should + have access to it is composed with partition(s). + + Fix job layout calculations with `--ntasks-per-gpu`, especially when + `--nodes` has not been explicitly provided. + + Fix X11 forwarding for jobs submitted from the slurmctld host. + + When a job requests `--no-kill` and one or more nodes fail during the + job, fix subsequent job steps unable to use some of the remaining + resources allocated to the job. + + Fix shared gres allocation when using `--tres-per-task` with tasks that + span multiple sockets. + + `auth/jwt` - Fix memory leak. + * Other changes: + + `openapi/dbv0.0.39/users` - If a default account update failed, resulting + in a no-op, the query returned success without any warning. Now a warning + is sent back to the client that the default account wasn't modified. + + Avoid job write lock when nodes are dynamically added/removed. + + `burst_buffer/lua` - allow jobs to get scheduled sooner after + `slurm_bb_data_in` completes. + + `openapi/v0.0.39` - fix memory leak in `_job_post_het_submit()`. + + Avoid possible `slurmctld` segfault caused by race condition with already + completed `slurmdbd_conn` connections. + + `Slurmdbd.conf` checks included conf files for 0600 permissions + + `slurmrestd` - fix regression "oversubscribe" fields were removed from + job descriptions and submissions from v0.0.39 end points. + + `accounting_storage/mysql` - Query for indiviual QOS correctly when you + have more than 10. 
Add warning message about ignoring `--tres-per-task=license` when used + on a step.
+ +------------------------------------------------------------------- +Mon Apr 17 19:16:40 UTC 2023 - Egbert Eich + +- Web-configurator: changed presets to SUSE defaults. +- If %_restart_on_update is no longer defined replace by own + macro. +- Marked slurm-openlava, slurm-seff and slurm-sjstat noarch. +- rpmlint: + * dropped some rpmlint filters which are no longer relevant. + * added/refreshed filters. For Details, see rpmlintrc. +- Remove workaround to fix the restart issue in an Slurm package + described in bsc#1088693. + The Slurm version in this package as 16.05. Any attempt to + directly migrate to the current version is bound to fail + anyway. +- Now require slurm-munge if munge authentication is installed. + +------------------------------------------------------------------- +Fri Mar 31 07:37:01 UTC 2023 - Christian Goll + +- updated to 23.02.1 with the following changes: + * job_container/tmpfs - cleanup job container even if namespace mount is + already unmounted. + * openapi/dbv0.0.38 - Fix not displaying an error when updating QOS or + associations fails. + * Fix nodes remaining as PLANNED after slurmctld save state recovery. + * Add cgroup.conf EnableControllers option for cgroup/v2. + * Get correct cgroup root to allow slurmd to run in containers like Docker. + * slurmctld - add missing PrivateData=jobs check to step ContainerID lookup + requests originated from 'scontrol show step container-id=' or certain + scrun operations when container state can't be directly queried. + * Fix nodes un-draining after being drained due to unkillable step. + * Fix remote licenses allowed percentages reset to 0 during upgrade. + * sacct - Avoid truncating time strings when using SLURM_TIME_FORMAT with + the --parsable option. + * Fix regression in 22.05.0rc1 that broke Nodes=ALL in a NodeSet. + * openapi/v0.0.39 - fix jobs submitted via slurmrestd being allocated fewer + CPUs than tasks when requesting multiple tasks. 
+ * Fix job not being scheduled on valid nodes and potentially being rejected + when using parentheses at the beginning of square brackets in a feature + request, for example: "feat1&[(feat2|feat3)]". + * Fix regression in 23.02.0rc1 which made --gres-flags=enforce-binding no + longer enforce optimal core-gpu job placement. + * mpi/pmix - Fix v5 to load correctly when libpmix.so isn't in the normal + lib path. + * data_parser/v0.0.39 - fix regression where "memory_per_node" would be + rejected for job submission. + * data_parser/v0.0.39 - fix regression where "memory_per_cpu" would be + rejected for job submission. + * slurmctld - add an assert to check for magic number presence before deleting + a partition record and clear the magic afterwards to better diagnose + potential memory problems. + * Clean up OCI containers task directories correctly. + * scrun - Run under existing job when SLURM_JOB_ID is present. + * Prevent a slurmstepd crash when the I/O subsystem has hung. + * common/conmgr - fix memory leak of complete connection list. + * job_container/tmpfs - avoid printing extraneous error messages when running + a spank plugin that implements slurm_spank_job_prolog() or + slurm_spank_job_epilog(). + * Fix srun < 23.02 always getting an "exact" core allocation. + * Prevent scontrol < 23.02 from setting MaxCPUsPerSocket to 0. + * Add ScronParameters=explicit_scancel and corresponding scancel --cron + option. 
+- removed right-pmix-path.patch as fixed upstream + +------------------------------------------------------------------- +Thu Mar 16 15:48:15 UTC 2023 - Christian Goll + +- use libpmix.so.2 instead of libpmix.so to fix (bsc#1209260) + this removes the need of pmix-pluginlib + added: right-pmix-path.patch + +------------------------------------------------------------------- +Wed Mar 15 10:19:12 UTC 2023 - Christian Goll + +- slurm-plugins need to require pmix-pluginlib (bsc#1209260) + +------------------------------------------------------------------- +Tue Mar 7 15:18:05 UTC 2023 - Egbert Eich + +- Stop pulling firewall rules from github. There is no benefit to + host these separately. +- Remove pre-sle12 pieces. + +------------------------------------------------------------------- +Wed Mar 1 17:18:41 UTC 2023 - Egbert Eich + +- Add missing Provides:, Conflicts: and Obsoletes: to slurm-cray, + slurm-hdf5 and slurm-testsuite to avoid package conflicts. +- Unify Obsoletes:. +- Consolidate spec files between different Slurm releases in + Leap/SLE maintenance. +- Add dependency for the general plugin package to the + AcctGatherProfile HDF5 plugin. +- Adjust node RealMemory in slurm.conf of test suite for 8G test + nodes. + +------------------------------------------------------------------- +Mon Feb 20 20:45:59 UTC 2023 - Egbert Eich + +- updated to 23.02.0 + * Highlights + + slurmctld - Add new RPC rate limiting feature. This is enabled through + SlurmctldParameters=rl_enable, otherwise disabled by default. + + Make scontrol reconfigure and sending a SIGHUP to the slurmctld behave + the same. If you were using SIGHUP as a 'lighter' scontrol reconfigure + to rotate logs please update your scripts to use SIGUSR2 instead. + + Change cloud nodes to show by default. PrivateData=cloud is no longer + needed. + + sreport - Count planned (FKA reserved) time for jobs running in + IGNORE_JOBS reservations. Previously was lumped into IDLE time. 
+ + job_container/tmpfs - Support running with an arbitrary list of private + mount points (/tmp and /dev/shm are the default, but not required). + + job_container/tmpfs - Set more environment variables in InitScript. + + Make all cgroup directories created by Slurm owned by root. This was the + behavior in cgroup/v2 but not in cgroup/v1 where by default the step + directories ownership were set to the user and group of the job. + + accounting_storage/mysql - change purge/archive to calculate record ages + based on end time, rather than start or submission times. + + job_submit/lua - add support for log_user() from slurm_job_modify(). + + Run the following scripts in slurmscriptd instead of slurmctld: + ResumeProgram, ResumeFailProgram, SuspendProgram, ResvProlog, ResvEpilog, + and RebootProgram (only with SlurmctldParameters=reboot_from_controller). + + Only permit changing log levels with 'srun --slurmd-debug' by root + or SlurmUser. + + slurmctld will fatal() when reconfiguring the job_submit plugin fails. + + Add PowerDownOnIdle partition option to power down nodes after nodes + become idle. + + Add "[jobid.stepid]" prefix from slurmstepd and "slurmscriptd" prefix + from slurmscriptd to Syslog logging. Previously was only happening when + logging to a file. + + Add purge and archive functionality for job environment and job batch + script records. + + Extend support for Include files to all "configless" client commands. + + Make node weight usable for powered down and rebooting nodes. + + Removed 'launch' plugin. + + Add "Extra" field to job to store extra information other than a comment. + + Add usage gathering for AMD (requires ROCM 5.5+) and NVIDIA gpus. + + Add job's allocated nodes, features, oversubscribe, partition, and + reservation to SLURM_RESUME_FILE output for power saving. + + Automatically create directories for stdout/stderr output files. Paths + may use %j and related substitution characters as well.
+ + Add --tres-per-task to salloc/sbatch/srun. + + Allow nodefeatures plugin features to work with cloud nodes. + e.g. - Powered down nodes have no active changeable features. + - Nodes can't be changed to other active features until powered down. + - Active changeable features are reset/cleared on power down. + + Make slurmstepd cgroups constrained by total configured memory from + slurm.conf (NodeName=<> RealMemory=#) instead of total physical memory. + + node_features/helpers - add support for the OR and parentheses operators + in a --constraint expression. + + slurmctld will fatal() when [Prolog|Epilog]Slurmctld are defined but + are not executable. + + Validate node registered active features are a super set of node's + currently active changeable features. + + On clusters without any PrologFlags options, batch jobs with failed + prologs no longer generate an output file. + + Add SLURM_JOB_START_TIME and SLURM_JOB_END_TIME environment variables. + + Add SuspendExcStates option to slurm.conf to avoid suspending/powering + down specific node states. + + Add support for DCMI power readings in IPMI plugin. + + slurmrestd served /slurm/v0.0.39 and /slurmdb/v0.0.39 endpoints had major + changes from prior versions. Almost all schemas have been renamed and + modified. Sites using OpenAPI Generator clients are highly suggested to + upgrade to using at least version 6.x due to limitations with prior + versions. + + Allow for --nodelist to contain more nodes than required by --nodes. + + Rename "nodes" to "nodes_resume" in SLURM_RESUME_FILE job output. + + Rename "all_nodes" to "all_nodes_resume" in SLURM_RESUME_FILE output. + + Add jobcomp/kafka plugin. + + Add new PreemptParameters=reclaim_licenses option which will allow higher + priority jobs to preempt jobs to free up used licenses. (This is only + enabled for PreemptModes of CANCEL and REQUEUE, as Slurm cannot + guarantee suspended jobs will release licenses correctly.)
+ + hpe/slingshot - add support for the instant-on feature. + + Add ability to update SuspendExc* parameters with scontrol. + + Add ability to restore SuspendExc* parameters on restart with slurmctld + -R option. + + Add ability to clear a GRES specification by setting it to "0" via + 'scontrol update job'. + + Add SLURM_JOB_OVERSUBSCRIBE environment variable for Epilog, Prolog, + EpilogSlurmctld, PrologSlurmctld, and mail output. + + System node down reasons are appended to existing reasons, separated + by ':'. + + New command scrun has been added. scrun acts as an Open Container + Initiative (OCI) runtime proxy to run containers seamlessly via Slurm. + + Fixed GpuFreqDef option. When set in slurm.conf, it will be used if + --gpu-freq was not explicitly set by the job step. + * Configuration Changes + + job_container.conf - Added "Dirs" option to list desired private mount + points. + + node_features plugins - invalid users specified for AllowUserBoot will + now result in fatal() rather than just an error. + + Deprecate AllowedKmemSpace, ConstrainKmemSpace, MaxKmemPercent, and + MinKmemSpace. + + Allow jobs to queue even if the user is not in AllowGroups when + EnforcePartLimits=no is set. This ensures consistency for all the + Partition access controls, and matches the documented behavior for + EnforcePartLimits. + + Add InfluxDBTimeout parameter to acct_gather.conf. + + job_container/tmpfs - add support for expanding %h and %n in BasePath. + + slurm.conf - Removed SlurmctldPlugstack option. + + Add new SlurmctldParameters=validate_nodeaddr_threads= option to + allow concurrent hostname resolution at slurmctld startup. + + Add new AccountingStoreFlags=job_extra option to store a job's extra field + in the database. + + Add new "defer_batch" option to SchedulerParameters to only defer + scheduling for batch jobs. + + Add new DebugFlags option 'JobComp' to replace 'Elasticsearch'.
+ + Add configurable job requeue limit parameter - MaxBatchRequeue - in + slurm.conf to permit changes from the old hard-coded value of 5. + + helpers.conf - Allow specification of node specific features. + + helpers.conf - Allow many features to one helper script. + + job_container/tmpfs - Add "Shared" option to support shared namespaces. + This allows autofs to work with the job_container/tmpfs plugin when + enabled. + + acct_gather.conf - Added EnergyIPMIPowerSensors=Node=DCMI and + Node=DCMI_ENHANCED. + + Add new "getnameinfo_cache_timeout=" option to + CommunicationParameters to adjust or disable caching the results of + getnameinfo(). + + Add new PrologFlags=ForceRequeueOnFail option to automatically requeue + batch jobs on Prolog failures regardless of the job --requeue setting. + + Add HealthCheckNodeState=NONDRAINED_IDLE option. + + Add 'explicit' to Flags in gres.conf. This makes it so the gres is not + automatically added to a job's allocation when --exclusive is used. Note + that this is a per-node flag. + + Moved the "preempt_" options from SchedulerParameters to + PreemptParameters, and dropped the prefix from the option names. + (The old options will still be parsed for backwards compatibility, + but are now undocumented.) + + Add LaunchParameters=ulimit_pam_adopt, which enables setting RLIMIT_RSS + in adopted processes. + + Update SwitchParameters=job_vni to enable/disable creating job VNIs + for all jobs, or when a user requests them. + + Update SwitchParameters=single_node_vni to enable/disable creating + single node vnis for all jobs, or when a user requests them. + + Add ability to preserve SuspendExc* parameters on reconfig with + ReconfigFlags=KeepPowerSaveSettings. + + slurmdbd.conf - Add new AllResourcesAbsolute to force all new resources + to be created with the Absolute flag. + + topology/tree - Add new TopologyParam=SwitchAsNodeRank option to reorder + nodes based on switch layout. 
This can be useful if the naming convention + for the nodes does not naturally map to the network topology. + + Removed the default setting for GpuFreqDef. If unset, no attempt to change + the GPU frequency will be made if --gpu-freq is not set for the step. + * Command Changes + + sacctmgr - no longer force updates to the AdminComment, Comment, or + SystemComment to lower-case. + + sinfo - Add -F/--future option to sinfo to display future nodes. + + sacct - Rename 'Reserved' field to 'Planned' to match sreport and the + nomenclature of the 'Planned' node. + + scontrol - advanced reservation flag MAINT will no longer replace nodes, + similar to STATIC_ALLOC + + sbatch - add parsing for #PBS -d and #PBS -w. + + scontrol show assoc_mgr will show username(uid) instead of uid in + QoS section. + + Add strigger --draining and -R/--resume options. + + Change --oversubscribe and --exclusive to be mutually exclusive for + job submission. Job submission commands will now fatal if both are set. + Previously, these options would override each other, with the last one + in the job submission command taking effect. + + scontrol - Requested TRES and allocated TRES will now always be printed + when showing jobs, instead of one TRES output that was either the + requested or allocated. + + srun --ntasks-per-core now applies to job and step allocations. Now, + use of --ntasks-per-core=1 implies --cpu-bind=cores and + --ntasks-per-core>1 implies --cpu-bind=threads. + + salloc/sbatch/srun - Check and abort if ntasks-per-core > + threads-per-core. + + scontrol - Add ResumeAfter= option to "scontrol update nodename=". + + Add a new "nodes=" argument to scontrol setdebug to allow the debug + level on the slurmd processes to be temporarily altered. + + Add a new "nodes=" argument to "scontrol setdebugflags" as well. + + Make it so scrontab prints client-side the job_submit() err_msg (which + can be set i.e. by using the log_user() function for the lua plugin).
+ + scontrol - Reservations will not be allowed to have STATIC_ALLOC or + MAINT flags and REPLACE[_DOWN] flags simultaneously. + + scontrol - Reservations will only accept one reoccurring flag when + being created or updated. + + scontrol - A reservation cannot be updated to be reoccurring if it is + already a floating reservation. + + squeue - removed unused '%s' and 'SelectJobInfo' formats. + + squeue - align print format for exit and derived codes with that of + other components (:). + + sacct - Add --array option to expand job arrays and display array + tasks on separate lines. + + Partial support for '--json' and '--yaml' formatted outputs have been + implemented for sacctmgr, sdiag, sinfo, squeue, and scontrol. The + resultant data output will be filtered by normal command arguments. + Formatting arguments will continue to be ignored. + + salloc/sbatch/srun - extended the --nodes syntax to allow for a list + of valid node counts to be allocated to the job. This also supports + a "step count" value (e.g., --nodes=20-100:20 is equivalent to + --nodes=20,40,60,80,100) which can simplify the syntax when the job + needs to scale by a certain "chunk" size. + + srun - add user requestable vnis with '--network=job_vni' option. + + srun - add user requestable single node vnis with the + '--network=single_node_vni' option. + * API Changes + + job_container plugins - container_p_stepd_create() function signature + replaced uint32_t uid with stepd_step_rec_t* step. + + gres plugins - gres_g_get_devices() function signature replaced pid_t + pid with stepd_step_rec_t* step. + + cgroup plugins - task_cgroup_devices_constrain() function signature + removed pid_t pid. + + task plugins - replace task_p_pre_set_affinity(), task_p_set_affinity(), + and task_p_post_set_affinity() with task_p_pre_launch_priv() like it + was back in slurm 20.11. + + Allow for concurrent processing of job_submit_g_submit() and + job_submit_g_modify() calls.
If your plugin is not capable of concurrent + operation you must add additional locking within your plugin. + + Removed return value from slurm_list_append(). + + The List and ListIterator types have been removed in favor of list_t + and list_itr_t respectively. + + burst buffer plugins - add bb_g_build_het_job_script(). + bb_g_get_status() - added authenticated UID and GID. + bb_g_run_script() - added job_info argument. + + burst_buffer.lua - Pass UID and GID to most hooks. Pass job_info + (detailed job information) to many hooks. See + etc/burst_buffer.lua.example for a complete list of changes. + WARNING: Backwards compatibility is broken for + slurm_bb_get_status: UID and GID are passed before the variadic + arguments. If UID and GID are not explicitly listed as arguments to + slurm_bb_get_status(), then they will be included in the variadic + arguments. + Backwards compatibility is maintained for all other hooks because + the new arguments are passed after the existing arguments. + + node_features plugins - node_features_p_reboot_weight() function + removed. + node_features_p_job_valid() - added parameter feature_list. + node_features_p_job_xlate() - added parameters feature_list and + job_node_bitmap. + + New data_parser interface with v0.0.39 plugin. +* Added: Fix-test-1.99.patch +* Reworked: Fix-test-38.11.patch + pam_slurm-Initialize-arrays-and-pass-sizes.patch + +------------------------------------------------------------------- +Thu Feb 9 07:54:01 UTC 2023 - Egbert Eich + +- testsuite: on later SUSE versions claim ownership of directory + /etc/security/limits.d. + +------------------------------------------------------------------- +Fri Dec 2 16:39:45 UTC 2022 - Egbert Eich + +- Move the ext_sensors/rrd plugin to a separate package: this + plugin requires librrd which in turn requires huge parts of + the client side X Window System stack. + There is probably no use in cluttering up a system for a + plugin that probably only used by a few. 
+ +------------------------------------------------------------------- +Fri Oct 21 15:14:30 UTC 2022 - Egbert Eich + +- Test Suite fixes: + * Update README_Testsuite.md. + * Clean up left over files when de-installing test suite. + * Adjustment to test suite package: for SLE mark the openmpi4 + devel package and slurm-hdf5 optional. + * Add -ffat-lto-objects to the build flags when LTO is set to + make sure the object files we ship with the test suite still + work correctly (boo#1204697). + * Improve setup-testsuite.sh: copy ssh fingerprints from all nodes. + +------------------------------------------------------------------- +Fri Oct 14 08:49:24 UTC 2022 - Christian Goll + +- updated to 22.05.5 +- NOTE: Slurm validates that libraries are of the same version. Unfortunately, + due to an oversight, we failed to notice that the slurmstepd loads the + hash_k12 library only after a job has completed. This means that if the + hash_k12 library is upgraded before a job finishes, the slurmstepd will load + the new library when the job finishes, and will fail due to a mismatch of + versions. This results in nodes with slurmstepd processes stuck + indefinitely. These processes require manual intervention to clean up. There + is no clean way to resolve these hung slurmstepd processes. + The only recommended way to upgrade between minor versions of 22.05 with + RPM's or upgrades that replace current binaries and libraries is to drain the + nodes of running jobs first. +- Fixes a number of moderate severity issues, notable are: + * Load hash plugin at slurmstepd launch time to prevent issues loading the + plugin at step completion if the Slurm installation is upgraded. + * Update nvml plugin to match the unique id format for MIG devices in new + Nvidia drivers. + * Fix multi-node step launch failure when nodes in the controller aren't in + natural order.
This can happen with inconsistent node naming (such as + node15 and node052) or with dynamic nodes which can register in any order. + * job_container/tmpfs - cleanup containers even when the .ns file isn't + mounted anymore. + * Wait up to PrologEpilogTimeout before shutting down slurmd to allow prolog + and epilog scripts to complete or timeout. Previously, slurmd waited 120 + seconds before timing out and killing prolog and epilog scripts. + +------------------------------------------------------------------- +Sat Sep 24 07:34:31 UTC 2022 - Egbert Eich + +- Do not deduplicate files of testsuite Slurm configuration. + This directory is supposed to be mounted over /etc/slurm + therefore it must not contain softlinks to the files in + this directory. +- Improve .a and .o file collection for test suite: find these + files even if there are multiple ones in a single line. + +------------------------------------------------------------------- +Tue Sep 20 21:12:19 UTC 2022 - Egbert Eich + +- Fix build for older product version. + +------------------------------------------------------------------- +Tue Aug 2 12:43:39 UTC 2022 - Egbert Eich + +- Fix a potential security vulnerability in the test package + (bsc#1201674, CVE-2022-31251). + +------------------------------------------------------------------- +Thu Jul 21 19:20:42 UTC 2022 - Bernhard Wiedemann + +- make slurmtest.tar reproducible + +------------------------------------------------------------------- +Thu Jul 14 15:20:46 UTC 2022 - Egbert Eich + +- Improve check for mpicc in testsuite package: if binary isn't + found, don't crash. +- Patch NOFILE Limit in the slurmd.service copy for the testsuite. + +------------------------------------------------------------------- +Mon Jun 20 09:23:17 UTC 2022 - Christian Goll + +- update to 22.05.2 with following fixes: + * Fix regression which allowed the oversubscription of licenses. + * Fix a segfault in slurmctld when requesting gres in job arrays. 
+ +------------------------------------------------------------------- +Wed Jun 8 13:15:24 UTC 2022 - Egbert Eich + +- Package the Slurm testsuite for QA purposes. + NOTE: This package is not meant to be used for testing by the + user but rather for testing by the maintainers to ensure the + package is working properly. + DO NOT report test suite failures unless you are able to confirm + that the failure is really a bug. + * Fixes for test suite: + Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch + Fix-test-21.41.patch + Fix-test-38.11.patch + Fix-test-32.8.patch + Fix-test-3.13.patch + Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch + * Add documentation: + README_Testsuite.md +- Allow log in as user 'slurm'. This allows admins to run certain + privileged commands more easily without becoming root. + +------------------------------------------------------------------- +Tue May 31 12:56:05 UTC 2022 - Christian Goll + +- update to 22.05.0 with following changes: +- Support for dynamic node addition and removal +- Support for native Linux cgroup v2 operation +- Newly added plugins to support HPE Slingshot 11 networks + (switch/hpe_slingshot), and Intel Xe GPUs (gpu/oneapi) +- Added new acct_gather_interconnect/sysfs plugin to collect statistics + from arbitrary network interfaces. +- Expanded and synced set of environment variables available in the + Prolog/Epilog/PrologSlurmctld/EpilogSlurmctld scripts. +- New "--prefer" option to job submissions to allow for a "soft + constraint" request to influence node selection. +- Optional support for license planning in the backfill scheduler with + "bf_licenses" option in SchedulerParameters. +- removed file slurm-2.4.4-init.patch as sysvinit is now really deprecated +- removed file load-pmix-major-version.patch as fixed upstream +- set environment variable SUSE_ZNOW to 0 in %build to avoid module load + failures due to unresolved symbols as modules take advantage of lazy + bindings (bsc#1200030).
+ +------------------------------------------------------------------- +Tue May 10 10:26:02 UTC 2022 - Egbert Eich + +- Add a comment about the CommunicationParameters=block_null_hash + option warning users who migrate - just in case. + +------------------------------------------------------------------- +Fri May 6 09:33:34 UTC 2022 - Christian Goll + +- Update to 21.08.8 which fixes CVE-2022-29500 (bsc#1199278), + CVE-2022-29501 (bsc#1199279), and CVE-2022-29502 (bsc#1199281). +- Added 'CommunicationParameters=block_null_hash' to slurm.conf, please + add this parameter to existing configurations. + +------------------------------------------------------------------- +Mon May 2 14:12:59 UTC 2022 - Christian Goll + - Update to 21.08.7 with following changes: + * openapi/v0.0.37 - correct calculation for bf_queue_len_mean in /diag. + * Avoid shrinking a reservation when overlapping with downed nodes. + * Only check TRES limits against current usage for TRES requested by the job. + * Do not allocate shared gres (MPS) in whole-node allocations + * Constrain slurmstepd to job/step cgroup like in previous versions of Slurm. + * Fix warnings on 32-bit compilers related to printf() formats. + * Fix reconfigure issues after disabling/reenabling the GANG PreemptMode. + * Fix race condition where a cgroup was being deleted while another step + was creating it. + * Set the slurmd port correctly if multi-slurmd + * Fix FAIL mail not being sent if a job was cancelled due to preemption. + * slurmrestd - move debug logs for HTTP handling to be gated by debugflag + NETWORK to avoid unnecessary logging of communication contents. + * Fix issue with bad memory access when shrinking running steps. + * Fix various issues with internal job accounting with GRES when jobs are + shrunk. + * Fix ipmi polling on slurmd reconfig or restart. + * Fix srun crash when reserved ports are being used and het step fails + to launch. 
+ * openapi/dbv0.0.37 - fix DELETE execution path on /user/{user_name}. + * slurmctld - Properly requeue all components of a het job if PrologSlurmctld + fails. + * rlimits - remove final calls to limit nofiles to 4096 but to instead use + the max possible nofiles in slurmd and slurmdbd. + * Allow the DBD agent to load large messages (up to MAX_BUF_SIZE) from state. + * Fix potential deadlock during slurmctld restart when there is a completing + job. + * slurmstepd - reduce user requested soft rlimits when they are above max + hard rlimits to avoid rlimit request being completely ignored and + processes using default limits. + * Fix Slurm user commands displaying available features as active features + when no features were active. + * Don't power down nodes that are rebooting. + * Clear pending node reboot on power down request. + * Ignore node registrations while node is powering down. + * Don't reboot any node that is power down. + * Don't allow a node to reboot if it's marked for power down. + * Fix issuing reboot and downing when rebooting a powering up node. + * Clear DRAIN on node after failing to resume before ResumeTimeout. + * Prevent repeating power down if node fails to resume before ResumeTimeout. + * Fix federated cloud node communication with srun and cloud_dns. + * Fix jobs being scheduled on nodes marked to be powered_down when idle. + * Fix problem where a privileged user could not view array tasks specified by + _ when PrivateData had the jobs value set. + - Changes in Slurm 21.08.6 + * Fix plugin_name definitions in a number of plugins to improve logging. + * Close sbcast file transfers when job is cancelled. + * scrontab - fix handling of --gpus and --ntasks-per-gpu options. + * sched/backfill - fix job_queue_rec_t memory leak. + * Fix magnetic reservation logic in both main and backfill schedulers. + * job_container/tmpfs - fix memory leak when using InitScript. + * slurmrestd / openapi - fix memory leaks. 
+ * Fix slurmctld segfault due to job array resv_list double free. + * Fix multi-reservation job testing logic. + * Fix slurmctld segfault due to insufficient job reservation parse validation. + * Fix main and backfill schedulers handling for already rejected job array. + * sched/backfill - restore resv_ptr after yielding locks. + * acct_gather_energy/xcc - appropriately close and destroy the IPMI context. + * Protect slurmstepd from making multiple calls to the cleanup logic. + * Prevent slurmstepd segfault at cleanup time in mpi_fini(). + * Fix slurmctld sometimes hanging if shutdown while PrologSlurmctld or + EpilogSlurmctld were running and PrologEpilogTimeout is set in slurm.conf. + * Fix affinity of the batch step if batch host is different than the first + node in the allocation. + * slurmdbd - fix segfault after multiple failover/failback operations. + * Fix jobcomp filetxt job selection condition. + * Fix -f flag of sacct not being used. + * Select cores for job steps according to the socket distribution. Previously, + sockets were always filled before selecting cores from the next socket. + * Keep node in Future state if epilog completes while in Future state. + * Fix erroneous --constraint behavior by preventing multiple sets of brackets. + * Make ResetAccrueTime update the job's accrue_time to now. + * Fix sattach initialization with configless mode. + * Revert packing limit checks affecting pmi2. + * sacct - fixed assertion failure when using -c option and a federation + display + * Fix issue that allowed steps to overallocate the job's memory. + * Fix the sanity check mode of AutoDetect so that it actually works. + * Fix deallocated nodes that didn't actually launch a job from waiting for + Epilogslurmctld to complete before clearing completing node's state. + * Job should be in a completing state if EpilogSlurmctld when being requeued. + * Fix job not being requeued properly if all node epilog's completed before + EpilogSlurmctld finished. 
+ * Keep job completing until EpilogSlurmctld is completed even when "downing" + a node. + * Fix handling reboot with multiple job features. + * Fix nodes getting powered down when creating new partitions. + * Fix bad bit_realloc which potentially could lead to bad memory access. + * slurmctld - remove limit on the number of open files. + * Fix bug where job_state file of size above 2GB wasn't saved without any + error message. + * Fix various issues with no_consume gres. + * Fix regression in 21.08.0rc1 where job steps failed to launch on systems + that reserved a CPU in a cgroup outside of Slurm (for example, on systems + with WekaIO). + * Fix OverTimeLimit not being reset on scontrol reconfigure when it is + removed from slurm.conf. + * serializer/yaml - use dynamic buffer to allow creation of YAML outputs + larger than 1MiB. + * Fix minor memory leak affecting openapi users at process termination. + * Fix batch jobs not resolving the username when nss_slurm is enabled. + * slurmrestd - Avoid slurmrestd ignoring invalid HTTP method if the response + serialized without error. + * openapi/dbv0.0.37 - Correct conditional that caused the diag output to + give an internal server error status on success. + * Make --mem-bind=sort work with task_affinity + * Fix sacctmgr to set MaxJobsAccruePer{User|Account} and MinPrioThres in + sacctmgr add qos, modify already worked correctly. + * job_container/tmpfs - avoid printing extraneous error messages in Prolog + and Epilog, and when the job completes. + * Fix step CPU memory allocation with --threads-per-core without --exact. + * Remove implicit --exact when --threads-per-core or --hint=nomultithread + is used. + * Do not allow a step to request more threads per core than the + allocation did. + * Remove implicit --exact when --cpus-per-task is used. 
+ +------------------------------------------------------------------- +Wed Dec 22 09:24:28 UTC 2021 - Christian Goll + +- update to 21.08.5 with following changes: + * Fix issue where typeless GRES node updates were not immediately reflected. + * Fix setting the default scrontab job working directory so that it's the home + of the different user (-u user) and not that of root or SlurmUser editor. + * Fix stepd not respecting SlurmdSyslogDebug. + * Fix concurrency issue with squeue. + * Fix job start time not being reset after launch when job is packed onto + already booting node. + * Fix updating SLURM_NODE_ALIASES for jobs packed onto powering up nodes. + * Cray - Fix issues with starting hetjobs. + * auth/jwks - Print fatal() message when jwks is configured but file could + not be opened. + * If sacctmgr has an association with an unknown qos as the default qos + print 'UNKN*###' instead of leaving a blank name. + * Correctly determine task count when giving --cpus-per-gpu, --gpus and + --ntasks-per-node without task count. + * slurmctld - Fix places where the global last_job_update was not being set + to the time of update when a job's reason and description were updated. + * slurmctld - Fix case where a job submitted with more than one partition + would not have its reason updated while waiting to start. + * Fix memory leak in node feature rebooting. + * Fix time limit permanently set to 1 minute by backfill for job array tasks + higher than the first with QOS NoReserve flag and PreemptMode configured. + * Fix sacct -N to show jobs that started in the current second + * Fix issue on running steps where both SLURM_NTASKS_PER_TRES and + SLURM_NTASKS_PER_GPU are set. + * Handle oversubscription request correctly when also requesting + --ntasks-per-tres. + * Correctly detect when a step requests bad gres inside an allocation. + * slurmstepd - Correct possible deadlock when UnkillableStepTimeout triggers.
+ * srun - use maximum number of open files while handling job I/O. + * Fix writing to Xauthority files on root_squash NFS exports, which was + preventing X11 forwarding from completing setup. + * Fix regression in 21.08.0rc1 that broke --gres=none. + * Fix srun --cpus-per-task and --threads-per-core not implicitly setting + --exact. It was meant to work this way in 21.08. + * Fix regression in 21.08.0 that broke dynamic future nodes. + * Fix dynamic future nodes remembering active state on restart. + * Fix powered down nodes getting stuck in COMPLETING+POWERED_DOWN when job is + cancelled before nodes are powering up. + + + +------------------------------------------------------------------- +Wed Nov 17 08:33:13 UTC 2021 - Christian Goll + +- updated to 21.08.4 which fixes (CVE-2021-43337) which is only present + in 21.08 tree. + * CVE-2021-43337: + For sites using the new AccountingStoreFlags=job_script and/or job_env + options, an issue was reported with the access control rules in SlurmDBD + that will permit users to request job scripts and environment files that + they should not have access to. (Scripts/environments are meant to only be + accessible by user accounts with administrator privileges, by account + coordinators for jobs submitted under their account, and by the user + themselves.) +- changes from 21.08.3: + * This includes a number of fixes since the last release a month ago, + including one critical fix to prevent a communication issue between + slurmctld and slurmdbd for sites that have started using the new + AccountingStoreFlags=job_script functionality. + +------------------------------------------------------------------- +Fri Oct 29 15:54:53 UTC 2021 - Egbert Eich + +- Utilize sysuser infrastructure to set user/group slurm. + For munge authentication slurm should have a fixed UID across + all nodes including the management server. Set it to 120 +- Limit firewalld service definitions to SUSE versions >= 15.
+ +------------------------------------------------------------------- +Mon Oct 18 13:36:14 UTC 2021 - Christian Goll + +- added service definitions for firewalld (JSC#SLE-22741) + +------------------------------------------------------------------- +Wed Oct 6 07:12:52 UTC 2021 - Christian Goll + +- update to 21.08.2 +- major change: + * removed support for the TaskAffinity=yes option in cgroup.conf. Please + consider using "TaskPlugins=cgroup,affinity" in slurm.conf as an option. +- minor changes and bugfixes: + * slurmctld - fix how the max number of cores on a node in a partition are + calculated when the partition contains multi-socket nodes. This in turn + corrects certain jobs node count estimations displayed client-side. + * job_submit/cray_aries - fix "craynetwork" GRES specification after changes + introduced in 21.08.0rc1 that made TRES always have a type prefix. + * Ignore nonsensical check in the slurmd for [Pro|Epi]logSlurmctld. + * Fix writing to stderr/syslog when systemd runs slurmctld in the foreground. + * Fix issue with updating job started with node range. + * Fix issue with nodes not clearing state in the database when the slurmctld + is started with clean-start. + * Fix hetjob components > 1 timing out due to InactiveLimit. + * Fix sprio printing -nan for normalized association priority if + PriorityWeightAssoc was not defined. + * Disallow FirstJobId=0. + * Preserve job start info in the database for a requeued job that hadn't + registered the first time in the database yet. + * Only send one message on prolog failure from the slurmd. + * Remove support for TaskAffinity=yes in cgroup.conf. + * accounting_storage/mysql - fix issue where querying jobs via sacct + --whole-hetjob=yes or slurmrestd (which automatically includes this flag) + could in some cases return more records than expected. + * Fix issue for preemption of job array task that makes afterok dependency + fail. Additionally, send emails when requeueing happens due to preemption.
+ * Fix sending requeue mail type. + * Properly resize a job's GRES bitmaps and counts when resizing the job. + * Fix node being able to transition to CLOUD state from non-cloud state. + * Fix regression introduced in 21.08.0rc1 which broke a step's ability to + inherit GRES from the job when the step didn't request GRES but the job did. + * Fix errors in logic when picking nodes based on bracketed anded constraints. + This also enforces the requirement to have a count when using such + constraints. + * Handle job resize better in the database. + * Exclude currently running, resized jobs from the runaway jobs list. + * Make it possible to shrink a job more than once. + + +------------------------------------------------------------------- +Tue Sep 28 15:53:38 UTC 2021 - Christian Goll + +- moved pam module from /lib64 to /usr/lib64 which fixes boo#1191095 + via the macro %_pam_moduledir + +------------------------------------------------------------------- +Fri Sep 17 07:22:44 UTC 2021 - Christian Goll + +- updated to 21.08.1 with following bug fixes: + * Fix potential memory leak if a problem happens while allocating GRES for + a job. + * If an overallocation of GRES happens terminate the creation of a job. + * AutoDetect=nvml: Fatal if no devices found in MIG mode. + * Print federation and cluster sacctmgr error messages to stderr. + * Fix off by one error in --gpu-bind=mask_gpu. + * Add --gpu-bind=none to disable gpu binding when using --gpus-per-task. + * Handle the burst buffer state "alloc-revoke" which previously would not + display in the job correctly. + * Fix issue in the slurmstepd SPANK prolog/epilog handler where configuration + values were used before being initialized. + * Restore a step's ability to utilize all of an allocations memory if --mem=0. + * Fix --cpu-bind=verbose garbage taskid. + * Fix cgroup task affinity issues from garbage taskid info. + * Make gres_job_state_validate() client logging behavior as before 44466a4641. 
+ * Fix steps with --hint overriding an allocation with --threads-per-core. + * Require requesting a GPU if --mem-per-gpu is requested. + * Return error early if a job is requesting --ntasks-per-gpu and no gpus or + task count. + * Properly clear out pending step if unavailable to run with available + resources. + * Kill all processes spawned by burst_buffer.lua including descendants. + * openapi/v0.0.{35,36,37} - Avoid setting default values of min_cpus, + job name, cwd, mail_type, and contiguous on job update. + * openapi/v0.0.{35,36,37} - Clear user hold on job update if hold=false. + * Prevent CRON_JOB flag from being cleared when loading job state. + * sacctmgr - Fix deleting WCKeys when not specifying a cluster. + * Fix getting memory for a step when the first node in the step isn't the + first node in the allocation. + * Make SelectTypeParameters=CR_Core_Memory default for cons_tres and cons_res. + * Correctly handle mutex unlocks in the gres code if failures happen. + * Give better error message if -m plane is given with no size. + * Fix --distribution=arbitrary for salloc. + * Fix jobcomp/script regression introduced in 21.08.0rc1 0c75b9ac9d. + * Only send the batch node in the step_hostlist in the job credential. + * When setting affinity for the batch step don't assume the batch host is node + 0. + * In task/affinity better checking for node existence when laying out + affinity. + * slurmrestd - fix job submission with auth/jwt. + +- removed Fix-statement-condition-in-netloc-autoconf-macro.patch + issue was fixed upstream + +------------------------------------------------------------------- +Mon Sep 6 15:34:06 UTC 2021 - Egbert Eich + +- Fix-statement-condition-in-netloc-autoconf-macro.patch: + Fix netloc check, reestablish netloc disable code. +- Make configure arg '--with-pmix' conditional. +- Move openapi plugins to package slurm-restd. 
+ +------------------------------------------------------------------- +Thu Sep 2 13:19:33 UTC 2021 - Christian Goll + +- updated to 21.08.0, major changes: + * A new "AccountingStoreFlags=job_script" option to store the job scripts + directly in SlurmDBD. + * Added "sacct -o SubmitLine" format option to get the submit line + of a job/step. + * Changes to the node state management so that nodes are marked as PLANNED + instead of IDLE if the scheduler is still accumulating resources while + waiting to launch a job on them. + * RS256 token support in auth/jwt. + * Overhaul of the cgroup subsystems to simplify operation, mitigate a number + of inherent race conditions, and prepare for future cgroup v2 support. + * Further improvements to cloud node power state management. + * A new child process of the Slurm controller called "slurmscriptd" + responsible for executing PrologSlurmctld and EpilogSlurmctld scripts, + which significantly reduces performance issues associated with enabling + those options. + * A new burst_buffer/lua plugin allowing for site-specific asynchronous job + data management. + * Fixes to the job_container/tmpfs plugin to allow the slurmd process to be + restarted while the job is running without issue. + * Added json/yaml output to sacct, squeue, and sinfo commands. + * Added a new node_features/helpers plugin to provide a generic way to change + settings on a compute node across a reboot. + * Added support for automatically detecting and broadcasting shared libraries + for an executable launched with "srun --bcast". + * Added initial OCI container execution support with a new --container option + to sbatch and srun. + * Improved "configless" support by allowing multiple control servers to be + specified through the slurmd --conf-server option, and send additional + configuration files at startup including cli_filter.lua. +- minor changes: + * If an overallocation of GRES happens terminate the creation of a job. 
+ * AutoDetect=nvml: Fatal if no devices found in MIG mode. + * Print federation and cluster sacctmgr error messages to stderr. + * Add --gpu-bind=none to disable gpu binding when using --gpus-per-task. + * Handle the burst buffer state "alloc-revoke" which previously would not + display in the job correctly. + * Fix issue in the slurmstepd SPANK prolog/epilog handler where configuration + values were used before being initialized. + * Restored --gpu-bind=single: to check core affinity like + *-gpu-bind=closest does. This removal of this behavior only was in rc2. + * slurmd - Fix assert failure on initialization due to bad node name. + * Fix error codes in cgroup/v1. + * Don't destroy the memory step outside fini, which leads to a double destroy + causing an error message. + * Add support for lua 5.4. + * Force cgroup.clone_children to 0 in slurm cgroup directories. This caused + issues in task cpuset plugin in systems with it enabled by default. + * Clear GRES HAS_TYPE flag when removing type name. + * Environment flags in gres.conf now override flags set by AutoDetect. + * Environment flags in gres.conf now apply to subsequent gres.conf lines where + Environment flags are not set. + * Set missing job_uid and job_gid members when preparing a kill_job_msg_t in + abort_job_on_node(), abort_job_on_nodes() and kill_job_on_node(). + * Fix swappiness not being set in cgroups. + * Fix coordinators for new subaccounts. + * Fix coordinators when adding existing users with PrivateData=users. + * slurmctld - do not attempt to relinquish control to self. + * openapi/v0.0.37 - Honor kill_on_invalid_dependency as job parameter. + * Check max_gres when doing step allocation, fix for regression in rc2. + * SPANK plugins are now required to match the current Slurm version, and must + be recompiled for each new Slurm release. + * node_features/helpers - add ExecTime configuration option. + * srun - Fix force termination with -X. 
+ * On slurmctld restart set node typed GRES counts correctly. + * Fix places where a step wasn't allocated in the slurmctld but wasn't ever + removed from the job. + * Fix step allocation memory when using --threads-per-core. + * Fix step allocations to consume all threads on a core when using + threads-per-core. + * Add check to validate cpu request on a step if --threads-per-core is given + and it is less than what the core on the node has in the allocation. + * Fix issue where a step could request more gres than the job had and the step + would hang forever. This bug was only introduced in 21.08.0rc2. + * Only print \r\n for logging messages on stderr when --pty has been + explicitly requested. + * Relax check on SPANK plugins to only require Slurm major + minor versions + to match. + * job_container/tmpfs - delegate handling of /dev/shm to the extern step + so new step launches will be attached correctly even after the slurmd + process has been restarted. + * Limit the wait time in proctrack_g_wait() to UnkillableStepTimeout instead + of a hardcoded value of 256 seconds, and limit the delay between tests to a + maximum of 32 seconds. + * fatal() on start if using job_container/tmpfs without PrologFlags=Contain. + * Load bf_when_last_cycle from job state only if protocol version >= 21.08. + * Docs - remove man3 section entirely. + * Set step memory when using MemPerGPU or DefMemPerGPU. Previously a step's + memory was not set even when it requested --mem-per-gpu and at least one + GPU. + * Add cli_filter.lua support in configless mode. + * Check that the step requests at least as many gres as nodes. + * Make job's SLURM_JOB_GPUS print global GPU IDs instead of MIG unique_ids. + * Fix miscounting of GPU envs in prolog/epilog if MultipleFiles was used. + * Support MIGs in prolog/epilog's CUDA_VISIBLE_DEVICES & co. + * Add SLURM_JOB_GPUS back into Prolog; add it to Epilog. 
+ * Fix issue where the original executable, not the bcast'd version, was + executed with 'srun --bcast'. + * sacct - print '-' header correctly for fields over 53-characters wide. + * openapi/dbv0.0.37 - replace "REST" with "Slurm OpenAPI" for plugin_name. + * openapi/v0.0.37 - replace "REST" with "Slurm OpenAPI" for plugin_name. + * configless - fix segfault on 'scontrol reconfigure'. + * Use FREE_NULL_LIST instead of list_destroy. + * If we are running an interactive session we need to force track_steps. + * Disable OPOST flag when using --pty to avoid issues with Emacs. + * Fix issue where extra bonus core was allocated in some situations. + * Avoid putting gres with count of 0 on a TRES req/alloc. + * Fix memory in requested TRES when --mem-per-gpu is used. + * Changed ReqMem field in sacct to match memory from ReqTRES. + * Changed --gpu-bind=single: to no longer check core affinity like + --gpu-bind=closest does. This consequently affects --ntasks-per-gpu. + * slurmrestd - add v0.0.37 OpenAPI plugin. + * slurmrestd/v0.0.37 - rename standard_in -> standard_input. + * slurmrestd/v0.0.37 - rename standard_out -> standard_output. + * Changed the --format handling for negative field widths (left justified) + to apply to the column headers as well as the printed fields. + * Add LimitFactor to the QOS. A float that is factored into an associations + [Grp|Max]TRES limits. For example, if the LimitFactor is 2, then an + association with a GrpTRES of 30 CPUs, would be allowed to allocate 60 + CPUs when running under this QOS. + * slurmrestd - Pass SLURM_NO_CHANGE_IN_DATA to client as 403 (Not Modified). + * slurmrestd/v0.0.37 - Add update_time field to Jobs query to allow clients + to only get jobs list based on change timestamp. + * Reset job eligible time when job is manually held. + * Add DEBUG_FLAG_JAG to improve logging related to job account gathering. + * Convert logging in account_gather/common to DEBUG_FLAG_JAG. 
+ * Add more logging for jag_common_poll_data() when prec_extra() called. + * slurmrestd/v0.0.37 - add API to fetch reservation(s) info. + * Catch more errors in task/cgroup initialization and cleanup to avoid allowing + jobs to start when cgroups fail to configure correctly. + * Fix cgroup ns detection when using containers (e.g. LXC or Docker). + * Reset job's next_step_id counter to 0 after being requeued. + * Make scontrol exit with non-zero status after failing to delete a partition + or reservation. + * Make NtasksPerTRES optional in slurm_sprint_job_info(). + * slurmrestd/v0.0.37 - Add update_time field to nodes query to allow clients + to only get nodes list based on change timestamp. + * common/parse_config - catch and propagate return codes when handling a match + on a key-value pattern. This implies error codes detected in the handlers + are now not ignored and users of _handle_keyvalue_match() can fatal(). + * common/hostlist - fix hostlist_delete_nth() xassert() upper bound check. + * API change: Removed slurm_kill_job_msg and modified the function signature + for slurm_kill_job2. slurm_kill_job2 should be used instead of + slurm_kill_job_msg. + * Fix non-zero exit code for scontrol ping when all controllers are down. + * Enforce a valid configuration for AccountingStorageEnforce in slurm.conf. + If the configuration is invalid, then an error message will be printed and + the command or daemon (including slurmctld) will not run. + * slurmrestd/v0.0.37 - Add update_time field to partitions/reservations query + to allow clients to only get the entities list when something changed. + * slurmdbd.service - add "After" relationship to all common names for MariaDB + to reduce startup delays. + * slurmrestd/v0.0.37 - Correct displaying node states that are UNKNOWN. + * slurmrestd/v0.0.37 - Add flags to node states. + * Fix first job on fresh cluster not being assigned JobId=1 (or FirstJobId). + * squeue - make it so --nodelist is sensitive to --clusters. 
+ * squeue - do --nodelist node validation in the same order as listing. + * Removed AccountingStoreJobComment option. Please update your config to use + AccountingStoreFlags=job_comment instead. + * AccountingStoreFlags=job_script allows you to store the job's batch script. + * AccountingStoreFlags=job_env allows you to store the job's env vars. + * Add sacct -o SubmitLine to get the submit line of a job/step. + * Removed DefaultStorage{Host,Loc,Pass,Port,Type,User} options. + * Fix NtasksPerTRES delimiter from : to = in scontrol show job output. + * Removed CacheGroups, CheckpointType, JobCheckpointDir, MemLimitEnforce, + SchedulerPort, SchedulerRootFilter options. + * Make job accounting queries use consistent timeframes with and w/o jobs. + * --cpus-per-task and --threads-per-core now imply --exact. + This fixes issues where steps would be allocated the wrong number of CPUs. + * configure: the --with option handling has been made consistent across the + various optional libraries. Specifying *-with-foo=/path/to/foo will only + check that directory for the applicable library (rather than, in some cases, + falling back to the default directories), and will always error the build + if the library is not found (instead of a mix of error messages and non- + fatal warning messages). + * configure: replace --with-rmsi_dir option with proper handling for + *-with-rsmi=dir. + * Pass additional job environment variables to MailProg. + * Add SLURM_JOB_WORK_DIR to Prolog, Epilog. + * Removed sched/hold plugin. + * Fix srun overwriting SLURM_SUBMIT_DIR and SLURM_SUBMIT_HOST when within an + existing allocation. + * step_ctx code has been removed from the api. + * cli_filter/lua, jobcomp/lua, job_submit/lua now load their scripts from the + same directory as the slurm.conf file (and thus now will respect changes + to the SLURM_CONF environment variable). + * SPANK - call slurm_spank_init if defined without slurm_spank_slurmd_exit in + slurmd context. 
+ * job_container/tmpfs - Remove need for .active file to allow salloc without + an interactive step to work. + * slurmd - Delay background node registration on every failure up to 128s on + startup. + * slurmctld - Always notify slurmd that node registration was accepted to + avoid slurmd needless attempting to re-register if there is configuration + issue. + * Put node into "INVAL" state upon registering with an invalid node + configuration. Node must register with a valid configuration to continue. + * Make --cpu-bind=threads default for --threads-per-core -- cli and env can + override. + * jobcomp/elasticsearch - Use data_t to serialize data. The plugin now has the + JSON-C library as a prerequisite. + * scrontab - create the temporary file under the TMPDIR environment variable + (if set), otherwise continue to use TmpFS as configured in slurm.conf. + * Add LastBusyTime to "scontrol show nodes" and slurmrestd nodes output, + which represents the time the node last had jobs on it. + * slurmd - allow multiple comma-separated controllers to be specified in + configless mode with *-conf-server + * sacctmgr - changed column headings to "ParentID" and "ParentName" instead + of "Par ID" and "Par Name" respectively. + * Perl API - make sure man pages are installed under the --prefix given to + configure. + * Manually powering down of nodes with scontrol now ignores + SuspendExc. + * SALLOC_THREADS_PER_CORE and SBATCH_THREADS_PER_CORE have been added as + input environment variables for salloc and sbatch, respectively. They do + the same thing as *-threads-per-core. + * Distinguish queued reboot requests (REBOOT) from issued reboots (REBOOT^). + * Set the maximum number of open files per process to 4096 to avoid + performance issues when closing the entire range with closeall(). + * auth/jwt - add support for RS256 tokens. + * Relax reservation purge due to any invalid uid after creation time. + * Reject srun that requests both --exclusive and --overlap. 
+ * service files - change dependency to network-online rather than just + network to ensure DNS and other services are available. + * RSMI: Fix incorrect PCI BDF bits. + * plugins/cli_filter - Convert to using data_t to serialize JSON. + * Fix testing array job after regaining locks in backfill. + * Don't display node's comment with "scontrol show nodes" unless set. + * Add "Extra" field to node to store extra information other than a comment. + * scrontab - Use /tmp instead of TmpFS if TMPDIR is not set. + * Add ResumeTimeout, SuspendTimeout and SuspendTime to Partitions. + * sreport - change to sorting TopUsage by the --tres option. + * slurmrestd - do not run allow operation as SlurmUser/root by default. + * Allow map_cpu and mask_cpu for non-whole node allocation. + * TaskPluginParam=verbose is now treated as a default. Previously it would be + applied regardless of the job specifying a *-cpu-bind. + * Add "node_reg_mem_percent" SlurmctldParameter to define percentage of + memory nodes are allowed to register with. + * Show correct number of SocketsPerBoard in slurmd -C with hwloc2. + * Alter sreport's cluster utilization report column name from + 'Reserved' to 'Planned' to match the nomenclature of the 'Planned' node. + * Add StateComplete format option to sinfo to show base_state+flags. + * "scontrol show node" now shows State as base_state+flags instead of + shortened state with flags appended. eg. IDLE# *> IDLE+POWERING_UP. + Also "POWER" state flag string is "POWERED_DOWN". + * slurmd/req - add missing job_env_t's het_job_id initialization off the + request in _rpc_{abort,terminate}_job(). This caused problems for Native + Cray builds when joining a CNCU job_container plugin with Epilog configured. + * Fix joining a CNCU job_container on a Native Cray build before executing the + UnkillableStepProgram for a HetJob step. + * slurmrestd/v0.0.35 - Plugin has been tagged as deprecated. 
+ * srun - Job steps requiring more cores than available to be rejected unless + '--overlap' is specified. + * Add bf_node_space_size to SchedulerParameters. + * Add scontrol update node state=POWER_DOWN_FORCE and POWER_DOWN_ASAP as new + ways to power off and reset especially CLOUD nodes. + * Define and separate node power state transitions. Previously a powering + down node was in both states, POWERING_OFF and POWERED_OFF. These are now + separated. + * Create a new process called slurmscriptd which runs PrologSlurmctld and + EpilogSlurmctld. This avoids fork() calls from slurmctld, and can avoid + performance issues if the slurmctld has a large memory footprint. + * Added new Script option to DebugFlags for debugging slurmscriptd. + * scrontab - add ability to update crontab from a file or standard input. + * scrontab - add ability to set and expand variables. + * Pass JSON of job to node mappings to ResumeProgram. + * If running steps in an allocation with CR_PACK_NODE or -mpack the srun will + only attempt to allocate as much as needed from the allocation instead + of always trying to allocate every node in the allocation. + * Jobs that request the whole node now check to see if any gres are allocated. + * Rename SbcastParameters to BcastParameters. + * Make srun sensitive to BcastParameters. + * RSMI: Add gres_links_create_empty() and preserve RSMI enumeration order. + * GPUs: Use index instead of dev_num for CUDA_VISIBLE_DEVICES + * Don't run epilog on nodes if job never launched. + * QOS accrue limits only apply to the job QOS, not partition QOS. + * Add --gpu-bind=per_task: option, --gpus-per-task will now + set this option by default. + * Treat any return code from SPANK plugin that is not SLURM_SUCCESS to be an + error or rejection. + * Print the statistics for extern step adopted processes in sstat. + * Fix SLURM_NODE_ALIASES to work for ipv6 node addrs. 
+ * Add support for automatically detecting and broadcasting executable shared + object dependencies for sbcast and srun *-bcast. + +------------------------------------------------------------------- +Fri Jul 2 08:01:32 UTC 2021 - Christian Goll + +- Updated to 20.11.8: + * slurmctld - fix erroneous "StepId=CORRUPT" messages in error logs. + * Correct the error given when auth plugin fails to pack a credential. + * acct_gather_filesystem/lustre - only emit collection error once per step. + * Add GRES environment variables (e.g., CUDA_VISIBLE_DEVICES) into the + interactive step, the same as is done for the batch step. + * Fix various potential deadlocks when altering objects in the database + dealing with every cluster in the database. + * slurmrestd: + - handle slurmdbd connection failures without segfaulting. + - fix segfault for searches in slurmdb/v0.0.36/jobs. + - remove (non-functioning) users query parameter for + slurmdb/v0.0.36/jobs from openapi.json + - fix segfault in slurmrestd db/jobs with numeric queries + - add argv handling for job/submit endpoint. + - add description for slurmdb/job endpoint. + * slurmrestd/dbv0.0.36: + - Fix values dumped in job state/current and + job step state. + - Correct description for previous state property. + * srun: + - fix broken node step allocation in a heterogeneous allocation. + - leave SLURM_DIST_UNKNOWN as default for --interactive. + * Fail step creation if -n is not multiple of --ntasks-per-gpu. + * job_container/tmpfs - Fix slowdown on teardown. + * Fix problem with SlurmctldProlog where requeued jobs would never launch. + * job_container/tmpfs - Fix issue when restarting slurmd where the namespace + mount points could disappear. + * sacct: + - avoid truncating JobId at 34 characters. + - fix segfault when printing StepId (or when using --long). + * scancel - fix segfault when --wckey filtering option is used. + * select/cons_tres - Fix memory leak. 
+ * Prevent file descriptor leak in job_container/tmpfs on slurmd restart. + * perlapi/libslurmdb - expose tres_req_str to job hash. + * scrontab - close and reopen temporary crontab file to deal with editors + that do not change the original file, but instead write out then rename + a new file. + * sstat - fix linking so that it will work when --without-shared-libslurm + was used to build Slurm. + * Clear allocated cpus for running steps in a job before handling requested + nodes on new step. + * Don't reject a step if not enough nodes are available. Instead, defer the + step until enough nodes are available to satisfy the request. + * Don't reject a step if it requests at least one specific node that is + already allocated to another step. Instead, defer the step until the + requested node(s) become available. + * Better handling of --mem=0. + * Ignore DefCpuPerGpu when --cpus-per-task given. + + +------------------------------------------------------------------- +Fri May 14 10:07:04 UTC 2021 - Christian Goll + +- Updated to 20.11.7 which fixes CVE-2021-31215 (bsc#1186024) +- New features in 20.11.7: + * slurmd - handle configless failures gracefully instead of hanging + indefinitely. + * select/cons_tres - fix Dragonfly topology not selecting nodes in the same + leaf switch when it should as well as requests with *-switches option. + * Fix issue where certain step requests wouldn't run if the first node in the + job allocation was full and there were idle resources on other nodes in + the job allocation. + * Fix deadlock issue with Slurmctld. + * torque/qstat - fix printf error message in output. + * When adding associations or wckeys avoid checking multiple times a user or + cluster name. + * Fix wrong jobacctgather information on a step on multiple nodes + due to timeouts sending its the information gathered on its node. + * Fix missing xstrdup which could result in slurmctld segfault on array jobs. 
+ * Fix security issue in PrologSlurmctld and EpilogSlurmctld by always + prepending SPANK_ to all user-set environment variables. CVE-2021-31215. +- New features in 20.11.6: + * Fix sacct assert with the --qos option. + * Use pkg-config --atleast-version instead of --modversion for systemd. + * common/fd - fix getsockopt() call in fd_get_socket_error(). + * Properly handle the return from fd_get_socket_error() in _conn_readable(). + * cons_res - Fix issue where running jobs were not taken into consideration + when creating a reservation. + * Avoid a deadlock between job_list for_each and assoc QOS_LOCK. + * Fix TRESRunMins usage for partition qos on restart/reconfig. + * Fix printing of number of tasks on a completed job that didn't request + tasks. + * Fix updating GrpTRESRunMins when decrementing job time is bigger than it. + * Make it so we handle multithreaded allocations correctly when doing + --exclusive or --core-spec allocations. + * Fix incorrect round-up division in _pick_step_cores + * Use appropriate math to adjust cpu counts when --ntasks-per-core=1. + * cons_tres - Fix consideration of power downed nodes. + * cons_tres - Fix DefCpuPerGPU, increase cpus-per-task to match with + gpus-per-task * cpus-per-gpu. + * Fix under-cpu memory auto-adjustment when MaxMemPerCPU is set. + * Make it possible to override CR_CORE_DEFAULT_DIST_BLOCK. + * Perl API - fix retrieving/storing of slurm_step_id_t in job_step_info_t. + * Recover state of burst buffers when slurmctld is restarted to avoid skipping + burst buffer stages. + * Fix race condition in burst buffer plugin which caused a burst buffer + in stage-in to not get state saved if slurmctld stopped. + * auth/jwt - print an error if jwt_file= has not been set in slurmdbd. + * Fix RESV_DEL_HOLD not being a valid state when using squeue --states. + * Add missing squeue selectable states in valid states error message. + * Fix scheduling last array task multiple times on error, causing segfault. 
+ * Fix issue where a step could be allocated more memory than the job when + dealing with --mem-per-cpu and --threads-per-core. + * Fix removing qos from assoc with -= can lead to assoc with no qos + * auth/jwt - fix segfault on invalid credential in slurmdbd due to + missing validate_slurm_user() function in context. + * Fix single Port= not being applied to range of nodes in slurm.conf + * Fix Jobs not requesting a tres are not starting because of that tres limit. + * acct_gather_energy/rapl - fix AveWatts calculation. + * job_container/tmpfs - Fix issues with cleanup and slurmd restarting on + running jobs. + +------------------------------------------------------------------- +Mon May 3 16:09:44 UTC 2021 - Egbert Eich + +- Ship REST API version and auth plugins with slurmrestd. +- Add YAML support for REST API to build (bsc#1185603). + +------------------------------------------------------------------- +Wed Mar 17 08:55:58 UTC 2021 - Christian Goll + +- Update to 20.11.5: +- New features: + * New job_container/tmpfs plugin developed by NERSC that can be used to + create per-job filesystem namespaces. Documentation and configuration + can be found in the respective man page. +- Bug fixes: + * Fix main scheduler bug where bf_hetjob_prio truncates SchedulerParameters. + * Fix sacct not displaying UserCPU, SystemCPU and TotalCPU for large times. + * scrontab - fix to return the correct index for a bad #SCRON option. + * scrontab - fix memory leak when invalid option found in #SCRON line. + * Add errno for when a user requests multiple partitions and they are using + partition based associations. + * Fix issue where a job could run in a wrong partition when using + EnforcePartLimits=any and partition based associations. + * Remove possible deadlock when adding associations/wckeys in multiple + threads. + * When using PrologFlags=alloc make sure the correct Slurm version is set + in the credential. 
+ * When sending a job a warning signal make sure we always send SIGCONT + beforehand. + * Fix issue where a batch job would continue running if a prolog failed on a + node that wasn't the batch host and requeuing was disabled. + * Fix issue where sometimes salloc/srun wouldn't get a message about a prolog + failure in the job's stdout. + * Requeue or kill job on a prolog failure when PrologFlags is not set. + * Fix race condition causing node reboots to get requeued before + ResumeTimeout expires. + * Preserve node boot_req_time on reconfigure. + * Preserve node power_save_req_time on reconfigure. + * Fix node reboots being queued and issued multiple times and preventing the + reboot to time out. + * Fix run_command to exit correctly if track_script kills the calling thread. + * Only requeue a job when the PrologSlurmctld returns nonzero. + * When a job is signaled with SIGKILL make sure we flush all + prologs/setup scripts. + * Handle burst buffer scripts if the job is canceled while stage_in is + happening. + * When shutting down the slurmctld make note to ignore error message when + we have to kill a prolog/setup script we are tracking. + * scrontab - add support for the --open-mode option. + * acct_gather_profile/influxdb - avoid segfault on plugin shutdown if setup + has not completed successfully. + * Reduce delay in starting salloc allocations when running with prologs. + * Alter AllocNodes check to work if the allocating node's domain doesn't + match the slurmctld's. This restores the pre-20.11 behavior. + * Fix slurmctld segfault if jobs from a prior version had the now-removed + INVALID_DEPEND state flag set and were allowed to run in 20.11. + * Add job_container/tmpfs plugin to give a method to provide a private /tmp + per job. + * Set the correct core affinity when using AutoDetect. + * slurmrestd - mark "environment" as required for job submissions in schema. 
+ +------------------------------------------------------------------- +Tue Feb 23 16:24:16 UTC 2021 - Christian Goll + +- Update to 20.11.04 + * Fix node selection for advanced reservations with features. + * mpi/pmix: Handle pipe failure better when using ucx. + * mpi/pmix: include PMIX_NODEID for each process entry. + * Fix job getting rejected after being requeued on same node that died. + * job_submit/lua - add "network" field. + * Fix situations when a recurring reservation could erroneously skip a + period. + * Ensure that a reservations [pro|epi]log are run on recurring reservations. + * Fix threads-per-core memory allocation issue when using CR_CPU_MEMORY. + * Fix scheduling issue with --gpus. + * Fix gpu allocations that request --cpus-per-task. + * mpi/pmix: fixed print messages for all PMIXP_* macros + * Add mapping for XCPU to --signal option. + * Fix regression in 20.11 that prevented a full pass of the main scheduler + from ever executing. + * Work around a glibc bug in which "0" is incorrectly printed as "nan" + which will result in corrupted association state on restart. + * Fix regression in 20.11 which made slurmd incorrectly attempt to find the + parent slurmd address when not applicable and send incorrect reverse-tree + info to the slurmstepd. + * Fix cgroup ns detection when using containers (e.g. LXC or Docker). + * scrontab - change temporary file handling to work with emacs. +- Removed check-for-lipmix.so.MAJOR.patch +- Added: load-pmix-major-version.patch + + +------------------------------------------------------------------- +Wed Jan 20 10:13:23 UTC 2021 - Ana Guerrero Lopez + +- Update to 20.11.03 +- This release includes a major functional change to how job step launch is + handled compared to the previous 20.11 releases. This affects srun as + well as MPI stacks - such as Open MPI - which may use srun internally as + part of the process launch. 
+ One of the changes made in the Slurm 20.11 release was to the semantics + for job steps launched through the 'srun' command. This also + inadvertently impacts many MPI releases that use srun underneath their + own mpiexec/mpirun command. + For 20.11.{0,1,2} releases, the default behavior for srun was changed + such that each step was allocated exactly what was requested by the + options given to srun, and did not have access to all resources assigned + to the job on the node by default. This change was equivalent to Slurm + setting the --exclusive option by default on all job steps. Job steps + desiring all resources on the node needed to explicitly request them + through the new '--whole' option. + In the 20.11.3 release, we have reverted to the 20.02 and older behavior + of assigning all resources on a node to the job step by default. + This reversion is a major behavioral change which we would not generally + do on a maintenance release, but is being done in the interest of + restoring compatibility with the large number of existing Open MPI (and + other MPI flavors) and job scripts that exist in production, and to + remove what has proven to be a significant hurdle in moving to the new + release. + Please note that one change to step launch remains - by default, in + 20.11 steps are no longer permitted to overlap on the resources they + have been assigned. If that behavior is desired, all steps must + explicitly opt-in through the newly added '--overlap' option. + Further details and a full explanation of the issue can be found at: + https://bugs.schedmd.com/show_bug.cgi?id=10383#c63 +- Other changes from 20.11.03 + * Fix segfault when parsing bad "#SBATCH hetjob" directive. + * Allow countless gpu:srun, sbatch->srun sequence. + * Reject job credential if non-superuser sets the LAUNCH_NO_ALLOC flag. + * Make it so srun --no-allocate works again. + * jobacct_gather/linux - Don't count memory on tasks that have already + finished. 
+ * Fix 19.05/20.02 batch steps talking with a 20.11 slurmctld. + * jobacct_gather/common - Do not process jobacct's with same taskid when + calling prec_extra. + * Cleanup all tracked jobacct tasks when extern step child process finishes. + * slurmrestd/dbv0.0.36 - Correct structure of dbv0.0.36_tres_list. + * Fix regression causing task/affinity and task/cgroup to be out of sync when + configured ThreadsPerCore is different than the physical threads per core. + * Fix situation when --gpus is given but not max nodes (-N1-1) in a job + allocation. + * Interactive step - ignore cpu bind and mem bind options, and do not set + the associated environment variables which lead to unexpected behavior + from srun commands launched within the interactive step. + * Handle exit code from pipe when using UCX with PMIx. + +------------------------------------------------------------------- +Fri Jan 8 13:27:02 UTC 2021 - Egbert Eich + +- Fix fallout introduced by: + "Replace '%service_del_postun -n' with '%service_del_postun_without_restart'" + for older Leap/SLE versions. + +------------------------------------------------------------------- +Fri Jan 8 12:20:27 UTC 2021 - Egbert Eich + +- Fix Provides:/Conflicts: for libnss_slurm (bsc#1180700). + +------------------------------------------------------------------- +Tue Jan 5 08:02:02 UTC 2021 - Ana Guerrero Lopez + +- Add support for configuration files from external plugins. 
+ While built-in plugins have their configuration added in slurm.conf, + external SPANK plugins add their configuration to plugstack.conf + To allow packaging easily spank plugins, their configuration files + should be added independently at /etc/spack/plugstack.conf.d and + plugstack.conf should be left with an oneliner including all the + files under /etc/spack/plugstack.conf.d + +------------------------------------------------------------------- +Mon Dec 28 14:37:58 UTC 2020 - Ana Guerrero Lopez + +- Update to 20.11.02 + * Fix older versions of sacct not working with 20.11. + * Fix slurmctld crash when using a pre-20.11 srun in a job allocation. + * Correct logic problem in _validate_user_access. + * Fix libpmi to initialize Slurm configuration correctly. + +- Update to 20.11.01 + * Fix spelling of "overcomited" to "overcomitted" in sreport's cluster + utilization report. + * Silence debug message about shutting down backup controllers if none are + configured. + * Don't create interactive srun until PrologSlurmctld is done. + * Fix fd symlink path resolution. + * Fix slurmctld segfault on subnode reservation restore after node + configuration change. + * Fix resource allocation response message environment allocation size. + * Ensure that details->env_sup is NULL terminated. + * select/cray_aries - Correctly remove jobs/steps from blades using NPC. + * cons_tres - Avoid max_node_gres when entire node is allocated with + --ntasks-per-gpu. + * Allow NULL arg to data_get_type(). + * In sreport have usage for a reservation contain all jobs that ran in the + reservation instead of just the ones that ran in the time specified. This + matches the report for the reservation is not truncated for a time period. + * Fix issue with sending wrong batch step id to a < 20.11 slurmd. + * Add a job's alloc_node to lua for job modification and completion. + * Fix regression getting a slurmdbd connection through the perl API. 
+ * Stop the extern step terminate monitor right after proctrack_g_wait(). + * Fix removing the normalized priority of assocs. + * slurmrestd/v0.0.36 - Use correct name for partition field: + "min nodes per job" -> "min_nodes_per_job". + * slurmrestd/v0.0.36 - Add node comment field. + * Fix regression marking cloud nodes as "unexpectedly rebooted" after + multiple boots. + * Fix slurmctld segfault in _slurm_rpc_job_step_create(). + * slurmrestd/v0.0.36 - Filter node states against NODE_STATE_BASE to avoid + the extended states all being reported as "invalid". + * Fix race that can prevent the prolog for a requeued job from running. + * cli_filter - add "type" to readily distinguish between the CLI command in + use. + * smail - reduce sleep before seff to 5 seconds. + * Ensure SPANK prolog and epilog run without an explicit PlugStackConfig. + * Disable MySQL automatic reconnection. + * Fix allowing "b" after memory unit suffixes. + * Fix slurmctld segfault with reservations without licenses. + * Due to internal restructuring ahead of the 20.11 release, applications + calling libslurm MUST call slurm_init(NULL) before any API calls. + Otherwise the API call is likely to fail due to libslurm's internal + configuration not being available. + * slurm.spec - allow custom paths for PMIx and UCX install locations. + * Use rpath if enabled when testing for Mellanox's UCX libraries. + * slurmrestd/dbv0.0.36 - Change user query for associations to optional. + * slurmrestd/dbv0.0.36 - Change account query for associations to optional. + * mpi/pmix - change the error handler error message to be more useful. + * Add missing connection in acct_storage_p_{clear_stats, reconfig, shutdown}. + * Perl API - fix issue when running in configless mode. + * nss_slurm - avoid deadlock when stray sockets are found.
+ * Display correct value for ScronParameters in 'scontrol show config' + +------------------------------------------------------------------- +Mon Nov 30 20:48:01 UTC 2020 - Egbert Eich + +- Update to version 20.11.0 + Slurm 20.11 includes a number of new features including: + * Overhaul of the job step management and launch code, alongside improved + GPU task placement support. + * A new "Interactive Step" mode of operation for salloc. + * A new "scrontab" command that can be used to submit and manage + periodically repeating jobs. + * IPv6 support. + * Changes to the reservation logic, with new options allowing users + to delete reservations, allowing admins to skip the next occurrence of a + repeated reservation, and allowing for a job to be submitted and eligible + to run within multiple reservations. + * Dynamic Future Nodes - automatically associate a dynamically + provisioned (or "cloud") node against a NodeName definition with matching + hardware. + * An experimental new RPC queuing mode for slurmctld to reduce thread + contention on heavily loaded clusters. + * SlurmDBD integration with the Slurm REST API. + Also check + https://github.com/SchedMD/slurm/blob/slurm-20-11-0-1/RELEASE_NOTES + +------------------------------------------------------------------- +Wed Nov 18 08:40:59 UTC 2020 - Ana Guerrero Lopez + +- Updated to 20.02.6, addresses two security fixes: + * PMIx - fix potential buffer overflows from use of unpackmem(). + CVE-2020-27745 (bsc#1178890) + * X11 forwarding - fix potential leak of the magic cookie when sent as an + argument to the xauth command.
CVE-2020-27746 (bsc#1178891) +- And many other bugfixes, full log and details available at: + * https://lists.schedmd.com/pipermail/slurm-announce/2020/000045.html + +------------------------------------------------------------------- +Tue Nov 3 14:31:02 UTC 2020 - Franck Bui + +- Replace '%service_del_postun -n' with '%service_del_postun_without_restart' + + '-n' is deprecated and will be removed in the future. + +------------------------------------------------------------------- +Thu Oct 29 12:35:18 UTC 2020 - Ana Guerrero Lopez + +- Updated to 20.02.5, changes: + * Fix leak of TRESRunMins when job time is changed with --time-min + * pam_slurm - explicitly initialize slurm config to support configless mode. + * scontrol - Fix exit code when creating/updating reservations with wrong + Flags. + * When a GRES has a no_consume flag, report 0 for allocated. + * Fix cgroup cleanup by jobacct_gather/cgroup. + * When creating reservations/jobs don't allow counts on a feature unless + using an XOR. + * Improve number of boards discovery + * Fix updating a reservation NodeCnt on a zero-count reservation. + * slurmrestd - provide an explicit error messages when PSK auth fails. + * cons_tres - fix job requesting single gres per-node getting two or more + nodes with less CPUs than requested per-task. + * cons_tres - fix calculation of cores when using gres and cpus-per-task. + * cons_tres - fix job not getting access to socket without GPU or with less + than --gpus-per-socket when not enough cpus available on required socket + and not using --gres-flags=enforce binding. + * Fix HDF5 type version build error. + * Fix creation of CoreCnt only reservations when the first node isn't + available. + * Fix wrong DBD Agent queue size in sdiag when using accounting_storage/none. + * Improve job constraints XOR option logic. + * Fix preemption of hetjobs when needed nodes not in leader component. 
+ * Fix wrong bit_or() messing potential preemptor jobs node bitmap, causing + bad node deallocations and even allocation of nodes from other partitions. + * Fix double-deallocation of preempted non-leader hetjob components. + * slurmdbd - prevent truncation of the step nodelists over 4095. + * Fix nodes remaining in drain state after rebooting with ASAP option. + + - changes from 20.02.4: + * srun - suppress job step creation warning message when waiting on + PrologSlurmctld. + * slurmrestd - fix incorrect return values in data_list_for_each() functions. + * mpi/pmix - fix issue where HetJobs could fail to launch. + * slurmrestd - set content-type header in responses. + * Fix cons_res GRES overallocation for --gres-flags=disable-binding. + * Fix cons_res incorrectly filtering cores with respect to GRES locality for + --gres-flags=disable-binding requests. + * Fix regression where a dependency on multiple jobs in a single array using + underscores would only add the first job. + * slurmrestd - fix corrupted output due to incorrect use of memcpy(). + * slurmrestd - address a number of minor Coverity warnings. + * Handle retry failure when slurmstepd is communicating with srun correctly. + * Fix jobacct_gather possibly duplicate stats when _is_a_lwp error shows up. + * Fix tasks binding to GRES which are closest to the allocated CPUs. + * Fix AMD GPU ROCM 3.5 support. + * Fix handling of job arrays in sacct when querying specific steps. + * slurmrestd - avoid fallback to local socket authentication if JWT + authentication is ill-formed. + * slurmrestd - restrict ability of requests to use different authentication + plugins. + * slurmrestd - unlink named unix sockets before closing. + * slurmrestd - fix invalid formatting in openapi.json. + * Fix batch jobs stuck in CF state on FrontEnd mode. + * Add a separate explicit error message when rejecting changes to active node + features. + * cons_common/job_test - fix slurmctld SIGABRT due to double-free.
+ * Fix updating reservations to set the duration correctly if updating the + start time. + * Fix update reservation to promiscuous mode. + * Fix override of job tasks count to max when ntasks-per-node present. + * Fix min CPUs per node not being at least CPUs per task requested. + * Fix CPUs allocated to match CPUs requested when requesting GRES and + threads per core equal to one. + * Fix NodeName config parsing with Boards and without CPUs. + * Ensure SLURM_JOB_USER and SLURM_JOB_UID are set in SrunProlog/Epilog. + * Fix error messages for certain invalid salloc/sbatch/srun options. + * pmi2 - clean up sockets at step termination. + * Fix 'scontrol hold' to work with 'JobName'. + * sbatch - handle --uid/--gid in #SBATCH directives properly. + * Fix race condition in job termination on slurmd. + * Print specific error messages if trying to use certain + priority/multifactor factors that cannot work without SlurmDBD. + * Avoid partial GRES allocation when --gpus-per-job is not satisfied. + * Cray - Avoid referencing a variable outside of its correct scope when + dealing with creating steps within a het job. + * slurmrestd - correctly handle larger addresses from accept(). + * Avoid freeing wrong pointer with SlurmctldParameters=max_dbd_msg_action + with another option after that. + * Restore MCS label when suspended job is resumed. + * Fix insufficient lock levels. + * slurmrestd - use errno from job submission. + * Fix "user" filter for sacctmgr show transactions. + * Fix preemption logic. + * Fix no_consume GRES for exclusive (whole node) requests. + * Fix regression in 20.02 that caused an infinite loop in slurmctld when + requesting --distribution=plane for the job. + * Fix parsing of the --distribution option. + * Add CONF READ_LOCK to _handle_fed_send_job_sync. + * prep/script - always call slurmctld PrEp callback in _run_script(). + * Fix node estimation for jobs that use GPUs or --cpus-per-task.
+ * Fix jobcomp, job_submit and cli_filter Lua implementation plugins causing + slurmctld and/or job submission CLI tools segfaults due to bad return + handling when the respective Lua script failed to load. + * Fix propagation of gpu options through hetjob components. + * Add SLURM_CLUSTERS environment variable to scancel. + * Fix packing/unpacking of "unlinked" jobs. + * Connect slurmstepd's stderr to srun for steps launched with --pty. + * Handle MPS correctly when doing exclusive allocations. + * slurmrestd - fix compiling against libhttpparser in a non-default path. + * slurmrestd - avoid compilation issues with libhttpparser < 2.6. + * Fix compile issues when compiling slurmrestd without --enable-debug. + * Reset idle time on a reservation that is getting purged. + * Fix reoccurring reservations that have Purge_comp= to keep correct + duration if they are purged. + * scontrol - changed the "PROMISCUOUS" flag to "MAGNETIC" + * Early return from epilog_set_env in case of no_consume. + * Fix cons_common/job_test start time discovery logic to prevent skewed + results between "will run test" executions. + * Ensure TRESRunMins limits are maintained during "scontrol reconfigure". + * Improve error message when host lookup fails. + +- Refresh patch: pam_slurm-Initialize-arrays-and-pass-sizes.patch + +------------------------------------------------------------------- +Tue Jul 7 09:05:40 UTC 2020 - Egbert Eich + +- Add support for openPMIx also for Leap/SLE 15.0/1 (bsc#1173805). +- Do not run %check on SLE-12-SP2: Some incompatibility in tcl + makes this fail. +- Remove unneeded build dependency to postgresql-devel. +- Disable build on s390 (requires 64bit). + +------------------------------------------------------------------- +Wed Jun 3 11:11:11 UTC 2020 - Egbert Eich + +- Bring QA to the package build: add %%check stage. +- Remove cruft that isn't needed any longer. +- Add 'ghosted' run-file. 
+- Add rpmlint filter to handle issues with library packages + for Leap and enterprise upgrade versions. + +------------------------------------------------------------------- +Fri May 22 08:45:46 UTC 2020 - Christian Goll + +- Updated to 20.02.3 which fixes CVE-2020-12693 (bsc#1172004). +- Other changes are: + * Factor in ntasks-per-core=1 with cons_tres. + * Fix formatting in error message in cons_tres. + * Fix calling stat on a NULL variable. + * Fix minor memory leak when using reservations with flags=first_cores. + * Fix gpu bind issue when CPUs=Cores and ThreadsPerCore > 1 on a node. + * Fix --mem-per-gpu for heterogeneous --gres requests. + * Fix slurmctld load order in load_all_part_state(). + * Fix race condition not finding jobacct gather task cgroup entry. + * Suppress error message when selecting nodes on disjoint topologies. + * Improve performance of _pack_default_job_details() with large number of job + arguments. + * Fix archive loading previous to 17.11 jobs per-node req_mem. + * Fix regression validating that --gpus-per-socket requires --sockets-per-node + for steps. Should only validate allocation requests. + * error() instead of fatal() when parsing an invalid hostlist. + * nss_slurm - fix potential deadlock in slurmstepd on overloaded systems. + * cons_tres - fix --gres-flags=enforce-binding and related --cpus-per-gres. + * cons_tres - Allocate lowest numbered cores when filtering cores with gres. + * Fix getting system counts for named GRES/TRES. + * MySQL - Fix for handling typed GRES for association rollups. + * Fix step allocations when tasks_per_core > 1. + * Fix allocating more GRES than requested when asking for multiple GRES types. + +------------------------------------------------------------------- +Wed May 6 10:54:43 UTC 2020 - Egbert Eich + +- Treat libnss_slurm like any other package: add version string to + upgrade package.
+ +------------------------------------------------------------------- +Fri Mar 27 08:26:34 UTC 2020 - Christian Goll + +- Updated to 20.02.1 with the following changes: + * Improve job state reason for jobs hitting partition_job_depth. + * Speed up testing of singleton dependencies. + * Fix negative loop bound in cons_tres. + * srun - capture the MPI plugin return code from mpi_hook_client_fini() and + use as final return code for step failure. + * Fix segfault in cli_filter/lua. + * Fix --gpu-bind=map_gpu reusability if tasks > elements. + * Make sure config_flags on a gres are sent to the slurmctld on node + registration. + * Prolog/Epilog - Fix missing GPU information. + * Fix segfault when using config parser for expanded lines. + * Fix bit overlap test function. + * Don't accrue time if job begin time is in the future. + * Remove accrue time when updating a job start/eligible time to the future. + * Fix regression in 20.02.0 that broke --depend=expand. + * Reset begin time on job release if it's not in the future. + * Fix for recovering burst buffers when using high-availability. + * Fix invalid read due to freeing an incorrectly allocated env array. + * Update slurmctld -i message to warn about losing data. + * Fix scontrol cancel_reboot so it clears the DRAIN flag and node reason for a + pending ASAP reboot. + +------------------------------------------------------------------- +Sun Mar 8 15:43:25 UTC 2020 - Egbert Eich + +- Remove legacy_cray: with 20.02 the special treatment for + cray-specific plugins on SLE version prior to 15SP2 is + no longer required. + +------------------------------------------------------------------- +Wed Mar 4 13:05:07 UTC 2020 - Christian Goll + +- slurm-plugins will now also require pmix not only libpmix + (bsc#1164326) + +------------------------------------------------------------------- +Fri Feb 28 17:27:43 UTC 2020 - Egbert Eich + +- Removed autopatch as it doesn't work for the SLE-11-SP4 build.
+ +------------------------------------------------------------------- +Thu Feb 27 20:07:19 UTC 2020 - Kasimir _ + +- Disable %arm builds as this is no longer supported. + +------------------------------------------------------------------- +Thu Feb 27 10:19:05 UTC 2020 - Christian Goll + +- pmix searches now also for libpmix.so.2 so that there is no dependency + for devel package (bsc#1164386) + * added patch file check-for-lipmix.so.MAJOR.patch + * reworded patch file Remove-rpath-from-build.patch to use %autopatch + +------------------------------------------------------------------- +Wed Feb 26 06:13:13 UTC 2020 - Egbert Eich + +- Update to version 20.02.0 (jsc#SLE-8491) + * Fix minor memory leak in slurmd on reconfig. + * Fix invalid ptr reference when rolling up data in the database. + * Change shtml2html.py to require python3 for RHEL8 support, and match + man2html.py. + * slurm.spec - override "hardening" linker flags to ensure RHEL8 builds + in a usable manner. + * Fix type mismatches in the perl API. + * Prevent use of uninitialized slurmctld_diag_stats. + * Fixed various Coverity issues. + * Only show warning about root-less topology in daemons. + * Fix accounting of jobs in IGNORE_JOBS reservations. + * Fix issue with batch steps state not loading correctly when upgrading from + 19.05. + * Deprecate max_depend_depth in SchedulerParameters and move it to + DependencyParameters. + * Silence erroneous error on slurmctld upgrade when loading federation state. + * Break infinite loop in cons_tres dealing with incorrect tasks per tres + request resulting in slurmctld hang. + * Improve handling of --gpus-per-task to make sure appropriate number of GPUs + is assigned to job. + * Fix seg fault on cons_res when requesting --spread-job. +- Move to python3 for everything but SLE-11-SP4 + * For SLE-11-SP4 add a workaround to handle a python3 script (python2.7 + compliant). 
+ +------------------------------------------------------------------- +Wed Feb 19 21:27:00 UTC 2020 - Egbert Eich + +- Add explicit version dependency to libpmix as well. + 'slurm-devel' has a tight version dependency on libpmix - + allowing multiple libpmix versions in one package repository + is therefore essential. + +------------------------------------------------------------------- +Thu Feb 13 22:34:48 UTC 2020 - Egbert Eich + +- Update to version 20.02.0-rc1 + * sbatch - fix segfault when no newline at the end of a burst buffer file. + * Change scancel to only check job's base state when matching -t options. + * Save job dependency list in state files. + * cons_tres - allow jobs to be run on systems with root-less topologies. + * Restore pre-20.02pre1 PrologSlurmctld synchronization behavior to avoid + various race conditions, and ensure proper batch job launch. + * Add new slurmrestd command/daemon which implements the Slurm REST API. + +------------------------------------------------------------------- +Tue Feb 11 10:09:43 UTC 2020 - Christian Goll + +- Update to version 20.02.0-0pre1, highlights are + Highlights: + * Exclusive behavior of a node includes all GRES on a node as well + as the cpus. + * Use python3 instead of python for internal build/test scripts. + The slurm.spec file has been updated to depend on python3 as well. + * Added new NodeSet configuration option to help simplify partition + configuration sections for heterogeneous / condo-style clusters. + * Added slurm.conf option MaxDBDMsgs to control how many messages will be + stored in the slurmctld before throwing them away when the slurmdbd is down. + * The checkpoint plugin interface and all associated API calls have been + removed. + * slurm_init_job_desc_msg() initializes mail_type as uint16_t. This allows + mail_type to be set to NONE with scontrol.
+ * Add new slurm_spank_log() function to print messages back to the user from + within a SPANK plugin without prepending "error: " from slurm_error(). + * Enforce having partition name and nodelist=ALL when creating reservations + with flags=PART_NODES. + * SPANK - removed never-implemented slurm_spank_slurmd_init() interface. This + hook has always been accessible through slurm_spank_init() in the + S_CTX_SLURMD context instead. + * sbcast - add new BcastAddr option to NodeName lines to allow sbcast traffic + to flow over an alternate network path. + * Added auth/jwt plugin, and 'scontrol token' subcommand. PMIx - improve + * performance of proc map generation. Deprecate kill_invalid_depend in + * SchedulerParameters and move it to a new + option called DependencyParameters. + * Enable job dependencies for any job on any cluster in the same federation. + * Allow clusters to be added automatically to db at startup of ctld. Add + * AccountingStorageExternalHost slurm.conf parameter. The + * "ConditionPathExists" condition in slurmd.service has been disabled by + default to permit simpler installation of a "configless" Slurm cluster. + * In SchedulerParameters remove deprecated max_job_bf and replace with + bf_max_job_test. + * Disable sbatch, salloc, srun --reboot for non-admins. SPANK - added support + * for S_JOB_GID in the job script context with + spank_get_item(). + * Prolog/Epilog - add SLURM_JOB_GID environment variable. + configuration file changes: + * The mpi/openmpi plugin has been removed as it does nothing. + MpiDefault=openmpi will be translated to the functionally-equivalent + MpiDefault=none. + command changes (see man pages for details) + * Display StepId=.batch instead of StepId=.4294967294 in output + of "scontrol show step". (slurm_sprint_job_step_info()) + * MPMD in srun will now defer PATH resolution for the commands to launch to + slurmstepd. 
 Previously it would handle resolution client-side, but with + a non-standard approach that walked PATH in reverse. + * squeue - added "--me" option, equivalent to --user=$USER. + * The LicensesUsed line has been removed from 'scontrol show config'. + Please see the 'scontrol show licenses' command as an alternative. + * sbatch - adjusted backoff times for "--wait" option to reduce load on + slurmctld. This results in a steady-state delay of 32s between queries, + instead of the prior 10s delay. +- Removed following deprecated patches: + * removed patch slurmctld-rerun-agent_init-when-backup-controller-takes-over.patch + * removed patch split-xdaemon-in-xdaemon_init-and-xdaemon_finish-for.patch + * removed patch slurmctld-uses-xdaemon_-for-systemd.patch + * removed patch slurmd-uses-xdaemon_-for-systemd.patch + * removed patch slurmdbd-uses-xdaemon_-for-systemd.patch + * removed patch slurmsmwd-uses-xdaemon_-for-systemd.patch + * removed patch removed-deprecated-xdaemon.patch + +------------------------------------------------------------------- +Wed Feb 5 15:37:05 UTC 2020 - Christian Goll + +- standard slurm.conf uses now also SlurmctldHost on all build + targets (bsc#1162377) + +------------------------------------------------------------------- +Mon Jan 27 08:42:55 UTC 2020 - Egbert Eich + +- Fix a missed systemd_requires -> systemd_ordering conversion. + +------------------------------------------------------------------- +Fri Jan 24 17:31:18 UTC 2020 - Egbert Eich + +- Remove special OHPC compatibility macro: these settings should + be applied universally. +- Add a Recommends for mariadb to slurm-slurmdbd: it is recommended + to run the database on the same machine as the daemon. + +------------------------------------------------------------------- +Fri Jan 24 11:47:58 UTC 2020 - Dominique Leuenberger + +- BuildRequire pkgconfig(systemd) instead of systemd: allow OBS to + shortcut through the -mini flavors.
+- Use systemd_ordering instead of systemd_requires: systemd is + never a strict requirement; but in case the system is scheduled + for installation together with systemd, we want systemd to be + installed prior to slurm. + +------------------------------------------------------------------- +Thu Jan 23 17:44:29 UTC 2020 - Christian Goll + +- start slurmdbd after mariadb (bsc#1161716) + +------------------------------------------------------------------- +Mon Jan 13 15:41:48 UTC 2020 - Egbert Eich + +- Fix base_ver for SLE 15 SP2. + +------------------------------------------------------------------- +Wed Jan 8 20:01:19 UTC 2020 - Egbert Eich + +- Update to version 19.05.5 (jsc#SLE-8491) + * Check %docdir/NEWS for details. + * Includes security fixes CVE-2019-19727, CVE-2019-19728, + CVE-2019-12838. + * Disable i586 builds as this is no longer supported. + * Create libnss_slurm package to support user and group resolution + thru slurmstepd. + * slurm-2.4.4-rpath.patch -> Remove-rpath-from-build.patch + Obsoleted: + - pam_slurm_adopt-avoid-running-outside-of-the-sshd-PA.patch + - pam_slurm_adopt-send_user_msg-don-t-copy-undefined-d.patch + - pam_slurm_adopt-use-uid-to-determine-whether-root-is.patch + +------------------------------------------------------------------- +Thu Jan 2 09:14:56 UTC 2020 - Egbert Eich + +- Deprecate "ControlMachine" only for SLURM version upgrades and + products newer than 1501. This ensures that the original setting + is retained for the SLURM version shipped originally with SLE-15-SP1 + or Leap 15.1. + +------------------------------------------------------------------- +Sat Dec 21 09:07:42 UTC 2019 - Egbert Eich + +- Update to v18.08.9 for fixing CVE-2019-19728 (bsc#1159692). + * Wrap END_TIMER{,2,3} macro definition in "do {} while (0)" block. + * Make sview work with glib2 v2.62. + * Make Slurm compile on linux after sys/sysctl.h was deprecated. + * Install slurmdbd.conf.example with 0600 permissions to encourage secure + use.
CVE-2019-19727. + * srun - do not continue with job launch if --uid fails. CVE-2019-19728. + +------------------------------------------------------------------- +Wed Dec 11 18:23:46 UTC 2019 - Christian Goll + +- added pmix support jsc#SLE-10800 + +------------------------------------------------------------------- +Sun Dec 8 11:33:42 UTC 2019 - Egbert Eich + +- Use --with-shared-libslurm to build slurm binaries using libslurm. +- Make libslurm depend on slurm-config. + +------------------------------------------------------------------- +Fri Dec 6 17:06:32 UTC 2019 - Egbert Eich + +- Fix ownership of /var/spool/slurm on new installations + and upgrade (boo#1158696). + +------------------------------------------------------------------- +Thu Oct 31 10:18:21 UTC 2019 - Egbert Eich + +- Fix permissions of slurmdbd.conf (bsc#1155784, CVE-2019-19727). +- Fix %posttrans macro _res_update to cope with added newline + (bsc#1153259). + +------------------------------------------------------------------- +Mon Oct 21 15:54:43 UTC 2019 - Egbert Eich + +- Add package slurm-webdoc which sets up a web server to provide + the documentation for the version shipped. + +------------------------------------------------------------------- +Mon Oct 7 15:39:43 UTC 2019 - Egbert Eich + +- Move srun from 'slurm' to 'slurm-node': srun is required on the + nodes as well so sbatch will work. 'slurm-node' is a requirement + when 'slurm' is installed (bsc#1153095). + +------------------------------------------------------------------- +Wed Oct 2 08:26:02 UTC 2019 - Egbert Eich + +- Set %base_ver for SLE-15-SP2 to 18.08 (for now). + +------------------------------------------------------------------- +Wed Sep 11 10:55:25 UTC 2019 - Egbert Eich + +- Edit sample configuration to deprecate "ControlMachine", + "ControlAddr", "BackupController" and "BackupAddr" in favor + "SlurmctldHost". 
+ +------------------------------------------------------------------- +Sat Aug 17 14:20:35 UTC 2019 - Egbert Eich + +- Fix logic of slurm-munge recommends: slurm-munge requires munge + already, so if we have munge installed we recommend slurm-munge + as the authentication when installing slurm or slurm-node. + +------------------------------------------------------------------- +Sun Jul 14 13:28:13 UTC 2019 - Egbert Eich + +- Fix build for SLE-11-SP4 and older. + +------------------------------------------------------------------- +Fri Jul 12 09:04:55 UTC 2019 - Christian Goll + +- added cray depend libraries to separate package, as they are now + built, since json is enabled + +------------------------------------------------------------------- +Thu Jul 11 10:57:52 UTC 2019 - Christian Goll + +- Updated to 18.08.8 for fixing (CVE-2019-12838, bsc#1140709, jsc#SLE-7341, + jsc#SLE-7342) + * Update "xauth list" to use the same 10000ms timeout as the other xauth + commands. + * Fix issue in gres code to handle a gres cnt of 0. + * Don't purge jobs if backfill is running. + * Verify job is pending add/removing accrual time. + * Don't abort when the job doesn't have an association that was removed + before the job was able to make it to the database. + * Set state_reason if select_nodes() fails job for QOS or Account. + * Avoid seg_fault on referencing association without a valid_qos bitmap. + * If Association/QOS is removed on a pending job set that job as ineligible. + * When changing a jobs account/qos always make sure you remove the old limits. + * Don't reset a FAIL_QOS or FAIL_ACCOUNT job reason until the qos or + account changed. + * Restore "sreport -T ALL" functionality. + * Correctly typecast signals being sent through the api. + * Properly initialize structures throughout Slurm. + * Sync "numtask" squeue format option for jobs and steps to "numtasks". + * Fix sacct -PD to avoid CA before start jobs. + * Fix potential deadlock with backup slurmctld.
+ * Fixed issue with jobs not appearing in sacct after dependency satisfied. + * Fix showing non-eligible jobs when asking with -j and not -s. + * Fix issue with backfill scheduler scheduling tasks of an array + when not the head job. + * accounting_storage/mysql - fix SIGABRT in the archive load logic. + * accounting_storage/mysql - fix memory leak in the archive load logic. + * Limit records per single SQL statement when loading archived data. + * Fix unnecessary reloading of job submit plugins. + * Allow job submit plugins to be turned on/off with a reconfigure. + * Fix segfault when loading/unloading Lua job submit plugin multiple times. + * Fix printing duplicate error messages of jobs rejected by job submit plugin. + * Fix printing of job submit plugin messages of het jobs without pack id. + * Fix memory leak in group_cache.c + * Fix jobs stuck from FedJobLock when requeueing in a federation + * Fix requeueing job in a federation of clusters with differing associations + * sacctmgr - free memory before exiting in 'sacctmgr show runaway'. + * Fix seff showing memory overflow when steps tres mem usage is 0. + * Upon archive file name collision, create new archive file instead of + overwriting the old one to prevent lost records. + * Limit archive files to 50000 records per file so that archiving large + databases will succeed. + * Remove stray newlines in SPANK plugin error messages. + * Fix archive loading events. + * In select/cons_res: Only allocate 1 CPU per node with the --overcommit and + --nodelist options. + * Fix main scheduler from potentially not running through whole queue. + * cons_res/job_test - prevent a job from overallocating a node memory. + * cons_res/job_test - fix to consider a node's current allocated memory when + testing a job's memory request. + * Fix issue where multi-node job steps on cloud nodes wouldn't finish cleaning + up until the end of the job (rather than the end of the step). 
+ * Fix issue with a 17.11 sbcast call to a 18.08 daemon. + * Add new job bit_flags of JOB_DEPENDENT. + * Make it so dependent jobs reset the AccrueTime and do not count against any + AccrueTime limits. + * Fix sacctmgr --parsable2 output for reservations and tres. + * Prevent slurmctld from potential segfault after job_start_data() called + for completing job. + * Fix jobs getting on nodes with "scontrol reboot asap". + * Record node reboot events to database. + * Fix node reboot failure message getting to event table. + * Don't write "(null)" to event table when no event reason exists. + * Fix minor memory leak when clearing runaway jobs. + * Avoid flooding slurmctld and logging when prolog complete RPC errors occur. + * Fix GCC 9 compiler warnings. + * Fix seff human readable memory string for values below a megabyte. + * Fix dump/load of rejected heterogeneous jobs. + * For heterogeneous jobs, do not count the each component against the QOS or + association job limit multiple times. + * slurmdbd - avoid reservation flag column corruption with the use of newer + flags, instead preserve the older flag fields that we can still fit in the + smallint field, and discard the rest. + * Fix security issue in accounting_storage/mysql plugin on archive file loads + by always escaping strings within the slurmdbd. CVE-2019-12838. + + +------------------------------------------------------------------- +Mon Jul 8 08:19:23 UTC 2019 - Egbert Eich + +- Fix build dependency issue around libibmad-devel introduced + in SLE-12-SP4. + +------------------------------------------------------------------- +Mon Jul 8 05:41:11 UTC 2019 - Egbert Eich + +- Add BuildRequires to address warnings during build: + * for libcurl-devel, libssh2-devel and rrdtool-devel + * for libjson-c-devel and liblz4-devel where available, + disable these with --without-json and --without-lz4 + where not. + * disable DataWarp (--without-datawarp). 
+ +------------------------------------------------------------------- +Sat Jul 6 20:07:53 UTC 2019 - Egbert Eich + +- Update SLURM to 18.08.7: + * Set debug statement to debug2 to avoid benign error messages. + * Add SchedulerParameters option of bf_hetjob_immediate to attempt to start + a heterogeneous job as soon as all of its components are determined able + to do so. + * Fix underflow causing decay thread to exit. + * Fix main scheduler not considering hetjobs when building the job queue. + * Fix regression for sacct to display old jobs without a start time. + * Fix setting correct number of gres topology bits. + * Update hetjobs pending state reason when appropriate. + * Fix accounting_storage/filetxt's understanding of TRES. + * Set Accrue time when not enforcing limits. + * Fix srun segfault when requesting a hetjob with test_exec or bcast + options. + * Hide multipart priorities log message behind Priority debug flag. + * sched/backfill - Make hetjobs sensitive to bf_max_job_start. + * Fix slurmctld segfault due to job's partition pointer NULL dereference. + * Fix issue with OR'ed job dependencies. + * Add new job's bit_flags of INVALID_DEPEND to prevent rebuilding a job's + dependency string when it has at least one invalid and purged dependency. + * Promote federation unsynced siblings log message from debug to info. + * burst_buffer/cray - fix slurmctld SIGABRT due to illegal read/writes. + * burst_buffer/cray - fix memory leak due to unfreed job script content. + * node_features/knl_cray - fix script_argv use-after-free. + * burst_buffer/cray - fix script_argv use-after-free. + * Fix invalid reads of size 1 due to non null-terminated string reads. + * Add extra debug2 logs to identify why BadConstraints reason is set. + +------------------------------------------------------------------- +Sat Jul 6 18:05:33 UTC 2019 - Egbert Eich + +- Do not build hdf5 support where not available. 
+ +------------------------------------------------------------------- +Sat Jul 6 11:21:08 UTC 2019 - Egbert Eich + +- Add support for version updates on SLE: Update packages to a + later version than the version supported originally on SLE + will receive a version string in their package name. + +------------------------------------------------------------------- +Wed Feb 27 11:06:10 UTC 2019 - Christian Goll + +- added the hdf5 job data gathering plugin + +------------------------------------------------------------------- +Fri Feb 1 19:27:10 UTC 2019 - eich@suse.com + +- Add backward compatibility with SLE-11 SP4 + +------------------------------------------------------------------- +Thu Jan 31 20:30:32 UTC 2019 - eich@suse.com + +- Update to version 18.08.05-2: + This version obsoletes: + Fix-contrib-perlapi-to-build-with-the-fix-for-CVE-2019-6438-750cc23ed.patch +- Fix spec file for older SUSE versions. + +------------------------------------------------------------------- +Thu Jan 31 09:00:06 UTC 2019 - eich@suse.com + +- Update to version 18.08.05: + * Add mitigation for a potential heap overflow on 32-bit systems in xmalloc. + (CVE-2019-6438, bsc#1123304). + * Other fixes: + + Backfill - If a job has a time_limit guess the end time of a job better + if OverTimeLimit is Unlimited. + + Fix "sacctmgr show events event=cluster" + + Fix sacctmgr show runawayjobs from sibling cluster + + Avoid bit offset of -1 in call to bit_nclear(). + + Insure that "hbm" is a configured GresType on knl systems. + + Fix NodeFeaturesPlugins=node_features/knl_generic to allow other gres + other than knl. + + cons_res: Prevent overflow on multiply. + + Better debug for bad values in gres.conf. + + Fix double accounting of energy at end of job. + + Read gres.conf for cloud nodes on slurmctld. + + Don't assume the first node of a job is the batch host when purging jobs + from a node. + + Better debugging when a job doesn't have a job_resrcs ptr. + + Store ave watts in energy plugins. 
+ + Add XCC plugin for reading Lenovo Power. + + Fix minor memory leak when scheduling rebootable nodes. + + Fix debug2 prefix for sched log. + + Fix printing correct SLURM_JOB_ACCOUNT_PACK_GROUP_* in env for a Het Job. + + sbatch - search current working directory first for job script. + + Make it so held jobs reset the AccrueTime and do not count against any + AccrueTime limits. + + Add SchedulerParameters option of bf_hetjob_prio=[min|avg|max] to alter + the job sorting algorithm for scheduling heterogeneous jobs. + + Fix initialization of assoc_mgr_locks and slurmctld_locks lock + structures. + + Fix segfault with job arrays using X11 forwarding. + + Revert regression caused by e0ee1c7054 which caused negative values and + values starting with a decimal to be invalid for PriorityWeightTRES and + TRESBillingWeight. + + Fix possibility to update a job's reservation to none. + + Suppress connection errors to primary slurmdbd when backup dbd is active. + + Suppress connection errors to primary db when backup db kicks in + + Add missing fields for sacct --completion when using jobcomp/filetxt. + + Fix incorrect values set for UserCPU, SystemCPU, and TotalCPU sacct + fields when JobAcctGatherType=jobacct_gather/cgroup. + + Fixed srun from double printing invalid option msg twice. + + Remove unused -b flag from getopt call in sbatch. + + Disable reporting of node TRES in sreport. + + Re-enabling features combined by OR within parenthesis for non-knl + setups. + + Prevent sending duplicate requests to reboot a node before ResumeTimeout. + + Down nodes that don't reboot by ResumeTimeout. + + Update seff to reflect API change from rss_max to tres_usage_in_max. + + Add missing TRES constants from perl API. + + Fix issue where sacct would return incorrect array tasks when querying + specific tasks. + + Add missing variables to slurmdb_stats_t in the perlapi. + + Fix nodes not getting reboot RPC when job requires reboot of nodes. 
+ + Fix failure to update the partition list of a job.
+ * job_submit/lua: Add several slurmctld return codes and add user/group info + * salloc/sbatch/srun - print warning if mutually exclusive options of --mem + and --mem-per-cpu are both set. + - Refreshed: + * pam_slurm_adopt-avoid-running-outside-of-the-sshd-PA.patch + +------------------------------------------------------------------- +Mon Dec 10 10:49:14 UTC 2018 - cgoll@suse.com + +- restarting services on update only when activated +- added rotation of logs +- Added backported patches which harden the pam module pam_slurm_adopt + (BOO#1116758) which will be in slurm 19.05.x + * added pam_slurm_adopt-avoid-running-outside-of-the-sshd-PA.patch + [PATCH 1/3] pam_slurm_adopt: avoid running outside of the sshd PAM + * added pam_slurm_adopt-send_user_msg-don-t-copy-undefined-d.patch + [PATCH 2/3] pam_slurm_adopt: send_user_msg: don't copy undefined data + * added pam_slurm_adopt-use-uid-to-determine-whether-root-is.patch + [PATCH 3/3] pam_slurm_adopt: use uid to determine whether root is + logging on +- package slurm-pam_slurm now depends on slurm-node and not on slurm + + +------------------------------------------------------------------- +Wed Dec 5 16:00:50 UTC 2018 - Christian Goll + +- fixed code in %pretrans section to be compatible with lua 5.1 + +------------------------------------------------------------------- +Tue Nov 20 11:21:37 UTC 2018 - eich@suse.com + +- Added missing perl-base dependency. + +------------------------------------------------------------------- +Tue Nov 20 11:21:14 UTC 2018 - eich@suse.com + +- Moved HTML docs to doc package. + +------------------------------------------------------------------- +Tue Nov 20 11:20:05 UTC 2018 - eich@suse.com + +- Moved config man pages to a separate package: This way, they won't + get installed on compute nodes. 
+ +------------------------------------------------------------------- +Tue Nov 20 11:11:15 UTC 2018 - eich@suse.com + +- Update to 18.08.3 + * Add new burst buffer state of "teardown-fail" to indicate the burst + buffer teardown operation is failing on specific buffers. + * Multiple backup slurmctld daemons can be configured + * Enable jobs with zero node count for creation and/or deletion of persistent + burst buffers. + * Add "scontrol show dwstat" command to display Cray burst buffer status. + * Add "GetSysStatus" option to burst_buffer.conf file. + * Add node and partition configuration options of "CpuBind" to control + default task binding. + * Add "NumaCpuBind" option to knl.conf + * Add sbatch "--batch" option to identify features required on batch node. + * Add "BatchFeatures" field to output of "scontrol show job". + * Add support for "--bb" option to sbatch command. + * Add new SystemComment field to job data structure and database. + * Expand reservation "flags" field from 32 to 64 bits. + * Add job state flag of "SIGNALING" to avoid race condition. + * Properly handle srun --will-run option when there are jobs in COMPLETING + state. + * Properly report who is signaling a step. + * Don't combine updated reservation records in sreport's reservation report. + * node_features plugin - Add suport for XOR & XAND of job constraints (node + feature specifications). + * Improvements to how srun searches for the executible when using cwd. + * Now programs can be checked before execution if test_exec is set. + * Report NodeFeatures plugin configuration with scontrol and sview commands. + * Add acct_gather_profile/influxdb plugin. + * Add new job state of SO/STAGE_OUT + * Correct SLURM_NTASKS and SLURM_NPROCS environment variable for + heterogeneous job step. + * Expand advanced reservation feature specification to support parenthesis + and counts of nodes with specified features. 
+ * Defer job signaling until prolog is completed + * Have the primary slurmctld wait until the backup has completely shutdown + before taking control. + * Fix issue where unpacking job state after TRES count changed could lead to + invalid reads. + * Heterogeneous job steps allocations supported with Open MPI. + * Remove redundant function arguments from task plugins. + * Add Slurm configuration file check logic using "slurmctld -t" command. + * Add the use of a xml file to help performance when using hwloc. + * Remove support for "ChosLoc" configuration parameter. + * Configuration parameters "ControlMachine", "ControlAddr", + "BackupController" and "BackupAddr" replaced by an ordered list of + "SlurmctldHost" records. + * Remove --immediate option from sbatch. + * Add infrastructure for per-job and per-step TRES parameters. + * Add DefCpuPerGpu and DefMemPerGpu to global and per-partition configuration + parameters. + * Add ValidateMode configuration parameter to knl_cray.conf. + * Disable local PTY output processing when using 'srun --unbuffered'. + * Change the column name for the %U (User ID) field in squeue to 'UID'. + * CRAY - Add CheckGhalQuiesce to the CommunicationParameters. + * When a process is core dumping, avoid terminating other processes in that + task group. + * CPU frequency management enhancements: If scaling_available_frequencies + file is not available, then derive values from scaling_min_freq and + scaling_max_freq values. + * Add pending jobs count to sdiag output. + * Add configuration paramerers SlurmctldPrimaryOnProg and + SlurmctldPrimaryOffProg, which define programs to execute when a slurmctld + daemon changes state. + * Add configuration paramerers SlurmctldAddr for use with virtual IP to + manage backup slurmctld daemons. + * Explicitly shutdown the slurmd process when instructed to reboot. + * Add ability to create/update partition with TRESBillingWeights through + scontrol. + * Calcuate TRES billing values at submission. 
+ * Add node_features plugin function "node_features_p_reboot_weight()". + * Add NodeRebootWeight parameter to knl.conf configuration file. + * Completely remove "gres" field from step record. Use "tres_per_node", + "tres_per_socket", etc. + * Add "Links" parameter to gres.conf configuration file. + * Force slurm_mktime() to set tm_isdst to -1. + * burst_buffer.conf - Add SetExecHost flag to enable burst buffer access + from the login node for interactive jobs. + * Append ", with requeued tasks" to job array "end" emails if any tasks in + the array were requeued. + * Add ResumeFailProgram slurm.conf option to specify a program that is called + when a node fails to respond by ResumeTimeout. + * Add new job pending reason of "ReqNodeNotAvail, reserved for maintenance". + * Remove AdminComment += syntax from 'scontrol update job'. + * sched/backfill: Reset job time limit if needed for deadline scheduling. + * For heterogeneous job component with required nodes, explicitly exclude + those nodes from all other job components. + * Add name of partition used to output of srun --test-only output. + * sdiag output now reports outgoing slurmctld message queue contents. + * Improve escaping special characters on user commands when specifying paths. + * Add salloc/sbatch/srun option of --gres-flags=disable-binding to disable + filtering of CPUs with respect to generic resource locality. + * SlurmDBD - Print warning if MySQL/MariaDB internal tuning is not at least + half of the recommended values. + * Add ability to specify a node reason when rebooting nodes with "scontrol + reboot". + * Add nextstate option to "scontrol reboot". + * Consider "resuming" (nextstate=resume) nodes as available in backfill + future scheduling. + * Add TimelimitRaw sacct output field to display timelimit numbers. + * Add support for sacct --whole-hetjob=[yes|no] option. + * Make salloc handle node requests the same as sbatch. 
+ * Add shutdown_on_reboot SlurmdParameter to control whether the Slurmd will + shut itself down or not when a reboot request is received.
+ +------------------------------------------------------------------- +Mon Sep 24 09:25:57 UTC 2018 - cgoll@suse.com + +- added correct link flags for perl bindings (bsc#1108671) + * added correct linker search path in slurm-2.4.4-rpath.patch + * perl:Switch is required by slurm torque wrappers + +------------------------------------------------------------------- +Sat Sep 22 06:09:18 UTC 2018 - eich@suse.com + +- Fix Requires(pre) and Requires(post) for slurm-config and slurm-node. + This fixes issues with failing slurm user creation when installed + during initial system installation (bsc#1109373). + +------------------------------------------------------------------- +Tue Aug 14 10:26:43 UTC 2018 - eich@suse.com + +- Update to 17.11.9 + * Fix segfault in slurmctld when a job's node bitmap is NULL during a + scheduling cycle. Primarily caused by EnforcePartLimits=ALL. + * Remove erroneous unlock in acct_gather_energy/ipmi. + * Enable support for hwloc version 2.0.1. + * Fix 'srun -q' (--qos) option handling. + * Fix socket communication issue that can lead to lost task completition + messages, which will cause a permanently stuck srun process. + * Handle creation of TMPDIR if environment variable is set or changed in + a task prolog script. + * Avoid node layout fragmentation if running with a fixed CPU count but + without Sockets and CoresPerSocket defined. + * burst_buffer/cray - Fix datawarp swap default pool overriding jobdw. + * Fix incorrect job priority assignment for multi-partition job with + different PriorityTier settings on the partitions. + * Fix sinfo to print correct node state. + +------------------------------------------------------------------- +Thu Aug 2 11:35:55 UTC 2018 - eich@suse.com + +- When using a remote shared StateSaveLocation, slurmctld needs to + be started after remote filesystems have become available. + Add 'remote-fs.target' to the 'After=' directive in slurmctld.service + (boo#1103561). 
+ +------------------------------------------------------------------- +Tue Jul 31 18:29:40 UTC 2018 - eich@suse.com + +- Update to 17.11.8 + * Fix incomplete RESPONSE_[RESOURCE|JOB_PACK]_ALLOCATION building path. + * Do not allocate nodes that were marked down due to the node not responding + by ResumeTimeout. + * task/cray plugin - search for "mems" cgroup information in the file + "cpuset.mems" then fall back to the file "mems". + * Fix ipmi profile debug uninitialized variable. + * PMIx: fixed the direct connect inline msg sending. + * MYSQL: Fix issue not handling all fields when loading an archive dump. + * Allow a job_submit plugin to change the admin_comment field during + job_submit_plugin_modify(). + * job_submit/lua - fix access into reservation table. + * MySQL - Prevent deadlock caused by archive logic locking reads. + * Don't enforce MaxQueryTimeRange when requesting specific jobs. + * Modify --test-only logic to properly support jobs submitted to more than + one partition. + * Prevent slurmctld from abort when attempting to set non-existing + qos as def_qos_id. + * Add new job dependency type of "afterburstbuffer". The pending job will be + delayed until the first job completes execution and it's burst buffer + stage-out is completed. + * Reorder proctrack/task plugin load in the slurmstepd to match that of + slurmd + and avoid race condition calling task before proctrack can introduce. + * Prevent reboot of a busy KNL node when requesting inactive features. + * Revert to previous behavior when requesting memory per cpu/node introduced + in 17.11.7. + * Fix to reinitialize previously adjusted job members to their original + value + when validating the job memory in multi-partition requests. + * Fix _step_signal() from always returning SLURM_SUCCESS. + * Combine active and available node feature change logs on one line rather + than one line per node for performance reasons. + * Prevent occasionally leaking freezer cgroups. 
+ * Fix potential segfault when closing the mpi/pmi2 plugin. + * Fix issues with --exclusive=[user|mcs] to work correctly + with preemption or when job requests a specific list of hosts. + * Make code compile with hdf5 1.10.2+ + * mpi/pmix: Fixed the collectives canceling. + * SlurmDBD: improve error message handling on archive load failure. + * Fix incorrect locking when deleting reservations. + * Fix incorrect locking when setting up the power save module. + * Fix setting format output length for squeue when showing array jobs. + * Add xstrstr function. + * Fix printing out of --hint options in sbatch, salloc --help. + * Prevent possible divide by zero in _validate_time_limit(). + * Add Delegate=yes to the slurmd.service file to prevent systemd from + interfering with the jobs' cgroup hierarchies. + * Change the backlog argument to the listen() syscall within srun to 4096 + to match elsewhere in the code, and avoid communication problems at scale. + +------------------------------------------------------------------- +Tue Jul 31 17:30:08 UTC 2018 - eich@suse.com + +- slurmctld-rerun-agent_init-when-backup-controller-takes-over.patch: + Fix race in the slurmctld backup controller which prevents it + to clean up allocations on nodes properly after failing over + (bsc#1084917). +- Handled %license in a backward compatible manner. + +------------------------------------------------------------------- +Sat Jul 28 15:30:58 UTC 2018 - eich@suse.com + +- Add a 'Recommends: slurm-munge' to slurm-slurmdbd. + +------------------------------------------------------------------- +Wed Jul 11 12:04:55 UTC 2018 - eich@suse.com + +- Shield comments between script snippets with a %{!?nil:...} to + avoid them being interpreted as scripts - in which case the update + level is passed as argument (see chapter 'Shared libraries' in: + https://en.opensuse.org/openSUSE:Packaging_scriptlet_snippets) + (bsc#1100850). 
+ +------------------------------------------------------------------- +Tue Jun 5 13:24:43 UTC 2018 - cgoll@suse.com + +- Update from 17.11.5 to 17.11.7 +- Fix security issue in handling of username and gid fields + CVE-2018-10995 and bsc#1095508 what implied an + update from 17.11.5 to 17.11.7 + Highlights of 17.11.6: + * CRAY - Add slurmsmwd to the contribs/cray dir + * PMIX - Added the direct connect authentication. + * Prevent the backup slurmctld from losing the active/available node + features list on takeover. + * Be able to force power_down of cloud node even if in power_save state. + * Allow cloud nodes to be recognized in Slurm when booted out of band. + * Numerous fixes - check 'NEWS' file. + Highlights of 17.11.7: + * Notify srun and ctld when unkillable stepd exits. + * Numerous fixes - check 'NEWS' file. +- Add: slurmsmwd-uses-xdaemon_-for-systemd.patch + * Fixes daemoniziation in newly introduced slurmsmwd daemon. +- Rename: + split-xdaemon-in-xdaemon_init-and-xdaemon_finish-for-systemd-compatibilty.patch + to split-xdaemon-in-xdaemon_init-and-xdaemon_finish-for.patch + * remain in sync with commit messages which introduced that file + + +------------------------------------------------------------------- +Thu Apr 19 21:05:04 UTC 2018 - eich@suse.com + +- Avoid running pretrans scripts when running in an instsys: + there may be not much installed, yet. pretrans code should + be done in lua, this way, it will be executed by the rpm-internal + lua interpreter and not be passed to a shell which may not be + around at the time this scriptlet is run (bsc#1090292). + +------------------------------------------------------------------- +Fri Apr 13 10:03:05 UTC 2018 - eich@suse.com + +- Add requires for slurm-sql to the slurmdbd package. + +------------------------------------------------------------------- +Thu Apr 12 17:20:03 UTC 2018 - eich@suse.com + +- Package READMEs for pam and pam_slurm_adopt. +- Use the new %%license directive for COPYING file. 
+ +------------------------------------------------------------------- +Thu Apr 12 16:40:44 UTC 2018 - eich@suse.com + +- Add: + * split-xdaemon-in-xdaemon_init-and-xdaemon_finish-for-systemd-compatibilty.patch + * slurmctld-uses-xdaemon_-for-systemd.patch + * slurmd-uses-xdaemon_-for-systemd.patch + * slurmdbd-uses-xdaemon_-for-systemd.patch + * removed-deprecated-xdaemon.patch + Fix interaction with systemd: systemd expects that a + daemonizing process doesn't go away until the PID file + with it PID of the daemon has bee written (bsc#1084125). + +------------------------------------------------------------------- +Wed Apr 11 11:27:31 UTC 2018 - eich@suse.com + +- Make sure systemd services get restarted only when all + packages are in a consistent state, not in the middle + of an 'update' transaction (bsc#1088693). + Since the %postun scripts that run on update are from + the old package they cannot be changed - thus we work + around the restart breakage. + +------------------------------------------------------------------- +Fri Mar 23 13:50:14 UTC 2018 - cgoll@suse.com + +- fixed wrong log file location in slurmdbd.conf and + fixed pid location for slurmdbd and made slurm-slurmdbd + depend on slurm config which provides the dir /var/run/slurm + (bsc#1086859). + +------------------------------------------------------------------- +Fri Mar 16 08:57:20 UTC 2018 - cgoll@suse.com + +- added comment for (bsc#1085606) + +------------------------------------------------------------------- +Wed Mar 14 19:34:58 UTC 2018 - eich@suse.com + +- Fix security issue in accounting_storage/mysql plugin by always escaping + strings within the slurmdbd. CVE-2018-7033 + http://web.nvd.nist.gov/view/vuln/detail?vulnId=CVE-2018-7033 + (bsc#1085240). +- Update slurm to v17.11.5 (FATE#325451) + Highlights of 17.11: + * Support for federated clusters to manage a single work-flow + across a set of clusters. 
+ * Support for heterogeneous job allocations (various processor types, + memory sizes, etc. by job component). Support for heterogeneous job + steps within a single MPI_COMM_WORLD is not yet supported for most + configurations. + * X11 support is now fully integrated with the main Slurm code. Remove + any X11 plugin configured in your plugstack.conf file to avoid errors + being logged about conflicting options. + * Added new advanced reservation flag of "flex", which permits jobs + requesting the reservation to begin prior to the reservation's + start time and use resources inside or outside of the reservation. + A typical use case is to prevent jobs not explicitly requesting the + reservation from using those reserved resources rather than forcing + jobs requesting the reservation to use those resources in the time + frame reserved. + * The sprio command has been modified to report a job's priority + information for every partition the job has been submitted to. + * Group ID lookup performed at job submit time to avoid lookup on + all compute nodes. Enable with PrologFlags=SendGIDs configuration + parameter. + * Slurm commands and daemons dynamically link to libslurmfull.so + instead of statically linking. This dramatically reduces the + footprint of Slurm. + * In switch plugin, added plugin_id symbol to plugins and wrapped + switch_jobinfo_t with dynamic_plugin_data_t in interface calls + in order to pass switch information between clusters with different + switch types. + * Changed default ProctrackType to cgroup. + * Changed default sched_min_interval from 0 to 2 microseconds. + * Added new 'scontrol write batch_script ' command to fetch a job's + batch script. Removed the ability to see the script as part of the + 'scontrol -dd show job' command. + * Add new "billing" TRES which allows jobs to be limited based on the + job's billable TRES calculated by the job's partition's + TRESBillingWeights. + * Regular user use of "scontrol top" command is now disabled. 
Use the + configuration parameter "SchedulerParameters=enable_user_top" to + enable that functionality. The configuration parameter + "SchedulerParameters=disable_user_top" will be silently ignored. + * Change default to let pending jobs run outside of reservation after + reservation is gone to put jobs in held state. Added + NO_HOLD_JOBS_AFTER_END reservation flag to use old default. + * Support for PMIx v2.0 as well as UCX support. + * Remove plugins for obsolete MPI stacks: + - lam + - mpich1_p4 + - mpich1_shmem + - mvapich + * Numerous fixes - check 'NEWS' file. +- slurmd-Fix-slurmd-for-new-API-in-hwloc-2.0.patch + plugins-cgroup-Fix-slurmd-for-new-API-in-hwloc-2.0.patch: + Removed. Code upstream. +- slurmctld-service-var-run-path.patch: + Replaced by sed script. +- Fix some rpmlint warnings. + +------------------------------------------------------------------- +Mon Jan 29 13:43:57 UTC 2018 - cgoll@suse.com + +- moved config files to slurm-config package (FATE#324574). + +------------------------------------------------------------------- +Mon Jan 29 04:01:28 UTC 2018 - jjolly@suse.com + +- Moved slurmstepd and man page into slurm-node due to slurmd dependency +- Moved config files into slurm-node +- Moved slurmd rc scripts into slurm-node +- Made slurm-munge require slurm-plugins instead of slurm itself + - slurm-node suggested slurm-munge, causing the whole slurm to be + installed. The slurm-plugins seems to be a more base class + (FATE#324574). + +------------------------------------------------------------------- +Wed Jan 17 14:21:49 UTC 2018 - cgoll@suse.com + +- split up lightweight slurm-node package for deployment on nodes + (FATE#324574).
+ +------------------------------------------------------------------- +Fri Dec 1 16:04:55 UTC 2017 - cgoll@suse.com + +- added /var/spool/ directory and removed duplicated entries from slurm.conf + +------------------------------------------------------------------- +Fri Nov 10 13:52:30 UTC 2017 - eich@suse.com + +- Package so-versioned libs separately. libslurm is expected + to change more frequently and thus is packaged separately + from libpmi. + +------------------------------------------------------------------- +Wed Nov 1 16:15:04 UTC 2017 - eich@suse.com + +- Updated to 17.02.9 to fix CVE-2017-15566 (bsc#1065697). + Changes in 17.02.9 + * When resuming powered down nodes, mark DOWN nodes right after + ResumeTimeout + has been reached (previous logic would wait about one minute longer). + * Fix sreport not showing full column name for TRES Count. + * Fix slurmdb_reservations_get() giving wrong usage data when job's spanned + reservation that was modified. + * Fix sreport reservation utilization report showing bad data. + * Show all TRES' on a reservation in sreport reservation utilization report + by default. + * Fix sacctmgr show reservation handling "end" parameter. + * Work around issue with sysmacros.h and gcc7 / glibc 2.25. + * Fix layouts code to only allow setting a boolean. + * Fix sbatch --wait to keep waiting even if a message timeout occurs. + * CRAY - If configured with NodeFeatures=knl_cray and there are non-KNL + nodes which include no features the slurmctld will abort without + this patch when attempting strtok_r(NULL). + * Fix regression in 17.02.7 which would run the spank_task_privileged as + part of the slurmstepd instead of its child process. + * Fix security issue in Prolog and Epilog by always prepending SPANK_ to + all user-set environment variables. CVE-2017-15566. + Changes in 17.02.8: + * Add 'slurmdbd:' to the accounting plugin to notify message is from dbd + instead of local. + * mpi/mvapich - Buffer being only partially cleared.
No failures observed. + * Fix for job --switch option on dragonfly network. + * In salloc with --uid option, drop supplementary groups before changing UID. + * jobcomp/elasticsearch - strip any trailing slashes from JobCompLoc. + * jobcomp/elasticsearch - fix memory leak when transferring generated buffer. + * Prevent slurmstepd ABRT when parsing gres.conf CPUs. + * Fix sbatch --signal to signal all MPI ranks in a step instead of just those + on node 0. + * Check multiple partition limits when scheduling a job that were previously + only checked on submit. + * Cray: Avoid running application/step Node Health Check on the external + job step. + * Optimization enhancements for partition based job preemption. + * Address some build warnings from GCC 7.1, and one possible memory leak if + /proc is inaccessible. + * If creating/altering a core based reservation with scontrol/sview on a + remote cluster correctly determine the select type. + * Fix autoconf test for libcurl when clang is used. + * Fix default location for cgroup_allowed_devices_file.conf to use correct + default path. + * Document NewName option to sacctmgr. + * Reject a second PMI2_Init call within a single step to prevent slurmstepd + from hanging. + * Handle old 32bit values stored in the database for requested memory + correctly in sacct. + * Fix memory leaks in the task/cgroup plugin when constraining devices. + * Make extremely verbose info messages debug2 messages in the task/cgroup + plugin when constraining devices. + * Fix issue that would deny the stepd access to /dev/null where GRES has a + 'type' but no file defined. + * Fix issue where the slurmstepd would fatal on job launch if you have no + gres listed in your slurm.conf but some in gres.conf. + * Fix validating time spec to correctly validate various time formats. + * Make scontrol work correctly with job update timelimit [+|-]=. + * Reduce the visibility of a number of warnings in _part_access_check.
+ * Prevent segfault in sacctmgr if no association name is specified for + an update command. + * burst_buffer/cray plugin modified to work with changes in Cray UP05 + software release. + * Fix job reasons for jobs that are violating assoc MaxTRESPerNode limits. + * Fix segfault when unpacking a 16.05 slurm_cred in a 17.02 daemon. + * Fix setting TRES limits with case insensitive TRES names. + * Add alias for xstrncmp() -- slurm_xstrncmp(). + * Fix sorting of case insensitive strings when using xstrcasecmp(). + * Gracefully handle race condition when reading /proc as process exits. + * Avoid error on Cray duplicate setup of core specialization. + * Skip over undefined (hidden in Slurm) nodes in pbsnodes. + * Add empty hashes in perl api's slurm_load_node() for hidden nodes. + * CRAY - Add rpath logic to work for the alpscomm libs. + * Fixes for administrator extended TimeLimit (job reason & time limit reset). + * Fix gres selection on systems running select/linear. + * sview: Added window decorator for maximize,minimize,close buttons for all + systems. + * squeue: interpret negative length format specifiers as a request to + delimit values with spaces. + * Fix the torque pbsnodes wrapper script to parse a gres field with a type + set correctly. +- Fixed ABI version of libslurm. + +------------------------------------------------------------------- +Fri Oct 6 13:53:08 UTC 2017 - jengelh@inai.de + +- Trim redundant wording in descriptions. 
+ +------------------------------------------------------------------- +Wed Sep 27 11:08:29 UTC 2017 - jjolly@suse.com + +- Updated to slurm 17-02-7-1 + * Added python as BuildRequires + * Removed sched-wiki package + * Removed slurmdb-direct package + * Obsoleted sched-wiki and slurmdb-direct packages + * Removing Cray-specific files + * Added /etc/slurm/layout.d files (new for this version) + * Remove /etc/slurm/cgroup files from package + * Added lib/slurm/mcs_account.so + * Removed lib/slurm/jobacct_gather_aix.so + * Removed lib/slurm/job_submit_cnode.so +- Created slurm-sql package +- Moved files from slurm-plugins to slurm-torque package +- Moved creation of /usr/lib/tmpfiles.d/slurm.conf into slurm.spec + * Removed tmpfiles.d-slurm.conf +- Changed /var/run path for slurm daemons to /var/run/slurm + * Added slurmctld-service-var-run-path.patch + (FATE#324026). + +------------------------------------------------------------------- +Tue Sep 12 16:00:11 UTC 2017 - jjolly@suse.com + +- Made tmpfiles_create post-install macro SLE12 SP2 or greater +- Directly calling systemd-tmpfiles --create for before SLE12 SP2 + +------------------------------------------------------------------- +Mon Jul 10 03:35:41 UTC 2017 - jjolly@suse.com + +- Allows OpenSUSE Factory build as well +- Removes unused .service files from project +- Adds /var/run/slurm to /usr/lib/tmpfiles.d for boottime creation + * Patches upstream .service files to allow for /var/run/slurm path + * Modifies slurm.conf to allow for /var/run/slurm path + +------------------------------------------------------------------- +Tue May 30 10:24:09 UTC 2017 - eich@suse.com + +- Move wrapper script mpiexec provided by slurm-torque to + mpiexec.slurm to avoid conflicts. This file is normally + provided by the MPI implementation (boo#1041706). + +------------------------------------------------------------------- +Mon May 8 10:10:04 UTC 2017 - eich@suse.com + +- Replace remaining ${RPM_BUILD_ROOT}s.
+- Improve description. +- Fix up changelog. + +------------------------------------------------------------------- +Fri Mar 31 12:43:25 UTC 2017 - eich@suse.com +- Spec file: Replace "Requires : slurm-perlapi" by + "Requires: perl-slurm = %{version}" (boo#1031872). + +------------------------------------------------------------------- +Thu Feb 16 12:12:45 UTC 2017 - jengelh@inai.de + +- Trim redundant parts of description. Fixup RPM groups. +- Replace unnecessary %__ macro indirections; + replace historic $RPM_* variables by macros. + +------------------------------------------------------------------- +Wed Feb 15 18:55:28 UTC 2017 - eich@suse.com + +- slurmd-Fix-for-newer-API-versions.patch: + Stale patch removed. + +------------------------------------------------------------------- +Tue Feb 7 16:47:17 UTC 2017 - eich@suse.com + +- Use %slurm_u and %slurm_g macros defined at the beginning of the spec + file when adding the slurm user/group for consistency. +- Define these macros to daemon,root for non-systemd. +- For anything newer than Leap 42.1 or SLE-12-SP1 build OpenHPC compatible. + +------------------------------------------------------------------- +Wed Feb 1 20:17:47 UTC 2017 - eich@suse.com + +- Updated to 16.05.8.1 + * Remove StoragePass from being printed out in the slurmdbd log at debug2 + level. + * Defer PATH search for task program until launch in slurmstepd. + * Modify regression test1.89 to avoid leaving vestigial job. Also reduce + logging to reduce likelihood of Expect buffer overflow. + * Do not PATH search for multi-prog launches if LaunchParameters=test_exec is + enabled. + * Fix for possible infinite loop in select/cons_res plugin when trying to + satisfy a job's ntasks_per_core or socket specification. + * If job is held for bad constraints make it so once updated the job doesn't + go into JobAdminHeld. + * sched/backfill - Fix logic to reserve resources for jobs that require a + node reboot (i.e. to change KNL mode) in order to start.
+ * When unpacking a node or front_end record from state and the protocol + version is lower than the min version, set it to the min. + * Remove redundant lookup for part_ptr when updating a reservation's nodes. + * Fix memory and file descriptor leaks in slurmd daemon's sbcast logic. + * Do not allocate specialized cores to jobs using the --exclusive option. + * Cancel interactive job if Prolog failure with "PrologFlags=contain" or + "PrologFlags=alloc" configured. Send new error prolog failure message to + the salloc or srun command as needed. + * Prevent possible out-of-bounds read in slurmstepd on an invalid #! line. + * Fix check for PluginDir within slurmctld to work with multiple directories. + * Cancel interactive jobs automatically on communication error to launching + srun/salloc process. + * Fix security issue caused by insecure file path handling triggered by the + failure of a Prolog script. To exploit this a user needs to anticipate or + cause the Prolog to fail for their job. CVE-2016-10030 (bsc#1018371). +- Replace group/user add macros with function calls. +- Fix array initialization and ensure strings are always NULL terminated in + pam_slurm.c (bsc#1007053). +- Disable building with netloc support: the netloc API is part of the devel + branch of hwloc. Since this devel branch was included accidentally and has + been reversed since, we need to disable this for the time being. +- Conditionalized architecture specific pieces to support non-x86 architectures + better. + +------------------------------------------------------------------- +Tue Jan 3 17:21:58 UTC 2017 - eich@suse.com + +- Remove: unneeded 'BuildRequires: python' +- Add: + BuildRequires: freeipmi-devel + BuildRequires: libibmad-devel + BuildRequires: libibumad-devel + so they are picked up by the slurm build. +- Enable modifications from openHPC Project. +- Enable lua API package build.
+- Add a recommends for slurm-munge to the slurm package: + This way, the munge auth method is available and slurm + works out of the box. +- Create /var/lib/slurm as StateSaveLocation directory. + /tmp is dangerous. + +------------------------------------------------------------------- +Fri Dec 2 19:39:56 UTC 2016 - eich@suse.com + +- Create slurm user/group in preinstall script. + +------------------------------------------------------------------- +Wed Nov 30 15:16:05 UTC 2016 - eich@suse.com + +- Keep %{_libdir}/libpmi* and %{_libdir}/mpi_pmi2* on SUSE. + +------------------------------------------------------------------- +Tue Nov 22 21:42:04 UTC 2016 - eich@suse.com + +- Fix build with and without OHPC_BUILD define. +- Fix build for systemd and non-systemd. + +------------------------------------------------------------------- +Fri Nov 4 20:15:47 UTC 2016 - eich@suse.com + +- Updated to 16-05-5 - equivalent to OpenHPC 1.2. + * Fix issue with resizing jobs and limits not be kept track of correctly. + * BGQ - Remove redeclaration of job_read_lock. + * BGQ - Tighter locks around structures when nodes/cables change state. + * Make it possible to change CPUsPerTask with scontrol. + * Make it so scontrol update part qos= will take away a partition QOS from + a partition. + * Backfill scheduling properly synchronized with Cray Node Health Check. + Prior logic could result in highest priority job getting improperly + postponed. + * Make it so daemons also support TopologyParam=NoInAddrAny. + * If scancel is operating on large number of jobs and RPC responses from + slurmctld daemon are slow then introduce a delay in sending the cancel job + requests from scancel in order to reduce load on slurmctld. + * Remove redundant logic when updating a job's task count. + * MySQL - Fix querying jobs with reservations when the id's have rolled. + * Perl - Fix use of uninitialized variable in slurm_job_step_get_pids.
+ * Launch batch job requesting --reboot after the boot completes. + * Do not attempt to power down a node which has never responded if the + slurmctld daemon restarts without state. + * Fix for possible slurmstepd segfault on invalid user ID. + * MySQL - Fix for possible race condition when archiving multiple clusters + at the same time. + * Add logic so that slurmstepd can be launched under valgrind. + * Increase buffer size to read /proc/*/stat files. + * Remove the SchedulerParameters option of "assoc_limit_continue", making it + the default value. Add option of "assoc_limit_stop". If "assoc_limit_stop" + is set and a job cannot start due to association limits, then do not attempt + to initiate any lower priority jobs in that partition. Setting this can + decrease system throughput and utilization, but avoid potentially starving + larger jobs by preventing them from launching indefinitely. + * Update a node's socket and cores per socket counts as needed after a node + boot to reflect configuration changes which can occur on KNL processors. + Note that the node's total core count must not change, only the distribution + of cores across varying socket counts (KNL NUMA nodes treated as sockets by + Slurm). + * Rename partition configuration from "Shared" to "OverSubscribe". Rename + salloc, sbatch, srun option from "--shared" to "--oversubscribe". The old + options will continue to function. Output field names also changed in + scontrol, sinfo, squeue and sview. + * Add SLURM_UMASK environment variable to user job. + * knl_conf: Added new configuration parameter of CapmcPollFreq. + * Cleanup two minor Coverity warnings. + * Make it so the tres units in a job's formatted string are converted like + they are in a step. + * Correct partition's MaxCPUsPerNode enforcement when nodes are shared by + multiple partitions. + * node_feature/knl_cray - Prevent slurmctld GRES errors for "hbm" references.
+ * Display thread name instead of thread id and remove process name in stderr + logging for "thread_id" LogTimeFormat. + * Log IP address of bad incoming message to slurmctld. + * If a user requests tasks, nodes and ntasks-per-node and + tasks-per-node/nodes != tasks print warning and ignore ntasks-per-node. + * Release CPU "owner" file locks. + * Update seff to fix warnings with ncpus, and list slurm-perlapi dependency + in spec file. + * Allow QOS timelimit to override partition timelimit when EnforcePartLimits + is set to all/any. + * Make it so qsub will do a "basename" on a wrapped command for the output + and error files. + * Add logic so that slurmstepd can be launched under valgrind. + * Increase buffer size to read /proc/*/stat files. + * Prevent job stuck in configuring state if slurmctld daemon restarted while + PrologSlurmctld is running. Also re-issue burst_buffer/pre-load operation + as needed. + * Move test for job wait reason value of BurstBufferResources and + BurstBufferStageIn later in the scheduling logic. + * Document which srun options apply to only job, only step, or job and step + allocations. + * Use more compatible function to get thread name (>= 2.6.11). + * Make it so the extern step uses a reverse tree when cleaning up. + * If extern step doesn't get added into the proctrack plugin make sure the + sleep is killed. + * Add web links to Slurm Diamond Collectors (from Harvard University) and + collectd (from EDF). + * Add job_submit plugin for the "reboot" field. + * Make some more Slurm constants (INFINITE, NO_VAL64, etc.) available to + job_submit/lua plugins. + * Send in a -1 for a taskid into spank_task_post_fork for the extern_step. + * MYSQL - Slightly better logic if a job completion comes in with an end time + of 0. + * If the task/cgroup plugin is configured with ConstrainRAMSpace=yes, then set soft + memory limit to allocated memory limit (previously no soft limit was set).
+ * Streamline when schedule() is called when running with message aggregation + on batch script completes. + * Fix incorrect casting when [un]packing derived_ec on slurmdb_job_rec_t. + * Document that persistent burst buffers can not be created or destroyed using + the salloc or srun --bb options. + * Add support for setting the SLURM_JOB_ACCOUNT, SLURM_JOB_QOS and + SLURM_JOB_RESERVATION environment variables for the salloc command. + Document the same environment variables for the salloc, sbatch and srun + commands in their man pages. + * Fix issue where sacctmgr load cluster.cfg wouldn't load associations + that had a partition in them. + * Don't return the extern step from sstat by default. + * In sstat print 'extern' instead of 4294967295 for the extern step. + * Make advanced reservations work properly with core specialization. + * slurmstepd modified to pre-load all relevant plugins at startup to avoid + the possibility of modified plugins later resulting in inconsistent API + or data structures and a failure of slurmstepd. + * Export functions from parse_time.c in libslurm.so. + * Export unit convert functions from slurm_protocol_api.c in libslurm.so. + * Fix scancel to allow multiple steps from a job to be cancelled at once. + * Update and expand upgrade guide (in Quick Start Administrator web page). + * burst_buffer/cray: Requeue, but do not hold a job which fails the pre_run + operation. + * Insure reported expected job start time is not in the past for pending jobs. + * Add support for PMIx v2. + + Required for FATE#316379. + +------------------------------------------------------------------- +Mon Oct 17 13:25:52 UTC 2016 - eich@suse.com + +- Setting 'download_files' service to mode='localonly' + and adding source tarball. (Required for Factory).
+ +------------------------------------------------------------------- +Sat Oct 15 18:11:39 UTC 2016 - eich@suse.com + +- version 15.08.7.1 + * Remove the 1024-character limit on lines in batch scripts. + * task/affinity: Disable core-level task binding if more CPUs required than + available cores. + * Preemption/gang scheduling: If a job is suspended at slurmctld restart or + reconfiguration time, then leave it suspended rather than resume+suspend. + * Don't use lower weight nodes for job allocation when topology/tree used. + * Don't allow user specified reservation names to disrupt the normal + reservation sequence numbering scheme. + * Avoid hard-link/copy of script/environment files for job arrays. Use the + master job record file for all tasks of the job array. + NOTE: Job arrays submitted to Slurm version 15.08.6 or later will fail if + the slurmctld daemon is downgraded to an earlier version of Slurm. + * In slurmctld log file, log duplicate job ID found by slurmd. Previously was + being logged as prolog/epilog failure. + * If a job is requeued while in the process of being launched, remove its + job ID from slurmd's record of active jobs in order to avoid generating a + duplicate job ID error when launched for the second time (which would + drain the node). + * Cleanup messages when handling job script and environment variables in + older directory structure formats. + * Prevent triggering gang scheduling within a partition if configured with + PreemptType=partition_prio and PreemptMode=suspend,gang. + * Decrease parallelism in job cancel request to prevent denial of service + when cancelling huge numbers of jobs. + * If all ephemeral ports are in use, try using other port numbers. + * Prevent "scontrol update job" from updating jobs that have already finished. + * Show requested TRES in "squeue -O tres" when job is pending. + * Backfill scheduler: Test association and QOS node limits before reserving + resources for pending job. + * Many bug fixes.
+- Use source services to download package. +- Fix code for new API of hwloc-2.0. +- package netloc_to_topology where available. +- Package documentation. + +------------------------------------------------------------------- +Sun Nov 1 13:45:52 UTC 2015 - scorot@free.fr + +- version 15.08.3 + * Many new features and bug fixes. See NEWS file +- update files list accordingly +- fix wrong end of line in some files + +------------------------------------------------------------------- +Thu Aug 6 19:06:18 UTC 2015 - scorot@free.fr + +- version 14.11.8 + * Many bug fixes. See NEWS file +- update files list accordingly + +------------------------------------------------------------------- +Sun Nov 2 22:12:34 UTC 2014 - scorot@free.fr + +- add missing systemd requirements +- add missing rclink + +------------------------------------------------------------------- +Sun Nov 2 15:04:42 UTC 2014 - scorot@free.fr + +- version 14.03.9 + * Many bug fixes. See NEWS file +- add systemd support + +------------------------------------------------------------------- +Sat Jul 26 10:22:32 UTC 2014 - scorot@free.fr + +- version 14.03.6 + * Added support for native Slurm operation on Cray systems + (without ALPS). + * Added partition configuration parameters AllowAccounts, + AllowQOS, DenyAccounts and DenyQOS to provide greater control + over use. + * Added the ability to perform load based scheduling. Allocating + resources to jobs on the nodes with the largest number of idle + CPUs. + * Added support for reserving cores on a compute node for system + services (core specialization) + * Add mechanism for job_submit plugin to generate error message + for srun, salloc or sbatch to stderr. + * Support for Postgres database has long since been out of date + and problematic, so it has been removed entirely. If you + would like to use it the code still exists in <= 2.6, but will + not be included in this and future versions of the code.
+ * Added new structures and support for both server and cluster + resources. + * Significant performance improvements, especially with respect + to job array support. +- update files list + +------------------------------------------------------------------- +Sun Mar 16 15:59:01 UTC 2014 - scorot@free.fr + +- update to version 2.6.7 + * Support for job arrays, which increases performance and ease of + use for sets of similar jobs. + * Job profiling capability added to record a wide variety of job + characteristics for each task on a user configurable periodic + basis. Data currently available includes CPU use, memory use, + energy use, Infiniband network use, Lustre file system use, etc. + * Support for MPICH2 using PMI2 communications interface with much + greater scalability. + * Prolog and epilog support for advanced reservations. + * Much faster throughput for job step execution with --exclusive + option. The srun process is notified when resources become + available rather than periodic polling. + * Support improved for Intel MIC (Many Integrated Core) processor. + * Advanced reservations with hostname and core counts now supports + asymmetric reservations (e.g. specific different core count for + each node). + * External sensor plugin infrastructure added to record power + consumption, temperature, etc. + * Improved performance for high-throughput computing. + * MapReduce+ support (launches ~1000x faster, runs ~10x faster). + * Added "MaxCPUsPerNode" partition configuration parameter. This + can be especially useful to schedule GPUs. For example a node + can be associated with two Slurm partitions (e.g. "cpu" and + "gpu") and the partition/queue "cpu" could be limited to only a + subset of the node's CPUs, insuring that one or more CPUs would + be available to jobs in the "gpu" partition/queue. 
+ +------------------------------------------------------------------- +Thu Jun 6 20:31:49 UTC 2013 - scorot@free.fr + +- version 2.5.7 + * Fix for linking to the select/cray plugin to not give warning + about undefined variable. + * Add missing symbols to the xlator.h + * Avoid placing pending jobs in AdminHold state due to backfill + scheduler interactions with advanced reservation. + * Accounting - make average by task not cpu. + * POE - Correct logic to support poe option "-euidevice sn_all" + and "-euidevice sn_single". + * Accounting - Fix minor initialization error. + * POE - Correct logic to support srun network instances count + with POE. + * POE - With the srun --launch-cmd option, report proper task + count when the --cpus-per-task option is used without the + --ntasks option. + * POE - Fix logic binding tasks to CPUs. + * sview - Fix race condition where new information could of + slipped past the node tab and we didn't notice. + * Accounting - Fix an invalid memory read when slurmctld sends + data about start job to slurmdbd. + * If a prolog or epilog failure occurs, drain the node rather + than setting it down and killing all of its jobs. + * Priority/multifactor - Avoid underflow in half-life calculation. + * POE - pack missing variable to allow fanout (more than 32 + nodes) + * Prevent clearing reason field for pending jobs. This bug was + introduced in v2.5.5 (see "Reject job at submit time ..."). + * BGQ - Fix issue with preemption on sub-block jobs where a job + would kill all preemptable jobs on the midplane instead of just + the ones it needed to. + * switch/nrt - Validate dynamic window allocation size. + * BGQ - When --geo is requested do not impose the default + conn_types. + * RebootNode logic - Defers (rather than forgets) reboot request + with job running on the node within a reservation. + * switch/nrt - Correct network_id use logic. Correct support for + user sn_all and sn_single options. 
+ * sched/backfill - Modify logic to reduce overhead under heavy + load. + * Fix job step allocation with --exclusive and --hostlist option. + * Select/cons_res - Fix bug resulting in error of "cons_res: sync + loop not progressing, holding job #" + * checkpoint/blcr - Reset max_nodes from zero to NO_VAL on job + restart. + * launch/poe - Fix for hostlist file support with repeated host + names. + * priority/multifactor2 - Prevent possible divide by zero. + * srun - Don't check for executable if --test-only flag is + used. + * energy - On a single node only use the last task for gathering + energy. Since we don't currently track energy usage per task + (only per step). Otherwise we get double the energy. + +------------------------------------------------------------------- +Sat Apr 6 11:13:17 UTC 2013 - scorot@free.fr + +- version 2.5.4 + * Support for Intel® Many Integrated Core (MIC) processors. + * User control over CPU frequency of each job step. + * Recording power usage information for each job. + * Advanced reservation of cores rather than whole nodes. + * Integration with IBM's Parallel Environment including POE (Parallel + Operating Environment) and NRT (Network Resource Table) API. + * Highly optimized throughput for serial jobs in a new + "select/serial" plugin. + * CPU load information is available + * Configurable number of CPUs available to jobs in each SLURM + partition, which provides a mechanism to reserve CPUs for use + with GPUs.
+ +------------------------------------------------------------------- +Sat Nov 17 18:02:16 UTC 2012 - scorot@free.fr + +- remove runlevel 4 from init script thanks to patch1 +- fix self obsoletion of slurm-munge package +- use fdupes to remove duplicates +- spec file reformatting + +------------------------------------------------------------------- +Sat Nov 17 17:30:11 UTC 2012 - scorot@free.fr + +- put perl macro in a better place within install section + +------------------------------------------------------------------- +Sat Nov 17 17:01:20 UTC 2012 - scorot@free.fr + +- enable numa on x86_64 arch only + +------------------------------------------------------------------- +Sat Nov 17 16:54:18 UTC 2012 - scorot@free.fr + +- add numa and hwloc support +- fix rpath with patch0 + +------------------------------------------------------------------- +Fri Nov 16 21:46:49 UTC 2012 - scorot@free.fr + +- fix perl module files list + +------------------------------------------------------------------- +Mon Nov 5 21:48:52 UTC 2012 - scorot@free.fr + +- use perl_process_packlist macro for the perl files cleanup +- fix some summaries length +- add cgroups directory and example the cgroup.release_common file + +------------------------------------------------------------------- +Sat Nov 3 18:19:59 UTC 2012 - scorot@free.fr + +- spec file cleanup + +------------------------------------------------------------------- +Sat Nov 3 15:57:47 UTC 2012 - scorot@free.fr + +- first package + diff --git a/slurm.spec b/slurm.spec new file mode 100644 index 0000000..d0e1eda --- /dev/null +++ b/slurm.spec @@ -0,0 +1,1395 @@ +# +# spec file for package slurm +# +# Copyright (c) 2025 SUSE LLC +# +# All modifications and additions to the file contributed by third parties +# remain the property of their copyright owners, unless otherwise agreed +# upon.
The license for this file, and modifications and additions to the +# file, is the same license as for the pristine package itself (unless the +# license for the pristine package is not an Open Source License, in which +# case the license is the MIT License). An "Open Source License" is a +# license that conforms to the Open Source Definition (Version 1.9) +# published by the Open Source Initiative. + +# Please submit bugfixes or comments via https://bugs.opensuse.org/ +# + + +# Check file META in sources: update so_version to (API_CURRENT - API_AGE) +%define so_version 42 +# Make sure to update `upgrades` as well! +%define ver 24.11.0 +%define _ver _24_11 +%define dl_ver %{ver} +# so-version is 0 and seems to be stable +%define pmi_so 0 +%define nss_so 2 +%define pmix_so 2 +%define ver_major %(ver=%{version}; echo ${ver%.*}) + +%define pname slurm + +%define slurm_testsuite 1 + +ExcludeArch: i586 %arm s390 +%if 0%{?suse_version} < 1500 +ExcludeArch: s390x +%endif + +%if 0%{?suse_version} < 1315 +ExclusiveArch: do_not_build +%endif +%if 0%{?sle_version} == 120200 +%define base_ver 1702 +%define nocheck 1 +%endif +%if 0%{?sle_version} == 150000 +%define base_ver 1711 +%endif +%if 0%{?sle_version} == 150100 +%define base_ver 1808 +%endif +%if 0%{?sle_version} == 150200 +%define base_ver 2002 +%endif +%if 0%{?sle_version} == 150300 || 0%{?sle_version} == 150400 +%define base_ver 2011 +%endif +%if 0%{?sle_version} == 150500 || 0%{?sle_version} == 150600 +%define base_ver 2302 +%endif +%if 0%{?sle_version} == 150500 || 0%{?sle_version} == 150600 +%define base_ver 2302 +%endif +%if 0%{?sle_version} == 150700 +%define base_ver 2411 +%endif + +%define ver_m %{lua:x=string.gsub(rpm.expand("%ver"),"%.[^%.]*$","");print(x)} +# Keep format_spec_file from botching the define below: +%if 1 == 1 +%define base_conflicts() %{?nil: # +Conflicts: %{*} < %{ver_m}.0 +Conflicts: %{*} >= %{ver_m}.99 } +%endif + +%if 0%{?base_ver} > 0 && 0%{?base_ver} < 
%{lua:x=string.gsub(rpm.expand("%_ver"),"_","");print(x)}
+%define upgrade 1
+%endif
+%define upgrade_versions upgrades
+# do_obsoletes <pkg>: print one "Obsoletes: <pkg> = <line>" for every line of
+# the upgrade_versions source file; if that file cannot be opened, print a
+# single "Obsoletes: <pkg> < <version>" instead.
+%define do_obsoletes() %{lua:
+    local filename = rpm.expand("%_sourcedir") .. "/" .. rpm.expand("%upgrade_versions")
+    local version = rpm.expand("%version")
+    local arg = rpm.expand("%{1}")
+    local f = io.open(filename ,"r")
+    local em = false
+    if f~=nil then
+        f.close(f)
+        for line in io.lines(filename) do
+            if em then print('\\n') else em = true end
+            print("Obsoletes: " .. arg .. " = " .. line)
+        end
+    else
+        print("Obsoletes: " .. arg .. " < " .. version)
+    end }
+
+# upgrade_dep <pkg>: expands only when "upgrade" is defined; the renamed
+# package then provides, obsoletes and conflicts with the plain name.
+%define upgrade_dep() %{?upgrade: #
+Provides: %{*} = %{version}
+%{expand:%%do_obsoletes %{*}}
+Conflicts: %{*} }
+
+%if 0%{?suse_version} >= 1500
+%define have_sysuser 1
+%endif
+
+# Build with PMIx only for SLE >= 15.0 and TW
+# Both operands use the conditional form so an undefined macro evaluates to 0
+# instead of leaving an unexpanded macro in the expression.
+%if 0%{?sle_version} >= 150000 || 0%{?suse_version} >= 1550
+%{bcond_without pmix}
+%else
+%{bcond_with pmix}
+%endif
+
+%define python_ver 3
+%if 0%{?sle_version} >= 150000 || 0%{?is_opensuse}
+%define have_apache_rpm_macros 1
+%define have_http_parser 1
+%endif
+
+# it seems as disabling slurmrestd has no effect on 22.05
+%if 0%{?have_http_parser}
+%define build_slurmrestd 1
+%endif
+
+%if 0
+%define have_netloc 1
+%endif
+
+%if 0%{?suse_version} >= 1500
+%define have_hdf5 1
+%define have_boolean_deps 1
+%define have_lz4 1
+%define have_firewalld 1
+%endif
+
+%ifarch x86_64
+ %define have_libnuma 1
+%else
+ %ifarch %{ix86}
+ %if 0%{?sle_version} >= 120200
+ %define have_libnuma 1
+ %endif
+ %endif
+%endif
+
+%define slurm_u %pname
+%define slurm_g %pname
+%define slurm_uid 120
+%define slurmdir %{_rundir}/slurm
+%define slurmdescr "SLURM workload manager"
+
+%define libslurm libslurm%{so_version}
+%{!?_rundir:%define _rundir /var/run}
+
+%if !0%{?_pam_moduledir:1}
+%define _pam_moduledir /%_lib
+%endif
+%if 0%{!?_pam_secconfdir:1}
+%define _pam_secconfdir %{_sysconfdir}/security
+%endif
+
+Name: %{pname}%{?upgrade:%{_ver}}
+Version: %{ver}
+Release: 0
+Summary: Simple Linux Utility for Resource Management
+License: SUSE-GPL-2.0-with-openssl-exception
+Group: Productivity/Clustering/Computing
+URL: https://www.schedmd.com
+Source: https://download.schedmd.com/slurm/%{pname}-%{dl_ver}.tar.bz2
+# List of versions consumed by the do_obsoletes macro
+Source1: %upgrade_versions
+Source2: slurm-rpmlintrc
+# firewalld service definitions, installed when have_firewalld is set
+Source10: slurmd.xml
+Source11: slurmctld.xml
+Source12: slurmdbd.xml
+# create: tar --owner=nobody --group=nogroup --exclude=*~ -cvzf test_setup.tar.gz test_setup
+Source20: test_setup.tar.gz
+Source21: README_Testsuite.md
+Patch0: Remove-rpath-from-build.patch
+Patch2: pam_slurm-Initialize-arrays-and-pass-sizes.patch
+Patch15: Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch
+
+%{upgrade_dep %pname}
+Requires: %{name}-config = %{version}
+# With boolean dependencies the munge subpackage is required only if munge
+# itself is installed; otherwise it is merely recommended.
+%if 0%{?have_boolean_deps}
+Requires: (%{name}-munge = %version if munge)
+%else
+Recommends: %{name}-munge = %version
+%endif
+Requires(pre): %{name}-node = %{version}
+Recommends: %{name}-config-man = %{version}
+Recommends: %{name}-doc = %{version}
+BuildRequires: autoconf
+BuildRequires: automake
+BuildRequires: coreutils
+BuildRequires: fdupes
+%{?have_firewalld:BuildRequires: firewalld}
+BuildRequires: gcc-c++
+BuildRequires: gtk2-devel
+%if 0%{?have_hdf5}
+BuildRequires: hdf5-devel
+%endif
+BuildRequires: libbitmask-devel
+BuildRequires: libcpuset-devel
+BuildRequires: python%{?python_ver}
+%if 0%{?have_libnuma}
+BuildRequires: libnuma-devel
+%endif
+BuildRequires: mysql-devel >= 5.0.0
+BuildRequires: ncurses-devel
+%{?with_pmix:BuildRequires: pmix-devel}
+BuildRequires: openssl-devel >= 0.9.6
+BuildRequires: pkgconfig
+BuildRequires: readline-devel
+# SLE 12 SP4 up to (not including) SLE 15 uses infiniband-diags-devel,
+# every other matching release uses libibmad-devel.
+%if 0%{?suse_version} > 1310 || 0%{?sle_version}
+ %if 0%{?sle_version} >= 120400 && 0%{?sle_version} < 150000
+BuildRequires: infiniband-diags-devel
+ %else
+BuildRequires: libibmad-devel
+ %endif
+BuildRequires: libibumad-devel
+%endif
+%if 0%{?suse_version} > 1140
+BuildRequires: libhwloc-devel
+# freeipmi is a build dependency on x86 only; the files list for the plugins
+# package has a matching architecture condition.
+%ifarch %{ix86} x86_64
+BuildRequires: freeipmi-devel
+%endif
+%endif
+BuildRequires: libcurl-devel
+BuildRequires: libjson-c-devel
+%if 0%{?have_lz4}
+BuildRequires: liblz4-devel
+%endif
+BuildRequires: libssh2-devel
+BuildRequires: libyaml-devel
+BuildRequires: rrdtool-devel
+%{?have_sysuser:BuildRequires: sysuser-tools}
+%{?systemd_ordering}
+BuildRequires: dejagnu
+BuildRequires: zlib-devel
+BuildRequires: pkgconfig(dbus-1)
+BuildRequires: pkgconfig(systemd)
+BuildRoot: %{_tmppath}/%{name}-%{version}-build
+Obsoletes: slurm-sched-wiki < %{version}
+Obsoletes: slurmdb-direct < %{version}
+
+%description
+SLURM is a fault-tolerant scalable cluster management and job
+scheduling system for Linux clusters containing up to 65,536 nodes.
+Components include machine status, partition management, job
+management, scheduling and accounting modules.
+
+%package doc
+Summary: Documentation for SLURM
+Group: Documentation/HTML
+BuildArch: noarch
+%{upgrade_dep %{pname}-doc}
+%{base_conflicts %{pname}-config}
+
+%package webdoc
+Summary: Set up SLURM Documentation Server
+Group: Productivity/Clustering/Computing
+BuildArch: noarch
+# Without apache-rpm-macros the apache config directory is set by hand.
+%if 0%{?have_apache_rpm_macros}
+BuildRequires: apache-rpm-macros
+%else
+%define apache_sysconfdir /etc/apache2
+%endif
+Requires: slurm-doc = %{version}
+Requires(pre): apache2
+%{upgrade_dep %{pname}-webdoc}
+
+%description webdoc
+Set up an HTTP server for the SLURM documentation.
+
+%description doc
+Documentation (HTML) for the SLURM cluster management software.
+
+%package -n perl-%{name}
+Summary: Perl API to SLURM
+Group: Development/Languages/Perl
+Requires: %{name} = %{version}
+%if 0%{?suse_version} < 1140
+Requires: perl = %{perl_version}
+%else
+%{libperl_requires}
+%{perl_requires}
+%endif
+%{upgrade_dep perl-%{pname}}
+
+%description -n perl-%{name}
+This package includes the Perl API to provide an interface to SLURM
+through Perl.
+
+%package -n %{libslurm}
+# the .so number of libslurm is bumped with each major release
+# therefore no need for a version string for Leap/SLE upgrade packages
+Summary: Libraries for SLURM
+Group: System/Libraries
+Requires: %{name}-config
+# Only a config package of the same major version may be installed alongside.
+Conflicts: %{name}-config < %{ver_major}
+Conflicts: %{name}-config > %{ver_major}.99
+Provides: libslurm = %{version}
+Conflicts: libslurm
+
+%description -n %{libslurm}
+This package contains the library needed to run programs dynamically linked
+with SLURM.
+
+%package -n libpmi%{pmi_so}%{?upgrade:%{_ver}}
+Summary: SLURM PMI Library
+Group: System/Libraries
+%{upgrade_dep libpmi%{pmi_so}}
+
+%description -n libpmi%{pmi_so}%{?upgrade:%{_ver}}
+This package contains the library needed to run programs dynamically linked
+with SLURM.
+
+%package -n libnss_%{pname}%{nss_so}%{?upgrade:%{_ver}}
+Summary: NSS Plugin for SLURM
+Group: System/Libraries
+%{upgrade_dep libnss_%{pname}%{nss_so}}
+
+%description -n libnss_%{pname}%{nss_so}%{?upgrade:%{_ver}}
+libnss_slurm is an optional NSS plugin that permits password and group
+resolution for a job on a compute node to be serviced through the local
+slurmstepd process.
+
+%package devel
+Summary: Development package for SLURM
+Group: Development/Libraries/C and C++
+Requires: %{libslurm} = %{version}
+Requires: %{name} = %{version}
+Requires: libpmi%{pmi_so}%{?upgrade:%{_ver}} = %{version}
+%{upgrade_dep %{pname}-devel}
+
+%description devel
+This package includes the header files for the SLURM API.
+
+%package auth-none
+Summary: SLURM auth NULL implementation (no authentication)
+Group: Productivity/Clustering/Computing
+Requires: %{name} = %{version}
+%{upgrade_dep %{pname}-auth-none}
+
+%description auth-none
+This package contains the SLURM NULL authentication module.
+
+%package munge
+Summary: SLURM authentication and crypto implementation using Munge
+Group: Productivity/Clustering/Computing
+Requires: %{name}-plugins = %{version}
+Requires: munge
+BuildRequires: munge-devel
+# Replaces the auth-munge subpackage name: older versions are obsoleted and
+# the old name is still provided.
+Obsoletes: %{name}-auth-munge < %{version}
+Provides: %{name}-auth-munge = %{version}
+%{upgrade_dep %{pname}-munge}
+
+%description munge
+This package contains the SLURM authentication module for Chris Dunlap's Munge.
+
+%package sview
+Summary: SLURM graphical interface
+Group: Productivity/Clustering/Computing
+Requires: %{name}-plugins = %version
+%{upgrade_dep %{pname}-sview}
+
+%description sview
+sview is a graphical user interface to get and update state information for
+jobs, partitions, and nodes managed by SLURM.
+
+%package slurmdbd
+Summary: SLURM database daemon
+Group: Productivity/Clustering/Computing
+Requires: %{name}-config = %{version}
+Requires: %{name}-plugins = %{version}
+Requires: %{name}-sql = %{version}
+%if 0%{?suse_version} > 1310
+Recommends: mariadb
+%endif
+# munge subpackage is a weak dependency here, tied to munge being installed
+# when boolean dependencies are available.
+%if 0%{?have_boolean_deps}
+Recommends: (%{name}-munge = %version if munge)
+%else
+Recommends: %{name}-munge = %version
+%endif
+%{?systemd_ordering}
+# NOTE(review): the same two Obsoletes are also set on the main package;
+# confirm that both places are intended.
+Obsoletes: slurm-sched-wiki < %{version}
+Obsoletes: slurmdb-direct < %{version}
+%{upgrade_dep %{pname}-slurmdbd}
+
+%description slurmdbd
+The SLURM database daemon provides accounting of jobs in a database.
+
+%package sql
+Summary: Slurm SQL support
+Group: Productivity/Clustering/Computing
+%{upgrade_dep %{pname}-sql}
+
+%description sql
+Contains interfaces to MySQL for use by SLURM.
+
+%package plugins
+Summary: SLURM plugins (loadable shared objects)
+Group: Productivity/Clustering/Computing
+%{upgrade_dep %{pname}-plugins}
+# PMIx runtime dependencies are added only when building with pmix.
+%if %{with pmix}
+Requires: libpmix%{pmix_so}
+Requires: pmix
+%endif
+Requires: %{name}-config = %{version}
+
+%description plugins
+This package contains the SLURM plugins (loadable shared objects).
+
+%package torque
+Summary: Wrappers for transition from Torque/PBS to SLURM
+Group: Productivity/Clustering/Computing
+Requires: perl-%{name} = %{version}
+Requires: perl-Switch
+Provides: torque-client
+Requires: %{name}-plugins = %{version}
+%{upgrade_dep %{pname}-torque}
+
+%description torque
+Wrapper scripts for aiding migration from Torque/PBS to SLURM.
+
+%package openlava
+Summary: Wrappers for transition from OpenLava/LSF to Slurm
+Group: Productivity/Clustering/Computing
+Requires: perl-%{name} = %{version}
+BuildArch: noarch
+%{upgrade_dep %{pname}-openlava}
+
+%description openlava
+Wrapper scripts for aiding migration from OpenLava/LSF to Slurm.
+
+%package seff
+Summary: Mail tool that includes job statistics in user notification email
+Group: Productivity/Clustering/Computing
+Requires: perl-%{name} = %{version}
+BuildArch: noarch
+%{upgrade_dep %{pname}-seff}
+
+%description seff
+Mail program used directly by the SLURM daemons. On completion of a job,
+it waits for accounting information to be available and includes that
+information in the email body.
+
+%package sjstat
+Summary: Perl tool to print SLURM job state information
+Group: Productivity/Clustering/Computing
+Requires: %{name} = %{version}
+BuildArch: noarch
+%{upgrade_dep %{pname}-sjstat}
+%if 0%{?suse_version} < 1140
+Requires: perl = %{perl_version}
+%else
+%{perl_requires}
+%endif
+
+%description sjstat
+This package contains a Perl tool to print SLURM job state information.
+
+%package pam_slurm
+Summary: PAM module for restricting access to compute nodes via SLURM
+Group: Productivity/Clustering/Computing
+Requires: %{name}-node = %{version}
+%{upgrade_dep %{pname}-pam_slurm}
+BuildRequires: pam-devel
+
+%description pam_slurm
+This module restricts access to compute nodes in a cluster where the Simple
+Linux Utility for Resource Management (SLURM) is in use. Access is granted
+to root, any user with a SLURM-launched job currently running on the node,
+or any user who has allocated resources on the node according to SLURM.
+
+%package lua
+Summary: Lua API for SLURM
+Group: Development/Languages/Other
+Requires: %{name} = %{version}
+%{upgrade_dep %{pname}-lua}
+BuildRequires: lua-devel
+
+%description lua
+This package includes the Lua API to provide an interface to SLURM
+through Lua.
+
+%package rest
+Summary: Slurm REST API Interface
+Group: Productivity/Clustering/Computing
+Requires: %{name}-config = %{version}
+%if 0%{?have_http_parser}
+BuildRequires: http-parser-devel
+%endif
+%if 0%{?have_boolean_deps}
+Recommends: (%{name}-munge = %version if munge)
+%else
+Recommends: %{name}-munge = %version
+%endif
+%{upgrade_dep %{pname}-rest}
+
+%description rest
+This package provides the interface to SLURM via REST API.
+
+%package node
+Summary: Minimal slurm node
+Group: Productivity/Clustering/Computing
+Requires: %{name}-config = %{version}
+Requires: %{name}-plugins = %{version}
+%if 0%{?have_boolean_deps}
+Recommends: (%{name}-munge = %version if munge)
+%else
+Recommends: %{name}-munge = %version
+%endif
+%{?with_pmix:Recommends: pmix-devel}
+%{?systemd_ordering}
+%{upgrade_dep %{pname}-node}
+
+%description node
+This package contains just the minimal code to run a compute node.
+
+%package config
+Summary: Config files and directories for slurm services
+Group: Productivity/Clustering/Computing
+%{?sysusers_requires}
+Requires: logrotate
+BuildArch: noarch
+# The tool package needed by the pre scriptlet differs between old and
+# current distributions.
+%if 0%{?suse_version} <= 1140
+Requires(pre): pwdutils
+%else
+Requires(pre): shadow
+%endif
+%{?systemd_ordering}
+%{upgrade_dep %{pname}-config}
+
+%description config
+This package contains the slurm config files and the necessary directories
+for the slurm daemons.
+
+%package config-man
+Summary: Man pages for the slurm config files
+Group: Documentation/Man
+BuildArch: noarch
+%{upgrade_dep %{pname}-config-man}
+%{base_conflicts %{pname}-config}
+
+%description config-man
+Man pages for the SLURM cluster management software config files.
+
+%package hdf5
+Summary: Store accounting data in hdf5
+Group: Productivity/Clustering/Computing
+%{upgrade_dep %{pname}-hdf5}
+Requires: %{name}-plugins = %version
+
+%description hdf5
+Plugin to store accounting in the hdf5 file format. This plugin has to be
+activated in the slurm configuration. Also includes the utility program
+sh5util to merge these hdf5 files or extract data from them.
+
+%package cray
+Summary: Cray specific plugins
+Group: Productivity/Clustering/Computing
+%{upgrade_dep %{pname}-cray}
+
+%description cray
+Plugins for specific cray hardware, includes power and knl node management.
+Contains also cray specific documentation.
+
+# Certain packages are not shipped with SLE.
+%define ts_depends %{?sle_version:Recommends}%{!?sle_version:Requires}
+
+%package testsuite
+Summary: Regression tests from Slurm sources
+Group: Productivity/Clustering/Computing
+%{upgrade_dep %{pname}-testsuite}
+Requires: %{name} = %version
+Requires: %{name}-auth-none = %version
+Requires: %{name}-cray = %version
+Requires: %{name}-devel = %version
+%{?have_hdf5:%ts_depends: %{name}-hdf5 = %version}
+Requires: %{name}-lua = %version
+Requires: %{name}-munge = %version
+Requires: %{name}-node = %version
+Requires: %{name}-openlava = %version
+# The rest subpackage only has a files section when slurmrestd is built, so
+# depend on it only in that case to keep this package installable.
+%{?build_slurmrestd:Requires: %{name}-rest = %version}
+Requires: %{name}-seff = %version
+Requires: %{name}-sjstat = %version
+Requires: %{name}-slurmdbd = %version
+Requires: %{name}-sql = %version
+Requires: %{name}-torque = %version
+Requires: mariadb
+%{?with_pmix:Requires: pmix-devel}
+Requires: bind-utils
+Requires: bzip2
+Requires: expect
+Requires: gcc-c++
+Requires: libnuma-devel
+%ts_depends: openmpi4-gnu-hpc-devel
+Requires: pam
+Requires: pdsh
+Requires: perl-%{name} = %version
+Requires: sudo
+Requires: tar
+BuildRequires: sudo
+
+%description testsuite
+NOTE: THIS PACKAGE IS FOR TESTING PURPOSES ONLY. IT REQUIRES A
+DEDICATED TESTING ENVIRONMENT.
+
+DO NOT INSTALL ON A PRODUCTION SYSTEM!
+
+Slurm provides a test set implemented as 'expect' scripts.
+Not all of the tests are expected to pass, some require a modified
+configuration. This test package is meant for internal purposes.
+Do not run test suite and file bug reports for each failed test!
+
+%prep
+%setup -q -n %{pname}-%{dl_ver}
+%autopatch -p1
+
+%if 0%{?python_ver} < 3
+# Workaround for wrongly flagged python3 to keep SLE-11-SP4 building
+mkdir -p mybin; ln -s /usr/bin/python2 mybin/python3
+%endif
+
+%build
+# needed as slurm works that way bsc#1200030
+export SUSE_ZNOW=0
+
+# To make stripped object files work which we ship in the
+# testsuite package we need to build with -ffat-lto-objects.
+# This should not affect anything as we do not ship static
+# libraries and object files - except for the testsuite.
+%if "%{?_lto_cflags}" != ""
+%global _lto_cflags %{_lto_cflags} -ffat-lto-objects
+%endif
+
+autoreconf
+# Put the python3 workaround directory first in PATH when it exists
+[ -e $(pwd)/mybin ] && PATH=$(pwd)/mybin:$PATH
+%if 0%{?suse_version} < 1500
+export CFLAGS="-std=gnu99 %optflags"
+%endif
+%configure --enable-shared \
+	--disable-static \
+	--without-rpath \
+	--without-datawarp \
+	--with-shared-libslurm \
+	--with-pam_dir=%_pam_moduledir \
+	%{?with_pmix:--with-pmix=%_prefix/} \
+%if 0%{!?build_slurmrestd:1}
+	--disable-slurmrestd \
+%endif
+	--with-yaml \
+%{!?have_netloc:--without-netloc} \
+	--sysconfdir=%{_sysconfdir}/%{pname} \
+%{!?have_hdf5:--without-hdf5} \
+%{!?have_lz4:--without-lz4}
+
+make %{?_smp_mflags}
+
+%install
+[ -e $(pwd)/mybin ] && PATH=$(pwd)/mybin:$PATH
+%make_install
+make install-contrib DESTDIR=%{buildroot} PERL_MM_PARAMS="INSTALLDIRS=vendor"
+
+# systemd units plus one rc<service> symlink to /usr/sbin/service per daemon
+mkdir -p %{buildroot}%{_unitdir}
+install -p -m644 etc/slurmd.service etc/slurmdbd.service etc/slurmctld.service %{buildroot}%{_unitdir}
+ln -s /usr/sbin/service %{buildroot}%{_sbindir}/rcslurmd
+ln -s /usr/sbin/service %{buildroot}%{_sbindir}/rcslurmdbd
+ln -s /usr/sbin/service %{buildroot}%{_sbindir}/rcslurmctld
+%if 0%{?build_slurmrestd}
+install -p -m644 etc/slurmrestd.service %{buildroot}%{_unitdir}
+ln -s /usr/sbin/service %{buildroot}%{_sbindir}/rcslurmrestd
+%endif
+# tmpfiles.d entry for the runtime directory
+install -d -m 0755 %{buildroot}/%{_tmpfilesdir}/
+cat <<-EOF > %{buildroot}/%{_tmpfilesdir}/%{pname}.conf
+	# Create a directory with permissions 0700 owned by user slurm, group slurm
+	d %{_rundir}/slurm 0700 slurm slurm
+EOF
+chmod 0644 %{buildroot}/%{_tmpfilesdir}/%{pname}.conf
+mkdir -p %{buildroot}%{_localstatedir}/spool/slurm
+
+# Example configs from the source tree; cgroup.conf is installed under its
+# final name, slurm.conf keeps the .example suffix.
+install -D -m644 etc/cgroup.conf.example %{buildroot}/%{_sysconfdir}/%{pname}/cgroup.conf
+install -D -m644 etc/slurm.conf.example %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf.example
+install -D -m600 etc/slurmdbd.conf.example
%{buildroot}/%{_sysconfdir}/%{pname}/slurmdbd.conf +install -D -m755 contribs/sjstat %{buildroot}%{_bindir}/sjstat +install -D -m755 contribs/sgather/sgather %{buildroot}%{_bindir}/sgather +%if 0%{?have_firewalld} +install -D -m644 %{S:10} %{buildroot}/%{_prefix}/lib/firewalld/services/slurmd.xml +install -D -m644 %{S:11} %{buildroot}/%{_prefix}/lib/firewalld/services/slurmctld.xml +install -D -m644 %{S:12} %{buildroot}/%{_prefix}/lib/firewalld/services/slurmdbd.xml +%endif + +cat <%{buildroot}%{_sysconfdir}/%{pname}/plugstack.conf +include %{_sysconfdir}/%{pname}/plugstack.conf.d/*.conf +EOF + +mkdir -p %{buildroot}%{_sysconfdir}/%{pname}/plugstack.conf.d + +cp contribs/pam_slurm_adopt/README ../README.pam_slurm_adopt +cp contribs/pam/README ../README.pam_slurm +# remove static pam libs +rm -v %{buildroot}%{_pam_moduledir}/*la +# change slurm.conf for our needs +head -n -2 %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf.example | grep -v ReturnToService > %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf +sed -i 's#\(StateSaveLocation=\).*#\1%_localstatedir/lib/slurm#' %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf +sed -i 's#^\(SlurmdPidFile=\).*$#\1%{_localstatedir}/run/slurm/slurmd.pid#' %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf +sed -i 's#^\(SlurmctldPidFile=\).*$#\1%{_localstatedir}/run/slurm/slurmctld.pid#' %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf +sed -i 's#^\(SlurmdSpoolDir=\)/.*#\1%{_localstatedir}/spool/slurm#' %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf +sed -i -e '/^ControlMachine=/i# Ordered List of Control Nodes' \ + -e 's#ControlMachine=\(.*\)$#SlurmctldHost=\1(10.0.10.20)#' \ + -e 's#BackupController=.*#SlurmctldHost=linux1(10.0.10.21)#' \ + -e '/.*ControlAddr=.*/d' \ + -e '/.*BackupAddr=.*/d' %{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf +cat >>%{buildroot}/%{_sysconfdir}/%{pname}/slurm.conf < system-user-%{pname}.conf +%sysusers_generate_pre system-user-%{pname}.conf %{pname} system-user-%{pname}.conf +install -D -m 644 
system-user-%{pname}.conf %{buildroot}%{_sysusersdir}/system-user-%{pname}.conf +%endif + +# Delete static files: +rm -rf %{buildroot}/%{_libdir}/slurm/*.{a,la} \ + %{buildroot}/%{_libdir}/*.la \ + %{buildroot}/%_lib/security/*.la + +# Fix perl +rm %{buildroot}%{perl_archlib}/perllocal.pod \ + %{buildroot}%{perl_sitearch}/auto/Slurm/.packlist \ + %{buildroot}%{perl_sitearch}/auto/Slurmdb/.packlist + +# Fix shell completion bindings +for i in `find %{buildroot}/usr/share/bash-completion/completions/ -type l`; do + ln -sf $(basename $(readlink -f $i)) $i; +done + +mkdir -p %{buildroot}%{perl_vendorarch} + +mv %{buildroot}%{perl_sitearch}/* \ + %{buildroot}%{perl_vendorarch} + +# Remove Cray specific binaries +rm -f %{buildroot}/%{_sbindir}/capmc_suspend \ + %{buildroot}/%{_sbindir}/capmc_resume + +# Build man pages that are generated directly by the tools +%{buildroot}%{_bindir}/sjobexitmod --roff > %{buildroot}/%{_mandir}/man1/sjobexitmod.1 +%{buildroot}%{_bindir}/sjstat --roff > %{buildroot}/%{_mandir}/man1/sjstat.1 + +# avoid conflicts with other packages, make wrapper unique +mv %{buildroot}/%{_bindir}/mpiexec %{buildroot}/%{_bindir}/mpiexec.slurm + +mkdir -p %{buildroot}/etc/ld.so.conf.d +echo '%{_libdir}/slurm' > %{buildroot}/etc/ld.so.conf.d/slurm.conf +chmod 644 %{buildroot}/etc/ld.so.conf.d/slurm.conf + +# Make pkg-config file +mkdir -p %{buildroot}/%{_libdir}/pkgconfig +cat > %{buildroot}/%{_libdir}/pkgconfig/slurm.pc < %{buildroot}/%{_sysconfdir}/logrotate.d/${service}.conf +/var/log/${service}.log { + compress + dateext + missingok + nocreate + notifempty + maxage 365 + rotate 99 + copytruncate + postrotate + pgrep ${service} && killall -SIGUSR2 ${service} || exit 0 + endscript +} +EOF +done +mkdir -p %{buildroot}/%{apache_sysconfdir}/conf.d +cat > %{buildroot}/%{apache_sysconfdir}/conf.d/slurm.conf < + AllowOverride None + DirectoryIndex slurm.html + # Controls who can get stuff from this server. 
+ + Require all granted + + + Order allow,deny + Allow from all + + +EOF +cat > %{buildroot}/%{_sysconfdir}/%{pname}/nss_slurm.conf < %{buildroot}/srv/slurm-testsuite/testsuite/expect/globals.local </dev/null ]}]} { + set mpicc "" +} +set testsuite_user "auser" +#set testsuite_cleanup_on_failure false +EOF +mkdir -p %{buildroot}/srv/slurm-testsuite/shared +mkdir -p %{buildroot}%_localstatedir/lib/slurm/shared +cd %{buildroot}/srv/slurm-testsuite +find -type f -name "*.[ao]" -print | while read f; do + # drop non-deterministic lto bits from .o files + strip -p --discard-locals -R .gnu.lto_* -R .gnu.debuglto_* -N __gnu_lto_v1 $f +done +%if 0%{?suse_version} >= 1500 +%define tar_sort --sort=name +%endif +tar --group=%slurm_g --owner=%slurm_u \ + %{?tar_sort} --mtime="@${SOURCE_DATE_EPOCH:-`date +%%s`}" --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \ + -cjf /tmp/slurmtest.tar.bz2 * +cd - +rm -rf %{buildroot}/srv/slurm-testsuite +mkdir -p %{buildroot}/srv/slurm-testsuite +mkdir -p %{buildroot}/%{_datadir}/%{name} +mv /tmp/slurmtest.tar.bz2 %{buildroot}/%{_datadir}/%{name} + +mkdir -p %{buildroot}/etc/sudoers.d +echo "slurm ALL=(auser) NOPASSWD:ALL" > %{buildroot}/etc/sudoers.d/slurm +chmod 0440 %{buildroot}/etc/sudoers.d/slurm + +SLURMD_SERVICE=%{buildroot}%_sysconfdir/systemd/system/slurmd.service +mkdir -p `dirname $SLURMD_SERVICE` +cp %{buildroot}/%_unitdir/slurmd.service $SLURMD_SERVICE +if grep -qE "^LimitNPROC" $SLURMD_SERVICE; then + sed -i -e '/LimitNPROC/s@=.*@=infinity@' $SLURMD_SERVICE +else + sed -i -e '/LimitSTACK/aLimitNPROC=infinity' $SLURMD_SERVICE +fi +if grep -qE "^LimitNOFILE" $SLURMD_SERVICE; then + sed -i -e '/LimitNOFILE/s@=.*@=131072:infinity@' $SLURMD_SERVICE +else + sed -i -e '/LimitSTACK/aLimitNOFILE=131072:infinity' $SLURMD_SERVICE +fi +sed -i -e '/ExecStart/aExecStartPre=/bin/bash -c "for i in 0 1 2 3; do test -e /dev/nvidia$i || mknod /dev/nvidia$i c 10 $((i+2)); done"' $SLURMD_SERVICE + +tar -xzf %{S:20} +mkdir -p 
%{buildroot}%{_pam_secconfdir}/limits.d
+mv test_setup/slurm.conf.limits %{buildroot}%_pam_secconfdir/limits.d/slurm.conf
+# NOTE(review): sle_version evaluates to 0 when undefined, so this condition
+# is also true on non-SLE builds - confirm that is intended.
+%if 0%{?sle_version} < 150200
+sed -i -e '/hard[[:space:]]*nofile/s@unlimited@1048576@' %{buildroot}%_pam_secconfdir/limits.d/slurm.conf
+%endif
+
+mkdir -p %{buildroot}/root
+mv test_setup/setup-testsuite.sh %{buildroot}/root
+
+mkdir -p %{buildroot}/srv/slurm-testsuite/config/plugstack.conf.d
+cp %{S:21} .
+%endif
+
+%fdupes -s %{buildroot}
+# For testsuite - do after fdupes!
+[ -d test_setup -a -d %{buildroot}/srv/slurm-testsuite/config ] && \
+    mv test_setup/* %{buildroot}/srv/slurm-testsuite/config
+
+# Temporary - remove when build is fixed upstream.
+%if !0%{?build_slurmrestd}
+rm -f %{buildroot}%{_mandir}/man8/slurmrestd.*
+rm -f %{buildroot}%{_libdir}/slurm/openapi_*.so
+rm -f %{buildroot}%{_libdir}/slurm/rest_auth_*.so
+%endif
+
+%check
+%{!?nocheck:make check}
+
+# fixperm <mode> <file>: chmod the file to the given mode, but only when the
+# scriptlet argument equals 1 and the file exists.
+%define fixperm() [ $1 -eq 1 -a -e %2 ] && /bin/chmod %1 %2
+
+# Fallback when the distribution does not define
+# service_del_postun_without_restart: map it to service_del_postun with -n.
+%if 0%{!?service_del_postun_without_restart:1}
+%define service_del_postun_without_restart() %{expand:%%service_del_postun -n %{**}}
+%endif
+
+%pre
+%service_add_pre slurmctld.service
+
+%post
+%service_add_post slurmctld.service
+
+%preun
+%service_del_preun slurmctld.service
+
+%postun
+%service_del_postun_without_restart slurmctld.service
+
+%pre slurmdbd
+%service_add_pre slurmdbd.service
+
+%post slurmdbd
+%{fixperm 0600 %{_sysconfdir}/%{pname}/slurmdbd.conf}
+%{fixperm 0600 %{_sysconfdir}/%{pname}/slurmdbd.conf.example}
+%service_add_post slurmdbd.service
+
+%preun slurmdbd
+%service_del_preun slurmdbd.service
+
+%postun slurmdbd
+%{fixperm 0600 %{_sysconfdir}/%{pname}/slurmdbd.conf}
+%{fixperm 0600 %{_sysconfdir}/%{pname}/slurmdbd.conf.example}
+%service_del_postun_without_restart slurmdbd.service
+
+%pre node
+%service_add_pre slurmd.service
+
+%post node
+%service_add_post slurmd.service
+
+%preun node
+%service_del_preun slurmd.service
+
+%postun node
+%service_del_postun_without_restart slurmd.service
+
+%pre rest
+%service_add_pre slurmrestd.service
+
+%post rest
+%service_add_post slurmrestd.service
+
+%preun rest
+%service_del_preun slurmrestd.service
+
+%postun rest
+# Use the postun macro like the other daemons do; the restart on update is
+# handled separately in the posttrans scriptlet.
+%service_del_postun_without_restart slurmrestd.service
+
+%pre config %{?have_sysuser:-f %{pname}.pre}
+%if 0%{!?have_sysuser:1}
+# Without sysusers support the group and user are created by hand.
+getent group %slurm_g >/dev/null || groupadd -r %slurm_g
+getent passwd %slurm_u >/dev/null || useradd -r -g %slurm_g -d %slurmdir -s /bin/bash -c %{slurmdescr} %slurm_u
+[ -d %{_localstatedir}/spool/slurm ] && /bin/chown -h %slurm_u:%slurm_g %{_localstatedir}/spool/slurm
+exit 0
+%endif
+
+%post config
+%if 0%{?tmpfiles_create:1}
+ %tmpfiles_create slurm.conf
+%else
+ systemd-tmpfiles --create slurm.conf
+%endif
+
+%post -n %{libslurm} -p /sbin/ldconfig
+%postun -n %{libslurm} -p /sbin/ldconfig
+
+%post -n libpmi%{pmi_so}%{?upgrade:%{_ver}} -p /sbin/ldconfig
+%postun -n libpmi%{pmi_so}%{?upgrade:%{_ver}} -p /sbin/ldconfig
+
+%post -n libnss_%{pname}%{nss_so}%{?upgrade:%{_ver}} -p /sbin/ldconfig
+%postun -n libnss_%{pname}%{nss_so}%{?upgrade:%{_ver}} -p /sbin/ldconfig
+
+%post testsuite
+# Replace any previously unpacked tree with the shipped tarball, extracted
+# as the slurm user.
+rm -rf /srv/slurm-testsuite/src /srv/slurm-testsuite/testsuite /srv/slurm-testsuite/config.h
+runuser -u %slurm_u -- tar --same-owner -C /srv/slurm-testsuite -xjf %{_datadir}/%{name}/slurmtest.tar.bz2
+
+%preun testsuite
+rm -rf /srv/slurm-testsuite/src /srv/slurm-testsuite/testsuite \
+    /srv/slurm-testsuite/slurm /srv/slurm-testsuite/shared \
+    /srv/slurm-testsuite/config.h
+
+%if 0%{!?_restart_on_update:1}
+%define _restart_on_update() %{?nil: [ $1 -ge 1 ] && { DISABLE_RESTART_ON_UPDATE=no; \
+    [ -e /etc/sysconfig/services ] && .
/etc/sysconfig/services || : \
+    case "$DISABLE_RESTART_ON_UPDATE" in \
+    yes|1) ;; \
+    *) /usr/bin/systemctl try-restart %{*} || : ;; \
+    esac; } \
+    }
+%endif
+
+# Each daemon is try-restarted from the posttrans scriptlet of its package.
+%posttrans
+%_restart_on_update slurmctld
+
+%posttrans node
+%_restart_on_update slurmd
+
+%posttrans slurmdbd
+%_restart_on_update slurmdbd
+
+%posttrans rest
+%_restart_on_update slurmrestd
+
+# COPYING is packaged with the license tag on newer distributions and as
+# plain documentation on older ones.
+%if 0%{?sle_version} > 120200 || 0%{?suse_version} > 1320
+%define my_license %license
+%else
+%define my_license %doc
+%endif
+
+%files
+%doc AUTHORS NEWS RELEASE_NOTES DISCLAIMER
+%my_license COPYING
+%{_bindir}/sacct
+%{_bindir}/sacctmgr
+%{_bindir}/salloc
+%{_bindir}/sattach
+%{_bindir}/sbatch
+%{_bindir}/sbcast
+%{_bindir}/scancel
+%{_bindir}/scrontab
+%{_bindir}/scontrol
+%{_bindir}/sdiag
+%{_bindir}/sgather
+%{_bindir}/sinfo
+%{_bindir}/sjobexitmod
+%{_bindir}/sprio
+%{_bindir}/squeue
+%{_bindir}/sreport
+%{_bindir}/sshare
+%{_bindir}/sstat
+%{_bindir}/strigger
+%{?have_netloc:%{_bindir}/netloc_to_topology}
+%{_sbindir}/sackd
+%{_sbindir}/slurmctld
+%{_datadir}/bash-completion/completions/
+%dir %{_libdir}/slurm/src
+%{_unitdir}/slurmctld.service
+%{_sbindir}/rcslurmctld
+%{_mandir}/man1/sacct.1*
+%{_mandir}/man1/sacctmgr.1*
+%{_mandir}/man1/salloc.1*
+%{_mandir}/man1/sattach.1*
+%{_mandir}/man1/sbatch.1*
+%{_mandir}/man1/sbcast.1*
+%{_mandir}/man1/scancel.1*
+%{_mandir}/man1/scrontab.1*
+%{_mandir}/man1/scontrol.1*
+%{_mandir}/man1/sdiag.1.*
+%{_mandir}/man1/sgather.1.*
+%{_mandir}/man1/sinfo.1*
+%{_mandir}/man1/slurm.1*
+%{_mandir}/man1/sprio.1*
+%{_mandir}/man1/squeue.1*
+%{_mandir}/man1/sreport.1*
+%{_mandir}/man1/sshare.1*
+%{_mandir}/man1/sstat.1*
+%{_mandir}/man1/strigger.1*
+%{_mandir}/man1/sjobexitmod.1.*
+# NOTE(review): the sjstat man page is packaged here while the sjstat binary
+# is listed in the sjstat subpackage - confirm that is intended.
+%{_mandir}/man1/sjstat.1.*
+%{_mandir}/man8/slurmctld.*
+%{_mandir}/man8/spank.*
+%{_mandir}/man8/sackd.*
+
+%files openlava
+%{_bindir}/bjobs
+%{_bindir}/bkill
+%{_bindir}/bsub
+%{_bindir}/lsid
+
+%files seff
+%{_bindir}/seff
+%{_bindir}/smail
+
+%files doc
+%dir
%{_datadir}/doc/%{pname}-%{version}%{?rc_v:-%rc_v}
+%{_datadir}/doc/%{pname}-%{version}%{?rc_v:-%rc_v}/*
+
+%files webdoc
+%config %{apache_sysconfdir}/conf.d/slurm.conf
+
+# The library packages carry only the versioned shared objects; the
+# unversioned .so names are listed in the devel package.
+%files -n %{libslurm}
+%{_libdir}/libslurm*.so.%{so_version}*
+
+%files -n libpmi%{pmi_so}%{?upgrade:%{_ver}}
+%{_libdir}/libpmi*.so.%{pmi_so}*
+
+%files -n libnss_%{pname}%{nss_so}%{?upgrade:%{_ver}}
+%config(noreplace) %{_sysconfdir}/%{pname}/nss_slurm.conf
+%{_libdir}/libnss_slurm.so.%{nss_so}
+
+%files devel
+%{_prefix}/include/slurm
+%{_libdir}/libpmi.so
+%{_libdir}/libpmi2.so
+%{_libdir}/libslurm.so
+%{_libdir}/slurm/src/*
+%{_libdir}/pkgconfig/slurm.pc
+
+%files sview
+%{_bindir}/sview
+%{_mandir}/man1/sview.1*
+
+%files auth-none
+%{_libdir}/slurm/auth_none.so
+
+%files munge
+%{_libdir}/slurm/auth_munge.so
+%{_libdir}/slurm/cred_munge.so
+
+%files -n perl-%{name}
+%{perl_vendorarch}/Slurm.pm
+%{perl_vendorarch}/Slurm
+%{perl_vendorarch}/Slurmdb.pm
+%{perl_vendorarch}/auto/Slurm
+%{perl_vendorarch}/auto/Slurmdb
+%dir %{perl_vendorarch}/auto
+%{_mandir}/man3/Slurm*.3pm.*
+
+%files slurmdbd
+%{_sbindir}/slurmdbd
+%{_mandir}/man5/slurmdbd.*
+%{_mandir}/man8/slurmdbd.*
+# Mode 0600 and slurm ownership; the slurmdbd scriptlets apply the same mode
+# through fixperm.
+%config(noreplace) %attr(0600,%slurm_u,%slurm_g) %{_sysconfdir}/%{pname}/slurmdbd.conf
+%{_unitdir}/slurmdbd.service
+%{_sbindir}/rcslurmdbd
+
+# The plugin directory itself is owned by both the sql and plugins packages.
+%files sql
+%dir %{_libdir}/slurm
+%{_libdir}/slurm/accounting_storage_mysql.so
+%{_libdir}/slurm/jobcomp_mysql.so
+
+%files plugins
+%config %{_sysconfdir}/ld.so.conf.d/slurm.conf
+%config(noreplace) %{_sysconfdir}/%{pname}/plugstack.conf
+%dir %{_sysconfdir}/%{pname}/plugstack.conf.d
+%dir %{_libdir}/slurm
+%{_libdir}/slurm/libslurmfull.so
+%{_libdir}/slurm/accounting_storage_slurmdbd.so
+%{_libdir}/slurm/accounting_storage_ctld_relay.so
+%{_libdir}/slurm/acct_gather_energy_pm_counters.so
+%{_libdir}/slurm/acct_gather_energy_gpu.so
+%{_libdir}/slurm/acct_gather_energy_ibmaem.so
+%{_libdir}/slurm/acct_gather_energy_rapl.so
+%{_libdir}/slurm/acct_gather_interconnect_sysfs.so
+%{_libdir}/slurm/acct_gather_filesystem_lustre.so
+%{_libdir}/slurm/burst_buffer_lua.so
+%{_libdir}/slurm/burst_buffer_datawarp.so
+# data_parser plugins, one per packaged version (v0.0.40 to v0.0.42)
+%{_libdir}/slurm/data_parser_v0_0_42.so
+%{_libdir}/slurm/data_parser_v0_0_41.so
+%{_libdir}/slurm/data_parser_v0_0_40.so
+%{_libdir}/slurm/cgroup_v1.so
+# The cgroup v2 plugin is only packaged for suse_version >= 1500
+%if 0%{?suse_version} >= 1500
+%{_libdir}/slurm/cgroup_v2.so
+%endif
+%{_libdir}/slurm/cli_filter_lua.so
+%{_libdir}/slurm/cli_filter_syslog.so
+%{_libdir}/slurm/cli_filter_user_defaults.so
+%{_libdir}/slurm/cred_none.so
+%{_libdir}/slurm/gpu_generic.so
+%{_libdir}/slurm/gpu_nrt.so
+%{_libdir}/slurm/gres_gpu.so
+%{_libdir}/slurm/gres_mps.so
+%{_libdir}/slurm/gres_nic.so
+%{_libdir}/slurm/gres_shard.so
+%{_libdir}/slurm/hash_k12.so
+%{_libdir}/slurm/hash_sha3.so
+%{_libdir}/slurm/tls_none.so
+%{_libdir}/slurm/jobacct_gather_cgroup.so
+%{_libdir}/slurm/jobacct_gather_linux.so
+%{_libdir}/slurm/jobcomp_filetxt.so
+%{_libdir}/slurm/jobcomp_lua.so
+%{_libdir}/slurm/jobcomp_script.so
+%{_libdir}/slurm/job_container_tmpfs.so
+%{_libdir}/slurm/job_submit_all_partitions.so
+%{_libdir}/slurm/job_submit_defaults.so
+%{_libdir}/slurm/job_submit_logging.so
+%{_libdir}/slurm/job_submit_partition.so
+%{_libdir}/slurm/job_submit_require_timelimit.so
+%{_libdir}/slurm/job_submit_throttle.so
+%{_libdir}/slurm/libslurm_pmi.so
+%{_libdir}/slurm/mcs_account.so
+%{_libdir}/slurm/mcs_group.so
+%{_libdir}/slurm/mcs_user.so
+%{_libdir}/slurm/mpi_pmi2.so
+# The PMIx MPI plugins exist only when the package is built with pmix
+%if %{with pmix}
+%{_libdir}/slurm/mpi_pmix.so
+%{_libdir}/slurm/mpi_pmix_v3.so
+%endif
+%{_libdir}/slurm/node_features_helpers.so
+%{_libdir}/slurm/preempt_partition_prio.so
+%{_libdir}/slurm/preempt_qos.so
+%{_libdir}/slurm/prep_script.so
+%{_libdir}/slurm/priority_basic.so
+%{_libdir}/slurm/priority_multifactor.so
+%{_libdir}/slurm/proctrack_cgroup.so
+%{_libdir}/slurm/proctrack_linuxproc.so
+%{_libdir}/slurm/proctrack_pgid.so
+%{_libdir}/slurm/sched_backfill.so
+%{_libdir}/slurm/sched_builtin.so
+%{_libdir}/slurm/select_cons_tres.so
+%{_libdir}/slurm/select_linear.so +%{_libdir}/slurm/serializer_json.so +%{_libdir}/slurm/serializer_url_encoded.so +%{_libdir}/slurm/serializer_yaml.so +%{_libdir}/slurm/site_factor_example.so +%{_libdir}/slurm/switch_nvidia_imex.so +%{_libdir}/slurm/task_affinity.so +%{_libdir}/slurm/task_cgroup.so +%{_libdir}/slurm/topology_3d_torus.so +%{_libdir}/slurm/topology_block.so +%{_libdir}/slurm/topology_default.so +%{_libdir}/slurm/topology_tree.so +%if 0%{?suse_version} > 1310 +%{_libdir}/slurm/acct_gather_interconnect_ofed.so +%endif +%if 0%{?suse_version} > 1140 +%ifarch %{ix86} x86_64 +%{_libdir}/slurm/acct_gather_energy_ipmi.so +%{_libdir}/slurm/acct_gather_energy_xcc.so +%endif +%endif +%{_libdir}/slurm/node_features_knl_generic.so +%{_libdir}/slurm/acct_gather_profile_influxdb.so +%{_libdir}/slurm/jobcomp_elasticsearch.so +%{_libdir}/slurm/certmgr_script.so +%{_libdir}/slurm/gpu_nvidia.so +%{_libdir}/slurm/mcs_label.so + +%files lua +%{_libdir}/slurm/job_submit_lua.so + +%files torque +%{_bindir}/pbsnodes +%{_bindir}/qalter +%{_bindir}/qdel +%{_bindir}/qhold +%{_bindir}/qrls +%{_bindir}/qrerun +%{_bindir}/qstat +%{_bindir}/qsub +%{_bindir}/mpiexec.slurm +%{_bindir}/generate_pbs_nodefile +%{_libdir}/slurm/job_submit_pbs.so +%{_libdir}/slurm/spank_pbs.so + +%files sjstat +%{_bindir}/sjstat + +%files pam_slurm +%doc ../README.pam_slurm ../README.pam_slurm_adopt +%{_pam_moduledir}/pam_slurm.so +%{_pam_moduledir}/pam_slurm_adopt.so + +%if 0%{?build_slurmrestd} +%files rest +%{_sbindir}/slurmrestd +%{_sbindir}/rcslurmrestd +%{_unitdir}/slurmrestd.service +%{_mandir}/man8/slurmrestd.* +%{_libdir}/slurm/openapi_slurmctld.so +%{_libdir}/slurm/openapi_slurmdbd.so +%{_libdir}/slurm/rest_auth_local.so +%endif + +%files node +%{_sbindir}/slurmd +%{_sbindir}/slurmstepd +# bsc#1153095 +%{_bindir}/srun +%{_bindir}/scrun +%{_mandir}/man1/srun.1* +%{_mandir}/man1/scrun.1* +%{_mandir}/man8/slurmd.* +%{_mandir}/man8/slurmstepd* +%{_sbindir}/rcslurmd +%{_unitdir}/slurmd.service + 
+# Shared configuration package: configuration directory, main and cgroup
+# configuration files, state and spool directories, tmpfiles and logrotate
+# snippets, plus optional firewalld service files and the sysusers file.
+%files config
+%dir %{_sysconfdir}/%{pname}
+%config(noreplace) %{_sysconfdir}/%{pname}/slurm.conf
+%config %{_sysconfdir}/%{pname}/slurm.conf.example
+%config(noreplace) %{_sysconfdir}/%{pname}/cgroup.conf
+# NOTE(review): the state directory entry below carries no dir directive, so
+# anything beneath it in the buildroot is packaged as well - confirm intended.
+%attr(0755, %slurm_u, %slurm_g) %_localstatedir/lib/slurm
+%{_tmpfilesdir}/%{pname}.conf
+%{?_rundir:%ghost %{_rundir}/slurm}
+%dir %attr(0755, %slurm_u, %slurm_g) %{_localstatedir}/spool/slurm
+%config(noreplace) %{_sysconfdir}/logrotate.d/slurm*
+# firewalld service definitions are only packaged when have_firewalld is set.
+%if 0%{?have_firewalld}
+%{_prefix}/lib/firewalld/services/slurmd.xml
+%{_prefix}/lib/firewalld/services/slurmctld.xml
+%{_prefix}/lib/firewalld/services/slurmdbd.xml
+%endif
+%{?have_sysuser:%{_sysusersdir}/system-user-%{pname}.conf}
+
+# Section 5 manual pages for the configuration files. All entries glob the
+# compression suffix so the list does not depend on the compressor in use.
+%files config-man
+%{_mandir}/man5/acct_gather.conf.*
+%{_mandir}/man5/burst_buffer.conf.*
+%{_mandir}/man5/slurm.*
+%{_mandir}/man5/cgroup.*
+%{_mandir}/man5/gres.*
+%{_mandir}/man5/helpers.*
+%{_mandir}/man5/oci.conf.5.*
+%{_mandir}/man5/topology.*
+%{_mandir}/man5/knl.conf.5.*
+%{_mandir}/man5/job_container.conf.5.*
+%{_mandir}/man5/mpi.conf.5.*
+
+# HDF5 profiling support, only emitted when have_hdf5 is set.
+%if 0%{?have_hdf5}
+%files hdf5
+%{_bindir}/sh5util
+%{_libdir}/slurm/acct_gather_profile_hdf5.so
+%{_mandir}/man1/sh5util.1*
+%endif
+
+# Cray Shasta MPI plugin.
+%files cray
+%{_libdir}/slurm/mpi_cray_shasta.so
+
+# Test suite package, only emitted when slurm_testsuite is set. Files default
+# to the slurm user and group; system level files are reset to root explicitly.
+%if 0%{?slurm_testsuite}
+%files testsuite
+%defattr(-, %slurm_u, %slurm_g, -)
+%dir %attr(-, %slurm_u, %slurm_g) /srv/slurm-testsuite
+%attr(-, root, root) %{_datadir}/%{name}
+%if 0%{?sle_version} == 120200 || 0%{?suse_version} >= 1550
+%dir %attr(-, root, root) %{_pam_secconfdir}/limits.d
+%endif
+%doc testsuite/expect/README
+%doc %{basename: %{S:21}}
+%config %attr( -, root, root) %{_sysconfdir}/systemd/system/slurmd.service
+%config %attr(0440, root, root) %{_sysconfdir}/sudoers.d/slurm
+%config %attr( -, root, root) %{_pam_secconfdir}/limits.d/slurm.conf
+%{_libdir}/slurm/libslurm.so
+%attr(0600, %slurm_u, %slurm_g) /srv/slurm-testsuite/config/slurmdbd.conf
+/srv/slurm-testsuite/*
+%dir %attr(-, %slurm_u, %slurm_g) 
%_localstatedir/lib/slurm/shared +%attr( -, root, root) /root/setup-testsuite.sh +%endif + +%changelog diff --git a/slurmctld.xml b/slurmctld.xml new file mode 100644 index 0000000..5abed0f --- /dev/null +++ b/slurmctld.xml @@ -0,0 +1,7 @@ + + + slurmctld + slurmctld is the management daemon for SLURM cluster management. + + + diff --git a/slurmd.xml b/slurmd.xml new file mode 100644 index 0000000..ef91f3c --- /dev/null +++ b/slurmd.xml @@ -0,0 +1,7 @@ + + + slurmd + slurmd is the daemon which starts jobs for the SLURM cluster management. + + + diff --git a/slurmdbd.xml b/slurmdbd.xml new file mode 100644 index 0000000..c2932b3 --- /dev/null +++ b/slurmdbd.xml @@ -0,0 +1,7 @@ + + + slurmdbd + slurmdbd is the database daemon for the SLURM cluster management. + + + diff --git a/test_setup.tar.gz b/test_setup.tar.gz new file mode 100644 index 0000000..0d53ecd --- /dev/null +++ b/test_setup.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a45706911924b06a2ec7d436d4e991d84dc459a505cbdfca244ac5fad2b9b60 +size 3165 diff --git a/upgrades b/upgrades new file mode 100644 index 0000000..1aa1c6b --- /dev/null +++ b/upgrades @@ -0,0 +1,13 @@ +24.05.4 +24.05.3 +23.11.1 +23.02.7 +23.02.6 +23.02.5 +23.02.3 +23.02.0 +22.05.11 +22.05.10 +22.05.5 +22.05.2 +22.05.0