From fb460ebe6a2d1239d688cb1b33db59b891b458d3d564a14a8dfc2514df25cbda Mon Sep 17 00:00:00 2001 From: Egbert Eich Date: Mon, 26 Feb 2024 21:40:59 +0000 Subject: [PATCH] Accepting request 1150524 from home:eeich:branches:network:cluster - Update to version 23.11.03 * slurmrestd - Reject single http query with multiple path requests. * Fix launching Singularity v4.x containers with `srun --container` by setting .process.terminal to true in generated `config.json` when step has pseudoterminal (`--pty`) requested. * Fix loading in `dynamic/cloud` node jobs after `net_cred` expired. * Fix cgroup null path error on `slurmd/slurmstepd` tear down. * `data_parser/v0.0.40` - Prevent failure if accounting is disabled, instead issue a warning if needed data from the database can not be retrieved. * `openapi/slurmctld` - Prevent failure if accounting is disabled. * Prevent `slurmscriptd` processing delays from blocking other threads in `slurmctld` while trying to launch various scripts. This is additional work for a fix in 23.02.6. * Fix memory leak when receiving alias addrs from controller. * `scontrol` - Accept `scontrol token lifespan=infinite` to create tokens that effectively do not expire. * Avoid errors when Slurmdb accounting disabled when `--json` or `--yaml` is invoked with CLI commands and `slurmrestd`. Add warnings when query would have populated data from Slurmdb instead of errors. * Fix `slurmctld` memory leak when running job with `--tres-per-task=gres:shard:#` * Fix backfill trying to start jobs outside of backfill window. * Fix oversubscription on partitions with `PreemptMode=OFF`. * Preserve node reason on power up if the node is downed or drained. 
OBS-URL: https://build.opensuse.org/request/show/1150524 OBS-URL: https://build.opensuse.org/package/show/network:cluster/slurm?expand=0&rev=289 --- slurm-23.11.1.tar.bz2 | 3 -- slurm-23.11.3.tar.bz2 | 3 ++ slurm.changes | 120 ++++++++++++++++++++++++++++++++++++++++++ slurm.spec | 13 +++-- upgrades | 2 + 5 files changed, 131 insertions(+), 10 deletions(-) delete mode 100644 slurm-23.11.1.tar.bz2 create mode 100644 slurm-23.11.3.tar.bz2 diff --git a/slurm-23.11.1.tar.bz2 b/slurm-23.11.1.tar.bz2 deleted file mode 100644 index 60fe4c1..0000000 --- a/slurm-23.11.1.tar.bz2 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f3f4ad4c92596c405d465f5a991bc50d85508b8b127fb2cc008a0980b7bdbd8 -size 7536436 diff --git a/slurm-23.11.3.tar.bz2 b/slurm-23.11.3.tar.bz2 new file mode 100644 index 0000000..3b963b1 --- /dev/null +++ b/slurm-23.11.3.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ad59832f3cf70832a14d08997867af6f0a4ab10340dc89d5a65a275373836ea +size 7359396 diff --git a/slurm.changes b/slurm.changes index 3d89146..cf38899 100644 --- a/slurm.changes +++ b/slurm.changes @@ -1,3 +1,123 @@ +------------------------------------------------------------------- +Mon Jan 29 13:47:55 UTC 2024 - Egbert Eich + +- Update to version 23.11.03 + * slurmrestd - Reject single http query with multiple path + requests. + * Fix launching Singularity v4.x containers with + `srun --container` by setting .process.terminal to true in + generated `config.json` when step has pseudoterminal (`--pty`) + requested. + * Fix loading in `dynamic/cloud` node jobs after `net_cred` + expired. + * Fix cgroup null path error on `slurmd/slurmstepd` tear down. + * `data_parser/v0.0.40` - Prevent failure if accounting is + disabled, instead issue a warning if needed data from the + database can not be retrieved. + * `openapi/slurmctld` - Prevent failure if accounting is disabled. 
+ * Prevent `slurmscriptd` processing delays from blocking other + threads in `slurmctld` while trying to launch various scripts. + This is additional work for a fix in 23.02.6. + * Fix memory leak when receiving alias addrs from controller. + * `scontrol` - Accept `scontrol token lifespan=infinite` to + create tokens that effectively do not expire. + * Avoid errors when Slurmdb accounting disabled when `--json` or + `--yaml` is invoked with CLI commands and `slurmrestd`. Add + warnings when query would have populated data from Slurmdb + instead of errors. + * Fix `slurmctld` memory leak when running job with + `--tres-per-task=gres:shard:#` + * Fix backfill trying to start jobs outside of backfill window. + * Fix oversubscription on partitions with `PreemptMode=OFF`. + * Preserve node reason on power up if the node is downed + or drained. + * `data_parser/v0.0.40` - Avoid aborting when invoking a not + implemented parser. + * `data_parser/v0.0.40` - Fix how nice values are parsed for job + submissions. + * `data_parser/v0.0.40` - Fix regression where parsing error did + not result in invalid request being rejected. + * Fix segfault in front-end node registration. + * Prevent jobs using none typed gpus from being killed by the + controller after a reconfig or restart. + * Fix deadlock situation in the dbd when adding associations. + * Update default values of text/blob columns when updating from + old mysql versions in more situations. This improves a + previous fix to handle an uncommon case when upgrading + mysql/mariadb. + * Fix rpmbuild in openSUSE/SLES due to incorrect mariadb + dependency. + * When upgrading the slurmdbd to 23.11, avoid generating a query + to update the association table that is larger than + `max_allowed_packet` which would result in an upgrade failure. + * Fix rare deadlock when a dynamic node registers at the same + time that a once per minute background task occurs. 
+ * `data_parser/v0.0.40` - Fix enumerated strings in OpenAPI + specification not having type field specified. + * Improve `scontrol show job -d` information of used shared + gres (`shard/mps`) topology. + * accounting_storage/mysql - Fix usage query to use new lineage + column instead of lft/rgt. + * `slurmrestd` - Improve handling of missing parsers when + content plugins expect parsers not loaded. + * `slurmrestd` - Correct parsing of StepIds when querying jobs. + * `slurmrestd` - Improve error from parsing failures of lists. + * `slurmrestd` - Improve parsing of singular values for lists. + * `accounting_storage/mysql` - Fix `PrivateData=User` when + listing associations. + * Disable sorting of dynamic nodes to avoid issues when + restarting with heterogeneous jobs that cause jobs to abort on + restart. + * Don't allow deletion of non-dynamic nodes. + * `accounting_storage/mysql` - Fix issue adding partition based + associations. + * Respect non-"slurm" settings for `I_MPI_HYDRA_BOOTSTRAP` and + `HYDRA_BOOTSTRAP` and avoid injecting the `--external-launcher` + option which will cause `mpirun/mpiexec` to fail with an + unexpected argument error. + * Fix bug where scontrol hold would change node count for jobs + with implicitly defined node counts. + * `data_parser/v0.0.40` - Fix regression of support for "hold" + in job description. + * Avoid sending KILL RPCs to unresolvable `POWERING_UP` and + `POWERED_DOWN` nodes. + * `data_parser/v0.0.38` - Fix several potential NULL + dereferences that could cause slurmrestd to crash. + * Add `--gres-flags=one-task-per-sharing`. Do not allow different + tasks to be allocated shared gres from the same sharing gres. + * Add `SelectTypeParameters=ENFORCE_BINDING_GRES` and + `ONE_TASK_PER_SHARING_GRES`. + This gives default behavior for a job's `--gres-flags`. + * Alter the networking code to try connecting to the backup + controllers if the DNS lookup for the primary `SlurmctldHost` + fails. 
+ * Alter the name resolution to only log at `verbose()` in client + commands on failures. This allows for HA setups where the DNS + entries are withdrawn for some `SlurmctldHost` entries without + flooding the user with errors. + * Prevent `slurmscriptd` PID leaks when running `slurmctld` in + foreground mode. + * Open all `slurmctld` listening ports at startup, and persist + throughout. + This also changes the backup `slurmctld` process to open the + `SlurmctldPort` range, instead of only the first. + * Fix backup `slurmctld` shutting down instead of resuming + standby duty if it took control. + * Fix race condition that delayed the primary `slurmctld` + resuming when taking control from a backup controller. + * `srun` - Ensure processed messages are meant for this job in + case of a rapidly-reused TCP port. + * `srun` - Prevent step launch failure while waiting for step + allocation if a stray message is received. + * Fix backup `slurmctld` to be able to respond to configless + config file requests correctly. + * Fix `slurmctld` crashing when recovering from a failed + reconfigure. + * Fix `slurmscriptd` operation after recovering from a failed + reconfigure. +- Make sure `-std=gnu99` is added to CFLAGS on SLE-12. +- Use %%autopatch. + ------------------------------------------------------------------- Fri Jan 12 11:08:01 UTC 2024 - Christian Goll diff --git a/slurm.spec b/slurm.spec index 0304d54..e43b731 100644 --- a/slurm.spec +++ b/slurm.spec @@ -19,7 +19,7 @@ # Check file META in sources: update so_version to (API_CURRENT - API_AGE) %define so_version 40 # Make sure to update `upgrades` as well! 
-%define ver 23.11.1 +%define ver 23.11.3 %define _ver _23_11 %define dl_ver %{ver} # so-version is 0 and seems to be stable @@ -120,7 +120,7 @@ Conflicts: %{*} } %endif %if 0%{?suse_version} >= 1500 -%undefine have_hdf5 +%define have_hdf5 1 %define have_boolean_deps 1 %define have_lz4 1 %define have_firewalld 1 @@ -623,11 +623,7 @@ Do not run test suite and file bug reports for each failed test! %prep %setup -q -n %{pname}-%{dl_ver} -%patch0 -p1 -%patch2 -p1 -%patch10 -p1 -%patch14 -p1 -%patch15 -p1 +%autopatch -p1 %if 0%{?python_ver} < 3 # Workaround for wrongly flagged python3 to keep SLE-11-SP4 building @@ -648,6 +644,9 @@ export SUSE_ZNOW=0 autoreconf [ -e $(pwd)/mybin ] && PATH=$(pwd)/mybin:$PATH +%if 0%{?suse_version} < 1500 +export CFLAGS="-std=gnu99 %optflags" +%endif %configure --enable-shared \ --disable-static \ --without-rpath \ diff --git a/upgrades b/upgrades index b03b0e1..135fbf5 100644 --- a/upgrades +++ b/upgrades @@ -1,3 +1,5 @@ +23.11.1 +23.02.7 23.02.6 23.02.5 23.02.3