From cda5ce024e45351ebc8f442d3344f12d7b32fcdbf7428fad3f434f55044fe121 Mon Sep 17 00:00:00 2001 From: Christian Goll Date: Tue, 26 Mar 2024 08:40:44 +0000 Subject: [PATCH] Accepting request 1161499 from home:mslacken:branches:network:cluster - removed Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch as incorporated upstream * Changes in Slurm 23.02.5 * Add the JobId to debug() messages indicating when cpus_per_task/mem_per_cpu or pn_min_cpus are being automatically adjusted. * Fix regression in 23.02.2 that caused slurmctld -R to crash on startup if a node features plugin is configured. * Fix and prevent reoccurring reservations from overlapping. * job_container/tmpfs - Avoid attempts to share BasePath between nodes. * Change the log message warning for rate limited users from verbose to info. * With CR_Cpu_Memory, fix node selection for jobs that request gres and --mem-per-cpu. * Fix a regression from 22.05.7 in which some jobs were allocated too few nodes, thus overcommitting cpus to some tasks. * Fix a job being stuck in the completing state if the job ends while the primary controller is down or unresponsive and the backup controller has not yet taken over. * Fix slurmctld segfault when a node registers with a configured CpuSpecList while slurmctld configuration has the node without CpuSpecList. * Fix cloud nodes getting stuck in POWERED_DOWN+NO_RESPOND state after not registering by ResumeTimeout. * slurmstepd - Avoid cleanup of config.json-less containers spooldir getting skipped. * slurmstepd - Cleanup per task generated environment for containers in spooldir. * Fix scontrol segfault when 'completing' command requested repeatedly in interactive mode. * Properly handle a race condition between bind() and listen() calls in the network stack when running with SrunPortRange set. 
* Federation - Fix revoked jobs being returned regardless of the -a/--all OBS-URL: https://build.opensuse.org/request/show/1161499 OBS-URL: https://build.opensuse.org/package/show/network:cluster/slurm?expand=0&rev=292 --- ...when-running-test-cases-sequentially.patch | 27 --- slurm-23.11.3.tar.bz2 | 3 - slurm-23.11.5.tar.bz2 | 3 + slurm.changes | 180 ++++++++++++++++++ slurm.spec | 7 +- 5 files changed, 187 insertions(+), 33 deletions(-) delete mode 100644 Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch delete mode 100644 slurm-23.11.3.tar.bz2 create mode 100644 slurm-23.11.5.tar.bz2 diff --git a/Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch b/Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch deleted file mode 100644 index 42b9115..0000000 --- a/Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch +++ /dev/null @@ -1,27 +0,0 @@ -From: Egbert Eich -Date: Wed Jun 22 16:32:35 2022 +0200 -Subject: Keep logs of skipped test when running test cases sequentially. 
-Patch-mainline: Not yet -Git-repo: https://github.com/SchedMD/slurm -Git-commit: 457a53ca97b50530bb2fafda72d465507c434960 -References: - -Signed-off-by: Egbert Eich -Signed-off-by: Egbert Eich ---- - testsuite/expect/regression.py | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) -diff --git a/testsuite/expect/regression.py b/testsuite/expect/regression.py -index bcccaadbf5..b39af0c4e2 100755 ---- a/testsuite/expect/regression.py -+++ b/testsuite/expect/regression.py -@@ -199,7 +199,8 @@ def main(argv=None): - sys.stdout.write('SKIPPED\n') - if not options.keep_logs: - try: -- os.remove(testlog_name) -+# os.remove(testlog_name) -+ os.rename(testlog_name, testlog_name+'.skipped') - except IOError as e: - print('ERROR failed to close %s %s' % (testlog_name, e), - file=sys.stederr); diff --git a/slurm-23.11.3.tar.bz2 b/slurm-23.11.3.tar.bz2 deleted file mode 100644 index 3b963b1..0000000 --- a/slurm-23.11.3.tar.bz2 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5ad59832f3cf70832a14d08997867af6f0a4ab10340dc89d5a65a275373836ea -size 7359396 diff --git a/slurm-23.11.5.tar.bz2 b/slurm-23.11.5.tar.bz2 new file mode 100644 index 0000000..5cc1221 --- /dev/null +++ b/slurm-23.11.5.tar.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a8f4b1b46d3a8ec9a95066b04635c97f9095877f6189a8ff7388e5e74daeef3 +size 7365175 diff --git a/slurm.changes b/slurm.changes index 8e04a66..8a6c800 100644 --- a/slurm.changes +++ b/slurm.changes @@ -1,3 +1,183 @@ +------------------------------------------------------------------- +Mon Mar 25 15:16:44 UTC 2024 - Christian Goll + +- removed Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch + as incorporated upstream +- Changes in Slurm 23.02.5 + * Add the JobId to debug() messages indicating when cpus_per_task/mem_per_cpu + or pn_min_cpus are being automatically adjusted. 
+ * Fix regression in 23.02.2 that caused slurmctld -R to crash on startup if + a node features plugin is configured. + * Fix and prevent reoccurring reservations from overlapping. + * job_container/tmpfs - Avoid attempts to share BasePath between nodes. + * Change the log message warning for rate limited users from verbose to info. + * With CR_Cpu_Memory, fix node selection for jobs that request gres and + --mem-per-cpu. + * Fix a regression from 22.05.7 in which some jobs were allocated too few + nodes, thus overcommitting cpus to some tasks. + * Fix a job being stuck in the completing state if the job ends while the + primary controller is down or unresponsive and the backup controller has + not yet taken over. + * Fix slurmctld segfault when a node registers with a configured CpuSpecList + while slurmctld configuration has the node without CpuSpecList. + * Fix cloud nodes getting stuck in POWERED_DOWN+NO_RESPOND state after not + registering by ResumeTimeout. + * slurmstepd - Avoid cleanup of config.json-less containers spooldir getting + skipped. + * slurmstepd - Cleanup per task generated environment for containers in + spooldir. + * Fix scontrol segfault when 'completing' command requested repeatedly in + interactive mode. + * Properly handle a race condition between bind() and listen() calls in the + network stack when running with SrunPortRange set. + * Federation - Fix revoked jobs being returned regardless of the -a/--all + option for privileged users. + * Federation - Fix canceling pending federated jobs from non-origin clusters + which could leave federated jobs orphaned from the origin cluster. + * Fix sinfo segfault when printing multiple clusters with --noheader option. + * Federation - fix clusters not syncing if clusters are added to a federation + before they have registered with the dbd. + * Change pmi2 plugin to honor the SrunPortRange option. This matches the new + behavior of the pmix plugin in 23.02.0. 
Note that neither of these plugins + makes use of the "MpiParams=ports=" option, and previously were only limited + by the system's ephemeral port range. + * node_features/helpers - Fix node selection for jobs requesting changeable + features with the '|' operator, which could prevent jobs from running on + some valid nodes. + * node_features/helpers - Fix inconsistent handling of '&' and '|', where an + AND'd feature was sometimes AND'd to all sets of features instead of just + the current set. E.g. "foo|bar&baz" was interpreted as {foo,baz} or + {bar,baz} instead of how it is documented: "{foo} or {bar,baz}". + * Fix job accounting so that when a job is requeued its allocated node count + is cleared. After the requeue, sacct will correctly show that the job has + 0 AllocNodes while it is pending or if it is canceled before restarting. + * sacct - AllocCPUS now correctly shows 0 if a job has not yet received an + allocation or if the job was canceled before getting one. + * Fix intel oneapi autodetect: detect the /dev/dri/renderD[0-9]+ gpus, and do + not detect /dev/dri/card[0-9]+. + * Format batch, extern, interactive, and pending step ids into strings that + are human readable. + * Fix node selection for jobs that request --gpus and a number of tasks fewer + than gpus, which resulted in incorrectly rejecting these jobs. + * Remove MYSQL_OPT_RECONNECT completely. + * Fix cloud nodes in POWERING_UP state disappearing (getting set to FUTURE) + when an `scontrol reconfigure` happens. + * openapi/dbv0.0.39 - Avoid assert / segfault on missing coordinators list. + * slurmrestd - Correct memory leak while parsing OpenAPI specification + templates with server overrides. + * slurmrestd - Reduce memory usage when printing out job CPU frequency. + * Fix overwriting user node reason with system message. + * Remove --uid / --gid options from salloc and srun commands. + * Prevent deadlock when rpc_queue is enabled. 
+ * slurmrestd - Correct OpenAPI specification generation bug where fields with + overlapping parent paths would not get generated. + * Fix memory leak as a result of a partition info query. + * Fix memory leak as a result of a job info query. + * slurmrestd - For 'GET /slurm/v0.0.39/node[s]', change format of node's + energy field "current_watts" to a dictionary to account for unset value + instead of dumping 4294967294. + * slurmrestd - For 'GET /slurm/v0.0.39/qos', change format of QOS's + field "priority" to a dictionary to account for unset value instead of + dumping 4294967294. + * slurmrestd - For 'GET /slurm/v0.0.39/job[s]', the 'return code' code field + in v0.0.39_job_exit_code will be set to -127 instead of being left unset + where job does not have a relevant return code. + * data_parser/v0.0.39 - Add required/memory_per_cpu and + required/memory_per_node to `sacct --json` and `sacct --yaml` and + 'GET /slurmdb/v0.0.39/jobs' from slurmrestd. + * For step allocations, fix --gres=none sometimes not ignoring gres from the + job. + * Fix --exclusive jobs incorrectly gang-scheduling where they shouldn't. + * Fix allocations with CR_SOCKET, gres not assigned to a specific socket, and + block core distribution potentially allocating more sockets than required. + * gpu/oneapi - Store cores correctly so CPU affinity is tracked. + * Revert a change in 23.02.3 where Slurm would kill a script's process group + as soon as the script ended instead of waiting as long as any process in + that process group held the stdout/stderr file descriptors open. That change + broke some scripts that relied on the previous behavior. Setting time limits + for scripts (such as PrologEpilogTimeout) is strongly encouraged to avoid + Slurm waiting indefinitely for scripts to finish. + * Allow slurmdbd -R to work if the root assoc id is not 1. + * Fix slurmdbd -R not returning an error under certain conditions. + * slurmdbd - Avoid potential NULL pointer dereference in the mysql plugin. 
+ * Revert a change in 23.02 where SLURM_NTASKS was no longer set in the job's + environment when --ntasks-per-node was requested. + * Limit periodic node registrations to 50 instead of the full TreeWidth. + Since unresolvable cloud/dynamic nodes must disable fanout by setting + TreeWidth to a large number, this would cause all nodes to register at + once. + * Fix regression in 23.02.3 which broke x11 forwarding for hosts when + MUNGE sends a localhost address in the encode host field. This is caused + when the node hostname is mapped to 127.0.0.1 (or similar) in /etc/hosts. + * openapi/[db]v0.0.39 - fix memory leak on parsing error. + * data_parser/v0.0.39 - fix updating qos for associations. + * openapi/dbv0.0.39 - fix updating values for associations with null users. + * Fix minor memory leak with --tres-per-task and licenses. + * Fix cyclic socket cpu distribution for tasks in a step where + --cpus-per-task < usable threads per core. +- Changes in Slurm 23.02.4 + * Fix sbatch return code when --wait is requested on a job array. + * switch/hpe_slingshot - avoid segfault when running with old libcxi. + * Avoid slurmctld segfault when specifying AccountingStorageExternalHost. + * Fix collected GPUUtilization values for acct_gather_profile plugins. + * Fix slurmrestd handling of job hold/release operations. + * Make spank S_JOB_ARGV item value hold the requested command argv instead of + the srun --bcast value when --bcast requested (only in local context). + * Fix step running indefinitely when slurmctld takes more than MessageTimeout + to respond. Now, slurmctld will cancel the step when detected, preventing + following steps from getting stuck waiting for resources to be released. + * Fix regression to make job_desc.min_cpus accurate again in job_submit when + requesting a job with --ntasks-per-node. + * scontrol - Permit changes to StdErr and StdIn for pending jobs. + * scontrol - Reset std{err,in,out} when set to empty string. 
+ * slurmrestd - mark environment as a required field for job submission + descriptions. + * slurmrestd - avoid dumping null in OpenAPI schema required fields. + * data_parser/v0.0.39 - avoid rejecting valid memory_per_node formatted as + dictionary provided with a job description. + * data_parser/v0.0.39 - avoid rejecting valid memory_per_cpu formatted as + dictionary provided with a job description. + * slurmrestd - Return HTTP error code 404 when job query fails. + * slurmrestd - Add return schema to error response to job and license query. + * Fix handling of ArrayTaskThrottle in backfill. + * Fix regression in 23.02.2 when checking gres state on slurmctld startup or + reconfigure. Gres changes in the configuration were not updated on slurmctld + startup. On startup or reconfigure, these messages were present in the log: + "error: Attempt to change gres/gpu Count". + * Fix potential double count of gres when dealing with limits. + * switch/hpe_slingshot - support alternate traffic class names with "TC_" + prefix. + * scrontab - Fix cutting off the final character of quoted variables. + * Fix slurmstepd segfault when ContainerPath is not set in oci.conf + * Change the log message warning for rate limited users from debug to verbose. + * Fixed an issue where jobs requesting licenses were incorrectly rejected. + * smail - Fix issues where e-mails at job completion were not being sent. + * scontrol/slurmctld - fix comma parsing when updating a reservation's nodes. + * cgroup/v2 - Avoid capturing log output for ebpf when constraining devices, + as this can lead to inadvertent failure if the log buffer is too small. + * Fix --gpu-bind=single binding tasks to wrong gpus, leading to some gpus + having more tasks than they should and other gpus being unused. + * Fix main scheduler loop not starting after failover to backup controller. + * Added error message when attempting to use sattach on batch or extern steps. 
+ * Fix regression in 23.02 that causes slurmstepd to crash when srun requests + more than TreeWidth nodes in a step and uses the pmi2 or pmix plugin. + * Reject job ArrayTaskThrottle update requests from unprivileged users. + * data_parser/v0.0.39 - populate description fields of property objects in + generated OpenAPI specifications where defined. + * slurmstepd - Avoid segfault caused by ContainerPath not being terminated by + '/' in oci.conf. + * data_parser/v0.0.39 - Change v0.0.39_job_info response to tag exit_code + field as being complex instead of only an unsigned integer. + * job_container/tmpfs - Fix %h and %n substitution in BasePath where %h was + substituted as the NodeName instead of the hostname, and %n was substituted + as an empty string. + * Fix regression where --cpu-bind=verbose would override TaskPluginParam. + * scancel - Fix --clusters/-M for federations. Only filtered jobs (e.g. -A, + -u, -p, etc.) from the specified clusters will be canceled, rather than all + jobs in the federation. Specific jobids will still be routed to the origin + cluster for cancellation. + + ------------------------------------------------------------------- Mon Jan 29 13:47:55 UTC 2024 - Egbert Eich diff --git a/slurm.spec b/slurm.spec index e43b731..4f1a131 100644 --- a/slurm.spec +++ b/slurm.spec @@ -19,7 +19,7 @@ # Check file META in sources: update so_version to (API_CURRENT - API_AGE) %define so_version 40 # Make sure to update `upgrades` as well! 
-%define ver 23.11.3 +%define ver 23.11.5 %define _ver _23_11 %define dl_ver %{ver} # so-version is 0 and seems to be stable @@ -171,7 +171,7 @@ Source21: README_Testsuite.md Patch0: Remove-rpath-from-build.patch Patch2: pam_slurm-Initialize-arrays-and-pass-sizes.patch Patch10: Fix-test-21.41.patch -Patch14: Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch +#Patch14: Keep-logs-of-skipped-test-when-running-test-cases-sequentially.patch Patch15: Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch %{upgrade_dep %pname} @@ -1112,7 +1112,8 @@ rm -rf /srv/slurm-testsuite/src /srv/slurm-testsuite/testsuite \ %{_mandir}/man1/sjobexitmod.1.* %{_mandir}/man1/sjstat.1.* %{_mandir}/man8/slurmctld.* -%{_mandir}/man8/spank* +%{_mandir}/man8/spank.* +%{_mandir}/man8/sackd.* %files openlava %{_bindir}/bjobs