Accepting request 1238577 from network:cluster

* `slurmrestd` - Remove deprecated fields from the following endpoints:
     `.result` from `POST /slurm/v0.0.42/job/submit`.  
     `.job_id`, `.step_id`, `.job_submit_user_msg` from `POST /slurm/v0.0.42/job/{job_id}`.  
     `.job.exclusive`, `.jobs[].exclusive` to `POST /slurm/v0.0.42/job/submit`.  
     `.jobs[].exclusive` from `GET /slurm/v0.0.42/job/{job_id}`.  
     `.jobs[].exclusive` from `GET /slurm/v0.0.42/jobs`.  
     `.job.oversubscribe`, `.jobs[].oversubscribe` to `POST /slurm/v0.0.42/job/submit`.  
     `.jobs[].oversubscribe` from `GET /slurm/v0.0.42/job/{job_id}`.  
     `.jobs[].oversubscribe` from `GET /slurm/v0.0.42/jobs`.  
     `DELETE /slurm/v0.0.40/jobs`  
     `DELETE /slurm/v0.0.41/jobs`  
     `DELETE /slurm/v0.0.42/jobs`  
    allocation is granted.
    `job|socket|task` or `cpus|mem` per GRES.
    node update whereas previously only single nodes could be
    updated through `/node/<nodename>` endpoint:
    `POST /slurm/v0.0.42/nodes`
    partition as this is a cluster-wide option.
    `REQUEST_NODE_INFO RPC`.
    the db server is not reachable.
    (`.jobs[].priority_by_partition`) to JSON and YAML output.
    connection` error if the error was the result of an
    authentication failure.
    errors with the `SLURM_PROTOCOL_AUTHENTICATION_ERROR` error
    code.
    of `Unspecified error` if querying the following endpoints
    fails:  
    `GET /slurm/v0.0.40/diag/`  
    `GET /slurm/v0.0.41/diag/`  
    `GET /slurm/v0.0.42/diag/` (forwarded request 1238576 from eeich)

OBS-URL: https://build.opensuse.org/request/show/1238577
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/slurm?expand=0&rev=111
This commit is contained in:
Dominique Leuenberger 2025-01-18 12:18:25 +00:00 committed by Git OBS Bridge
commit 8a2be70840
4 changed files with 575 additions and 187 deletions

369
regression.py.sle12 Normal file
View File

@ -0,0 +1,369 @@
#!/usr/bin/env python3
############################################################################
# Copyright (C) 2006 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Christopher J. Morrone <morrone2@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the supplied file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
"""This script makes it easier to run the Slurm expect test scripts."""
from __future__ import print_function
import json
import os
import re
import sys
import time
import signal
from optparse import OptionParser
from optparse import OptionValueError
from subprocess import Popen
def main(argv=None):
    """Run the Slurm expect regression test scripts in the current directory.

    Discovers files named ``test<major>.<minor>``, filters them with the
    --include/--exclude/--begin-from-test options, runs each one under
    ``expect``, and reports pass/skip/fail totals.

    Returns None when the run completes (even with failures recorded in
    the summary), -1 on setup errors (missing "globals" file or no test
    scripts found), and 1 when at least one test failed.
    """
    # "tests" is a list containing tuples of length 3 of the form
    # (test major number, test minor number, test filename)
    tests = []
    failed_tests = []
    passed_tests = []
    skipped_tests = []
    begin = (1, 1)
    abort = False

    # Handle command line parameters
    if argv is None:
        argv = sys.argv
    parser = OptionParser()
    parser.add_option(
        "-t",
        "--time-individual",
        action="store_true",
        dest="time_individual",
        default=False,
    )
    parser.add_option(
        "-e",
        "--exclude",
        type="string",
        dest="exclude_tests",
        action="callback",
        callback=test_parser,
        help="comma or space separated string of tests to skip",
    )
    parser.add_option(
        "-i",
        "--include",
        type="string",
        dest="include_tests",
        action="callback",
        callback=test_parser,
        help="comma or space separated string of tests to include",
    )
    parser.add_option("-k", "--keep-logs", action="store_true", default=False)
    parser.add_option("-s", "--stop-on-first-fail", action="store_true", default=False)
    parser.add_option(
        "-b",
        "--begin-from-test",
        type="string",
        dest="begin_from_test",
        action="callback",
        callback=test_parser,
    )
    parser.add_option(
        "-f",
        "--results-file",
        type="string",
        help="write json result to specified file name",
    )
    (options, args) = parser.parse_args(args=argv)

    # Sanity check
    if not os.path.isfile("globals"):
        print('ERROR: "globals" not here as needed', file=sys.stderr)
        return -1

    # Clear any environment variables that could break the tests.
    # Cray sets some squeue format options that break tests.
    # BUG FIX: use pop() with a default instead of del so that an unset
    # variable does not raise KeyError and abort the whole run.
    for squeue_var in ("SQUEUE_ALL", "SQUEUE_SORT", "SQUEUE_FORMAT", "SQUEUE_FORMAT2"):
        os.environ.pop(squeue_var, None)

    # Read the current working directory and build a sorted list
    # of the available tests.
    test_re = re.compile(r"test(\d+)\.(\d+)$")
    for filename in os.listdir("."):
        match = test_re.match(filename)
        if match:
            major = int(match.group(1))
            minor = int(match.group(2))
            if not test_in_list(major, minor, options.exclude_tests) and (
                not options.include_tests
                or test_in_list(major, minor, options.include_tests)
            ):
                tests.append((major, minor, filename))
    if not tests:
        print(
            "ERROR: no test files found in current working directory", file=sys.stderr
        )
        return -1
    # Sort by (major, minor).
    tests.sort(key=lambda t: (t[0], t[1]))

    # Set begin value
    if options.begin_from_test is not None:
        # NOTE(review): a wildcard component ('*') in --begin-from-test
        # would make the numeric comparisons below fail on Python 3;
        # this assumes a plain numeric major.minor is given.
        begin = options.begin_from_test[0]

    # Now run the tests
    start_time = time.time()
    test_env = os.environ.copy()
    if options.stop_on_first_fail:
        test_env["SLURM_TESTSUITE_CLEANUP_ON_FAILURE"] = "false"
    else:
        test_env["SLURM_TESTSUITE_CLEANUP_ON_FAILURE"] = "true"
    print("Started:", time.asctime(time.localtime(start_time)), file=sys.stdout)
    sys.stdout.flush()
    results_list = []
    for test in tests:
        # Skip everything that sorts before the requested begin point.
        if begin[0] > test[0] or (begin[0] == test[0] and begin[1] > test[1]):
            continue
        test_id = "{0}.{1}".format(test[0], test[1])
        sys.stdout.write("Running test %s " % test_id)
        sys.stdout.flush()
        test_dict = {}
        test_dict["id"] = test_id
        # BUG FIX: the name previously was the literal string
        # "test{test_id}.log" (formatting was never applied), so every
        # test overwrote one shared log file.  Format the id in.
        testlog_name = "test{0}.log".format(test_id)
        try:
            # Remove a stale .failed log from a previous run, if any.
            os.remove(testlog_name + ".failed")
        except OSError:
            pass
        testlog = open(testlog_name, "w+")

        if options.time_individual:
            t1 = time.time()
            test_dict["start_time"] = float("%.03f" % t1)

        # BUG FIX: initialize child so a KeyboardInterrupt arriving
        # before Popen() completes does not raise NameError below.
        child = None
        try:
            child = Popen(
                ("expect", test[2]),
                shell=False,
                env=test_env,
                stdout=testlog,
                stderr=testlog,
            )
            retcode = child.wait()
        except KeyboardInterrupt:
            # Forward the interrupt to the child (if it started) and
            # stop the regression run after recording this test.
            if child is not None:
                child.send_signal(signal.SIGINT)
                retcode = child.wait()
            else:
                retcode = 1
            abort = True
        if options.time_individual:
            t2 = time.time()
            minutes = int(int(t2 - t1) / 60)
            seconds = (int(t2 - t1)) % 60
            if minutes > 0:
                sys.stdout.write("%d min " % (minutes))
            sys.stdout.write("%.2f sec " % (seconds))
            test_dict["duration"] = float("%.03f" % (t2 - t1))

        # Exit status mapping: 0 = pass, > 127 = skip, otherwise fail.
        if retcode == 0:
            status = "pass"
        elif retcode > 127:
            status = "skip"
        else:
            status = "fail"
        test_dict["status"] = status

        # Determine the reason if requesting a json results file
        if status != "pass" and options.results_file:
            testlog.flush()
            testlog.seek(0)
            test_output = testlog.read()
            # The expect logs delimit header/body/footer with a line of
            # 78 '=' characters; the body is the third split section.
            # Guard the index so a malformed log cannot abort the run.
            sections = test_output.split("=" * 78 + "\n")
            body = sections[2] if len(sections) > 2 else test_output
            fatals = re.findall(
                r"(?ms)\[[^\]]+\][ \[]+Fatal[ \]:]+(.*?) \(fail[^\)]+\)$", body
            )
            errors = re.findall(
                r"(?ms)\[[^\]]+\][ \[]+Error[ \]:]+(.*?) \(subfail[^\)]+\)$", body
            )
            warnings = re.findall(
                r"(?ms)\[[^\]]+\][ \[]+Warning[ \]:]+((?:(?!Warning).)*) \((?:sub)?skip[^\)]+\)$",
                body,
            )
            if fatals:
                test_dict["reason"] = fatals[0]
            elif errors:
                test_dict["reason"] = errors[0]
            elif warnings:
                test_dict["reason"] = warnings[0]
        results_list.append(test_dict)
        testlog.close()

        if status == "pass":
            passed_tests.append(test)
            sys.stdout.write("\n")
            if not options.keep_logs:
                try:
                    os.remove(testlog_name)
                except IOError as e:
                    # BUG FIX: was sys.stederr (AttributeError) and the
                    # message claimed "close" for a remove failure.
                    print(
                        "ERROR failed to remove %s %s" % (testlog_name, e),
                        file=sys.stderr,
                    )
        elif status == "skip":
            skipped_tests.append(test)
            sys.stdout.write("SKIPPED\n")
            if not options.keep_logs:
                try:
                    os.remove(testlog_name)
                except IOError as e:
                    # BUG FIX: was sys.stederr (AttributeError) and the
                    # message claimed "close" for a remove failure.
                    print(
                        "ERROR failed to remove %s %s" % (testlog_name, e),
                        file=sys.stderr,
                    )
        else:
            failed_tests.append(test)
            # Keep the log of a failed test under a .failed suffix.
            os.rename(testlog_name, testlog_name + ".failed")
            sys.stdout.write("FAILED!\n")
            if options.stop_on_first_fail:
                break
        sys.stdout.flush()
        if abort:
            sys.stdout.write("\nRegression interrupted!\n")
            break

    end_time = time.time()
    print("Ended:", time.asctime(time.localtime(end_time)), file=sys.stdout)
    print(
        "\nTestsuite ran for %d minutes %d seconds"
        % ((end_time - start_time) / 60, (end_time - start_time) % 60),
        file=sys.stdout,
    )

    if options.results_file:
        with open(options.results_file, "w") as results_file:
            json.dump(results_list, results_file)

    print("Completions :", len(passed_tests), file=sys.stdout)
    print("Failures :", len(failed_tests), file=sys.stdout)
    print("Skipped :", len(skipped_tests), file=sys.stdout)
    if len(failed_tests) > 0:
        print("Failed tests : ", file=sys.stdout)
        first = True
        for test in failed_tests:
            if first:
                first = False
            else:
                sys.stdout.write(",")
            sys.stdout.write("%d.%d" % (test[0], test[1]))
        sys.stdout.write("\n")
        sys.stdout.flush()
    if abort:
        print("INCOMPLETE", file=sys.stdout)
    if len(failed_tests) > 0:
        return 1
def test_in_list(major, minor, test_list):
"""Test for whether a test numbered major.minor is in test_list.
"major" and "minor" must be integers. "test_list" is a list of
tuples, each tuple representing one test. The tuples are of the
form:
(major, minor, filename)
Returns True if the test is in the list, and False otherwise.
"""
if not test_list:
return False
for test in test_list:
if (test[0] == "*" or test[0] == major) and (
test[1] == "*" or test[1] == minor
):
return True
return False
def test_parser(option, opt_str, value, parser):
"""Option callback function for the optparse.OptionParser class.
Will take a string representing one or more test names and append
a tuple representing the test into a list in the options's destination
variable.
A string representing test names must patch the regular expression
named "test_re" below. Some examples of exceptable options are:
'1.5'
'test9.8'
'2.6 test3.1 14.2'
'3.4,6.7,8.3'
'1.*'
'*.2'
'1.*,3.8,9.2'
Raises OptionValueError on error.
"""
# Initialize the option's destination array, if is does not already exist.
if not hasattr(parser.values, option.dest):
setattr(parser.values, option.dest, [])
if getattr(parser.values, option.dest) is None:
setattr(parser.values, option.dest, [])
# Get a pointer to the option's destination array.
l = getattr(parser.values, option.dest)
# Split the user's option string into a series of tuples that represent
# each test, and add each tuple to the destination array.
splitter = re.compile(r"[,\s]+")
val = splitter.split(value)
test_re = re.compile(r"(test)?((\d+)|\*)\.((\d+)|\*)$")
for v in val:
m = test_re.match(v)
if not m:
raise OptionValueError
major = m.group(2)
if major != "*":
major = int(major)
minor = m.group(4)
if minor != "*":
minor = int(minor)
l.append((major, minor))
# Script entry point: run the regression driver and propagate its return
# value as the process exit status (None exits 0; -1/1 signal errors).
if __name__ == "__main__":
    sys.exit(main())

View File

@ -1,3 +1,8 @@
-------------------------------------------------------------------
Fri Jan 17 14:19:10 UTC 2025 - Egbert Eich <eich@suse.com>
- Make test suite package work on SLE-12.
------------------------------------------------------------------- -------------------------------------------------------------------
Thu Jan 9 08:35:38 UTC 2025 - Egbert Eich <eich@suse.com> Thu Jan 9 08:35:38 UTC 2025 - Egbert Eich <eich@suse.com>
@ -75,16 +80,16 @@ Mon Jan 6 12:40:31 UTC 2025 - Egbert Eich <eich@suse.com>
* Increase efficency of sending logs to syslog. * Increase efficency of sending logs to syslog.
* Switch to new official YAML mime type `application/yaml` in * Switch to new official YAML mime type `application/yaml` in
compliance with RFC9512 as primary mime type for YAML formatting. compliance with RFC9512 as primary mime type for YAML formatting.
* `slurmrestd` - Removed deprecated fields from the following * `slurmrestd` - Remove deprecated fields from the following
endpoints: endpoints:
`.result' from `POST /slurm/v0.0.42/job/submit`. `.result` from `POST /slurm/v0.0.42/job/submit`.
`.job_id`, `.step_id`, `.job_submit_user_msg` from `POST /slurm/v0.0.42/job/{job_id}`. `.job_id`, `.step_id`, `.job_submit_user_msg` from `POST /slurm/v0.0.42/job/{job_id}`.
`.job.exclusive`, `.jobs[].exclusive` to `POST /slurm/v0.0.42/job/submit`. `.job.exclusive`, `.jobs[].exclusive` to `POST /slurm/v0.0.42/job/submit`.
`.jobs[].exclusive` from `GET /slurm/v0.0.42/job/{job_id}`. `.jobs[].exclusive` from `GET /slurm/v0.0.42/job/{job_id}`.
`.jobs[].exclusive` from `GET /slurm/v0.0.42/jobs`. `.jobs[].exclusive` from `GET /slurm/v0.0.42/jobs`.
`.job.oversubscribe`, `.jobs[].oversubscribe` to `POST /slurm/v0.0.42/job/submit`. `.job.oversubscribe`, `.jobs[].oversubscribe` to `POST /slurm/v0.0.42/job/submit`.
`.jobs[].oversubscribe` from `GET /slurm/v0.0.42/job/{job_id}`. `.jobs[].oversubscribe` from `GET /slurm/v0.0.42/job/{job_id}`.
`.jobs[].oversubscribe` from `GET /slurm/v0.0.42/jobs`. `.jobs[].oversubscribe` from `GET /slurm/v0.0.42/jobs`.
* `scontrol` - Removed deprecated fields `.jobs[].exclusive` and * `scontrol` - Removed deprecated fields `.jobs[].exclusive` and
`.jobs[].oversubscribe` from `scontrol show jobs --{json|yaml}`. `.jobs[].oversubscribe` from `scontrol show jobs --{json|yaml}`.
* `squeue` - Removed deprecated fields `.jobs[].exclusive` and * `squeue` - Removed deprecated fields `.jobs[].exclusive` and
@ -100,297 +105,297 @@ Mon Jan 6 12:40:31 UTC 2025 - Egbert Eich <eich@suse.com>
to the drivers. to the drivers.
* Limit `SwitchName` to `HOST_NAME_MAX` chars length. * Limit `SwitchName` to `HOST_NAME_MAX` chars length.
* For `scancel --ctld` and the following rest api endpoints: * For `scancel --ctld` and the following rest api endpoints:
`DELETE /slurm/v0.0.40/jobs` `DELETE /slurm/v0.0.40/jobs`
`DELETE /slurm/v0.0.41/jobs` `DELETE /slurm/v0.0.41/jobs`
`DELETE /slurm/v0.0.42/jobs` `DELETE /slurm/v0.0.42/jobs`
Support array expressions in the responses to the client. Support array expressions in the responses to the client.
* `salloc` - Always output node names to the user when an * `salloc` - Always output node names to the user when an
allocation is granted. allocation is granted.
* `slurmrestd` - Removed all v0.0.39 endpoints. * `slurmrestd` - Removed all v0.0.39 endpoints.
* `select/linear` - Reject jobs asking for GRES per * `select/linear` - Reject jobs asking for GRES per
`job|socket|task` or `cpus|mem` per GRES. `job|socket|task` or `cpus|mem` per GRES.
* Add `/nodes` POST endpoint to REST API, supports multiple * Add `/nodes` POST endpoint to REST API, supports multiple
node update whereas previously only single nodes could be node update whereas previously only single nodes could be
updated through `/node/<nodename>` endpoint: updated through `/node/<nodename>` endpoint:
`POST /slurm/v0.0.42/nodes` `POST /slurm/v0.0.42/nodes`
* Do not allow changing or setting `PreemptMode=GANG` to a * Do not allow changing or setting `PreemptMode=GANG` to a
partition as this is a cluster-wide option. partition as this is a cluster-wide option.
* Add `%b` as a file name pattern for the array task id modulo 10. * Add `%b` as a file name pattern for the array task id modulo 10.
* Skip packing empty nodes when they are hidden during * Skip packing empty nodes when they are hidden during
`REQUEST_NODE_INFO RPC`. `REQUEST_NODE_INFO RPC`.
* `accounting_storage/mysql` - Avoid a fatal condition when * `accounting_storage/mysql` - Avoid a fatal condition when
the db server is not reachable. the db server is not reachable.
* Always lay out steps cyclically on nodes in an allocation. * Always lay out steps cyclically on nodes in an allocation.
* `squeue` - add priority by partition * `squeue` - add priority by partition
(`.jobs[].priority_by_partition`) to JSON and YAML output. (`.jobs[].priority_by_partition`) to JSON and YAML output.
* `slurmrestd` - Add clarification to `failed to open slurmdbd * `slurmrestd` - Add clarification to `failed to open slurmdbd
connection` error if the error was the result of an connection` error if the error was the result of an
authentication failure. authentication failure.
* Make it so `slurmctld` responds to RPCs that have authentication * Make it so `slurmctld` responds to RPCs that have authentication
errors with the `SLURM_PROTOCOL_AUTHENTICATION_ERROR` error errors with the `SLURM_PROTOCOL_AUTHENTICATION_ERROR` error
code. code.
* `openapi/slurmctld` - Display the correct error code instead * `openapi/slurmctld` - Display the correct error code instead
of `Unspecified error` if querying the following endpoints of `Unspecified error` if querying the following endpoints
fails: fails:
`GET /slurm/v0.0.40/diag/` `GET /slurm/v0.0.40/diag/`
`GET /slurm/v0.0.41/diag/` `GET /slurm/v0.0.41/diag/`
`GET /slurm/v0.0.42/diag/` `GET /slurm/v0.0.42/diag/`
`GET /slurm/v0.0.40/licenses/` `GET /slurm/v0.0.40/licenses/`
`GET /slurm/v0.0.41/licenses/` `GET /slurm/v0.0.41/licenses/`
`GET /slurm/v0.0.42/licenses/` `GET /slurm/v0.0.42/licenses/`
`GET /slurm/v0.0.40/reconfigure` `GET /slurm/v0.0.40/reconfigure`
`GET /slurm/v0.0.41/reconfigure` `GET /slurm/v0.0.41/reconfigure`
`GET /slurm/v0.0.42/reconfigure` `GET /slurm/v0.0.42/reconfigure`
* Fix how used CPUs are tracked in a job allocation to allow the * Fix how used CPUs are tracked in a job allocation to allow the
max number of concurrent steps to run at a time if threads per max number of concurrent steps to run at a time if threads per
core is greater than 1. core is greater than 1.
* In existing allocations SLURM_GPUS_PER_NODE environment * In existing allocations SLURM_GPUS_PER_NODE environment
variable will be ignored by srun if `--gpus` is specified. variable will be ignored by srun if `--gpus` is specified.
* When using `--get-user-env` explicitly or implicitly, check * When using `--get-user-env` explicitly or implicitly, check
if PID or mnt namespaces are disabled and fall back to old if PID or mnt namespaces are disabled and fall back to old
logic that does not rely on them when they are not available. logic that does not rely on them when they are not available.
* Removed non-functional option `SLURM_PROLOG_CPU_MASK` from * Removed non-functional option `SLURM_PROLOG_CPU_MASK` from
`TaskProlog` which was used to reset the affinity of a task `TaskProlog` which was used to reset the affinity of a task
based on the mask given. based on the mask given.
* `slurmrestd` - Support passing of `-d latest` to load latest * `slurmrestd` - Support passing of `-d latest` to load latest
version of `data_parser` plugin. version of `data_parser` plugin.
* `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`,`sshare` * `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`,`sshare`
- Change response to `--json=list` or `--yaml=list` to send - Change response to `--json=list` or `--yaml=list` to send
list of plugins to stdout and descriptive header to stderr to list of plugins to stdout and descriptive header to stderr to
allow for easier parsing. allow for easier parsing.
* `slurmrestd` - Change response to `-d list`, `-a list` or * `slurmrestd` - Change response to `-d list`, `-a list` or
`-s list` to send list of plugins to stdout and descriptive `-s list` to send list of plugins to stdout and descriptive
header to stderr to allow for easier parsing. header to stderr to allow for easier parsing.
* `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`, * `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`,
`sshare`,`slurmrestd` - Avoid crash when loading `data_parser` `sshare`,`slurmrestd` - Avoid crash when loading `data_parser`
plugins fail due to NULL dereference. plugins fail due to NULL dereference.
* Add autodetected GPUs to the output of `slurmd -C` * Add autodetected GPUs to the output of `slurmd -C`
* Remove `burst_buffer/lua` call `slurm.job_info_to_string()`. * Remove `burst_buffer/lua` call `slurm.job_info_to_string()`.
* Add `SchedulerParameters=bf_allow_magnetic_slot` option. It * Add `SchedulerParameters=bf_allow_magnetic_slot` option. It
allows jobs in magnetic reservations to be planned by backfill allows jobs in magnetic reservations to be planned by backfill
scheduler. scheduler.
* `slurmrestd` - Refuse to run as root, `SlurmUser`, and * `slurmrestd` - Refuse to run as root, `SlurmUser`, and
`nobody(99)`. `nobody(99)`.
* `openapi/slurmctld` - Revert regression that caused signaling * `openapi/slurmctld` - Revert regression that caused signaling
jobs to cancel entire job arrays instead of job array tasks: jobs to cancel entire job arrays instead of job array tasks:
`DELETE /slurm/v0.0.40/{job_id}` `DELETE /slurm/v0.0.40/{job_id}`
`DELETE /slurm/v0.0.41/{job_id}` `DELETE /slurm/v0.0.41/{job_id}`
`DELETE /slurm/v0.0.42/{job_id}` `DELETE /slurm/v0.0.42/{job_id}`
* `openapi/slurmctld` - Support more formats for `{job_id}` * `openapi/slurmctld` - Support more formats for `{job_id}`
including job steps: including job steps:
`DELETE /slurm/v0.0.40/{job_id}` `DELETE /slurm/v0.0.40/{job_id}`
`DELETE /slurm/v0.0.41/{job_id}` `DELETE /slurm/v0.0.41/{job_id}`
`DELETE /slurm/v0.0.42/{job_id}` `DELETE /slurm/v0.0.42/{job_id}`
* Alter scheduling of jobs at submission time to consider job * Alter scheduling of jobs at submission time to consider job
submission time and job id. This makes it so that that submission time and job id. This makes it so that that
interactive jobs aren't allocated resources before batch jobs interactive jobs aren't allocated resources before batch jobs
when they have the same priority at submit time. when they have the same priority at submit time.
* Fix multi-cluster submissions with differing Switch plugins. * Fix multi-cluster submissions with differing Switch plugins.
* `slurmrestd` - Change `+prefer_refs` flag to default in * `slurmrestd` - Change `+prefer_refs` flag to default in
`data_parser/v0.0.42` plugin. Add `+minimize_refs` flag to `data_parser/v0.0.42` plugin. Add `+minimize_refs` flag to
inline single referenced schemas in the OpenAPI schema. This inline single referenced schemas in the OpenAPI schema. This
sets the default OpenAPI schema generation behavior of sets the default OpenAPI schema generation behavior of
`data_parser/v0.0.42` to match v0.0.41 `+prefer_refs` and `data_parser/v0.0.42` to match v0.0.41 `+prefer_refs` and
v0.0.40 (without flags). v0.0.40 (without flags).
* Fix `LaunchParameters=batch_step_set_cpu_freq`. * Fix `LaunchParameters=batch_step_set_cpu_freq`.
* Clearer `seff` warning message for running jobs. * Clearer `seff` warning message for running jobs.
* `data_parser/v0.0.42` - Rename `JOB_INFO` field * `data_parser/v0.0.42` - Rename `JOB_INFO` field
`minimum_switches` to `required_switches` to reflect the `minimum_switches` to `required_switches` to reflect the
actual behavior. actual behavior.
* `data_parser/v0.0.42` - Rename `ACCOUNT_CONDITION` field * `data_parser/v0.0.42` - Rename `ACCOUNT_CONDITION` field
`assocation` to `association` to fix typo. `assocation` to `association` to fix typo.
* `cgroup/v2` - fix cgroup cleanup when running inside a * `cgroup/v2` - fix cgroup cleanup when running inside a
container without write permissions to `/sys/fs/cgroup`. container without write permissions to `/sys/fs/cgroup`.
* `cgroup/v2` - fix accounting of swap events detection. * `cgroup/v2` - fix accounting of swap events detection.
* Fix gathering MaxRSS for jobs that run shorter than two * Fix gathering MaxRSS for jobs that run shorter than two
`jobacctgather` intervals. Get the metrics from cgroups `jobacctgather` intervals. Get the metrics from cgroups
`memory.peak` or `memory.max_usage_in_bytes` where available. `memory.peak` or `memory.max_usage_in_bytes` where available.
* `openapi/slurmctld` - Set complex number support for the * `openapi/slurmctld` - Set complex number support for the
following fields: following fields:
`.shares[][].fairshare.factor` `.shares[][].fairshare.factor`
`.shares[][].fairshare.level` `.shares[][].fairshare.level`
for endpoints: for endpoints:
`GET /slurm/v0.0.42/shares` `GET /slurm/v0.0.42/shares`
and for commands: and for commands:
`sshare --json` `sshare --json`
`sshare --yaml` `sshare --yaml`
* `data_parser/v0.0.42` - Avoid dumping `Infinity` for `NO_VAL` * `data_parser/v0.0.42` - Avoid dumping `Infinity` for `NO_VAL`
tagged `number` fields. tagged `number` fields.
* Add `TopologyParam=TopoMaxSizeUnroll=#` to allow * Add `TopologyParam=TopoMaxSizeUnroll=#` to allow
`--nodes=<min>-<max>` for `topology/block`. `--nodes=<min>-<max>` for `topology/block`.
* `sacct` - Respect `--noheader` for `--batch-script` and * `sacct` - Respect `--noheader` for `--batch-script` and
`--env-vars`. `--env-vars`.
* `sacct` - Remove extra newline in output from `--batch-script` * `sacct` - Remove extra newline in output from `--batch-script`
and --env-vars. and --env-vars.
* Add `sacctmgr ping` command to query status of `slurmdbd`. * Add `sacctmgr ping` command to query status of `slurmdbd`.
* Generate an error message when a `NodeSet` name conflicts with * Generate an error message when a `NodeSet` name conflicts with
a `NodeName`, and prevent the controller from starting if such a `NodeName`, and prevent the controller from starting if such
a conflict exists. a conflict exists.
* `slurmd` - properly detect slurmd restarts in the energy * `slurmd` - properly detect slurmd restarts in the energy
gathering logic which caused bad numbers in accounting. gathering logic which caused bad numbers in accounting.
* `sackd` - retry fetching slurm configs indefinately in * `sackd` - retry fetching slurm configs indefinately in
configless mode. configless mode.
* `job_submit/lua` - Add `assoc_qos` attribute to `job_desc` * `job_submit/lua` - Add `assoc_qos` attribute to `job_desc`
to display all potential QOS's for a job's association. to display all potential QOS's for a job's association.
* `job_submit/lua` - Add `slurm.get_qos_priority()` function * `job_submit/lua` - Add `slurm.get_qos_priority()` function
to retrieve the given QOS's priority. to retrieve the given QOS's priority.
* `sbcast` - Add `--nodelist` option to specify where files are * `sbcast` - Add `--nodelist` option to specify where files are
transmitted to. transmitted to.
* `sbcast` - Add `--no-allocation` option to transmit files to * `sbcast` - Add `--no-allocation` option to transmit files to
nodes outside of a job allocation nodes outside of a job allocation
* Add `DataParserParameters` `slurm.conf` parameter to allow * Add `DataParserParameters` `slurm.conf` parameter to allow
setting default value for CLI `--json` and `--yaml` arguments. setting default value for CLI `--json` and `--yaml` arguments.
* `seff` - improve step's max memory consumption report by using * `seff` - improve step's max memory consumption report by using
`TresUsageInTot` and `TresUsageInAve` instead of overestimating `TresUsageInTot` and `TresUsageInAve` instead of overestimating
the values. the values.
* Enable RPC queueing for `REQUEST_KILL_JOBS`, which is used when * Enable RPC queueing for `REQUEST_KILL_JOBS`, which is used when
`scancel` is executed with `--ctld` flag. `scancel` is executed with `--ctld` flag.
* `slurmdbd` - Add `-u` option. This is used to determine if * `slurmdbd` - Add `-u` option. This is used to determine if
restarting the DBD will result in database conversion. restarting the DBD will result in database conversion.
* Fix `srun` inside an `salloc` in a federated cluster when using * Fix `srun` inside an `salloc` in a federated cluster when using
IPv6. IPv6.
* Calculate the forwarding timeouts according to tree depth * Calculate the forwarding timeouts according to tree depth
rather than node count / tree width for each level. Fixes race rather than node count / tree width for each level. Fixes race
conditions with same timeouts between two consecutive node conditions with same timeouts between two consecutive node
levels. levels.
* Add ability to submit jobs with multiple QOS. * Add ability to submit jobs with multiple QOS.
* Fix difference in behavior when swapping partition order in job * Fix difference in behavior when swapping partition order in job
submission. submission.
* Improve `PLANNED` state detection for mixed nodes and updating * Improve `PLANNED` state detection for mixed nodes and updating
state before yielding backfill locks. state before yielding backfill locks.
* Always consider partition priority tiers when deciding to try * Always consider partition priority tiers when deciding to try
scheduling jobs on submit. scheduling jobs on submit.
* Prevent starting jobs without reservations on submit when there * Prevent starting jobs without reservations on submit when there
are pending jobs with reservations that have flags `FLEX` or are pending jobs with reservations that have flags `FLEX` or
`ANY_NODES` that can be scheduled on overlapping nodes. `ANY_NODES` that can be scheduled on overlapping nodes.
* Prevent jobs that request both high and low priority tier * Prevent jobs that request both high and low priority tier
partitions from starting on submit in lower priority tier partitions from starting on submit in lower priority tier
partitions if it could delay pending jobs in higher priority partitions if it could delay pending jobs in higher priority
tier partitions. tier partitions.
* `scontrol` - Wait for `slurmctld` to start reconfigure in * `scontrol` - Wait for `slurmctld` to start reconfigure in
foreground mode before returning. foreground mode before returning.
* Improve reconfigure handling on Linux to only close open file * Improve reconfigure handling on Linux to only close open file
descriptors to avoid long delays on systems with large descriptors to avoid long delays on systems with large
`RLIMIT_NOFILE` settings. `RLIMIT_NOFILE` settings.
* `salloc` - Removed `--get-user-env` option. * `salloc` - Removed `--get-user-env` option.
* Removed the instant on feature from `switch/hpe_slingshot`. * Removed the instant on feature from `switch/hpe_slingshot`.
* Hardware collectives in `switch/hpe_slingshot` now requires * Hardware collectives in `switch/hpe_slingshot` now requires
`enable_stepmgr`. `enable_stepmgr`.
* Allow backfill to plan jobs on nodes currently being used by * Allow backfill to plan jobs on nodes currently being used by
exclusive user or mcs jobs. exclusive user or mcs jobs.
* Avoid miscaching IPv6 address to hostname lookups that could * Avoid miscaching IPv6 address to hostname lookups that could
have caused logs to have the incorrect hostname. have caused logs to have the incorrect hostname.
* `scontrol` - Add `--json`/`--yaml` support to `listpids` * `scontrol` - Add `--json`/`--yaml` support to `listpids`
* `scontrol` - Add `liststeps` * `scontrol` - Add `liststeps`
* `scontrol` - Add `listjobs` * `scontrol` - Add `listjobs`
* `slurmrestd` - Avoid connection to slurmdbd for the following * `slurmrestd` - Avoid connection to slurmdbd for the following
endpoints: endpoints:
`GET /slurm/v0.0.42/jobs` `GET /slurm/v0.0.42/jobs`
`GET /slurm/v0.0.42/job/{job_id}` `GET /slurm/v0.0.42/job/{job_id}`
* `slurmctld` - Changed incoming RPC handling to dedicated thread * `slurmctld` - Changed incoming RPC handling to dedicated thread
pool. pool.
* `job_container/tmpfs` - Add `EntireStepInNS` option that will * `job_container/tmpfs` - Add `EntireStepInNS` option that will
place the `slurmstepd` process within the constructed namespace place the `slurmstepd` process within the constructed namespace
directly. directly.
* `scontrol show topo` - Show aggregated block sizes when using * `scontrol show topo` - Show aggregated block sizes when using
`topology/block`. `topology/block`.
* `slurmrestd` - Add more descriptive HTTP status for * `slurmrestd` - Add more descriptive HTTP status for
authentication failure and connectivity errors with controller. authentication failure and connectivity errors with controller.
* `slurmrestd` - Improve reporting errors from `slurmctld` for * `slurmrestd` - Improve reporting errors from `slurmctld` for
job queries: job queries:
`GET /slurm/v0.0.41/{job_id}` `GET /slurm/v0.0.41/{job_id}`
`GET /slurm/v0.0.41/jobs/` `GET /slurm/v0.0.41/jobs/`
* Avoid rejecting a step request that needs fewer GRES than nodes * Avoid rejecting a step request that needs fewer GRES than nodes
in the job allocation. in the job allocation.
* `slurmrestd` - Tag the never populated `.jobs[].pid` field as * `slurmrestd` - Tag the never populated `.jobs[].pid` field as
deprecated for the following endpoints: deprecated for the following endpoints:
`GET /slurm/v0.0.42/{job_id}` `GET /slurm/v0.0.42/{job_id}`
`GET /slurm/v0.0.42/jobs/` `GET /slurm/v0.0.42/jobs/`
* `scontrol`,`squeue` - Tag the never populated `.jobs[].pid` field * `scontrol`,`squeue` - Tag the never populated `.jobs[].pid` field
as deprecated for the following: as deprecated for the following:
`scontrol show jobs --json` `scontrol show jobs --json`
`scontrol show jobs --yaml` `scontrol show jobs --yaml`
`scontrol show job ${JOB_ID} --json` `scontrol show job ${JOB_ID} --json`
`scontrol show job ${JOB_ID} --yaml` `scontrol show job ${JOB_ID} --yaml`
`squeue --json` `squeue --json`
`squeue --yaml` `squeue --yaml`
* `data_parser` v0.0.42 - fix timestamp parsing regression * `data_parser` v0.0.42 - fix timestamp parsing regression
introduced in v0.0.40 (eaf3b6631f), parsing of non ISO 8601 introduced in v0.0.40 (eaf3b6631f), parsing of non ISO 8601
style timestamps style timestamps
* `cgroup/v2` will detect some special container and namespaced * `cgroup/v2` will detect some special container and namespaced
setups and will work with it. setups and will work with it.
* Support IPv6 in configless mode. * Support IPv6 in configless mode.
* Add `SlurmctldParameters=ignore_constraint_validation` to ignore * Add `SlurmctldParameters=ignore_constraint_validation` to ignore
`constraint/feature` validation at submission. `constraint/feature` validation at submission.
* `slurmrestd` - Set `.pings[].mode` field as deprecated in the * `slurmrestd` - Set `.pings[].mode` field as deprecated in the
following endpoints: following endpoints:
`GET /slurm/v0.0.42/ping` `GET /slurm/v0.0.42/ping`
* `scontrol` - Set `.pings[].mode` field as deprecated in the * `scontrol` - Set `.pings[].mode` field as deprecated in the
following commands: following commands:
`scontrol ping --json` `scontrol ping --json`
`scontrol ping --yaml` `scontrol ping --yaml`
* `slurmrestd` - Set `.pings[].pinged` field as deprecated in * `slurmrestd` - Set `.pings[].pinged` field as deprecated in
the following endpoints: the following endpoints:
`GET /slurm/v0.0.42/ping` `GET /slurm/v0.0.42/ping`
* `scontrol` - Set `.pings[].pinged` field as deprecated in the * `scontrol` - Set `.pings[].pinged` field as deprecated in the
following commands: following commands:
`scontrol ping --json` `scontrol ping --json`
`scontrol ping --yaml` `scontrol ping --yaml`
* `slurmrestd` - Add `.pings[].primary` field to the following * `slurmrestd` - Add `.pings[].primary` field to the following
endpoints: endpoints:
`GET /slurm/v0.0.42/ping` `GET /slurm/v0.0.42/ping`
* `scontrol` - Add `.pings[].primary` field to the following * `scontrol` - Add `.pings[].primary` field to the following
commands: commands:
`scontrol ping --json` `scontrol ping --json`
`scontrol ping --yaml` `scontrol ping --yaml`
* `slurmrestd` - Add `.pings[].responding` field to the following * `slurmrestd` - Add `.pings[].responding` field to the following
endpoints: endpoints:
`GET /slurm/v0.0.42/ping` `GET /slurm/v0.0.42/ping`
* `scontrol` - Add `.pings[].responding` field to the following * `scontrol` - Add `.pings[].responding` field to the following
commands: commands:
`scontrol ping --json` `scontrol ping --json`
`scontrol ping --yaml` `scontrol ping --yaml`
* Prevent jobs without reservations from delaying jobs in * Prevent jobs without reservations from delaying jobs in
reservations with flags `FLEX` or `ANY_NODES` in the main reservations with flags `FLEX` or `ANY_NODES` in the main
scheduler. scheduler.
* Fix allowing to ask for multiple different types of TRES * Fix allowing to ask for multiple different types of TRES
when one of them has a value of 0. when one of them has a value of 0.
* `slurmctld` - Add a grace period to ensure the agent retry * `slurmctld` - Add a grace period to ensure the agent retry
queue is properly flushed during shutdown. queue is properly flushed during shutdown.
* Don't ship `src/slurmrestd/plugins/openapi/slurmdbd/openapi.json` * Don't ship `src/slurmrestd/plugins/openapi/slurmdbd/openapi.json`
`slurmrest` should always be used to generate a new OpenAPI `slurmrest` should always be used to generate a new OpenAPI
schema (aka openapi.json or openapi.yaml). schema (aka openapi.json or openapi.yaml).
* `mpi/pmix` - Fix potential deadlock and races with het jobs, * `mpi/pmix` - Fix potential deadlock and races with het jobs,
and fix potential memory and FDs leaks. and fix potential memory and FDs leaks.
* Fix jobs with `--gpus` being rejected in some edge cases for * Fix jobs with `--gpus` being rejected in some edge cases for
partitions where not all nodes have the same amount of GPUs partitions where not all nodes have the same amount of GPUs
and CPUs configured. and CPUs configured.
* In an extra constraints expression in a job request, do not * In an extra constraints expression in a job request, do not
allow an empty string for a key or value. allow an empty string for a key or value.
* In an extra constraints expression in a job request, fix * In an extra constraints expression in a job request, fix
validation that requests are separated by boolean operators. validation that requests are separated by boolean operators.
* Add `TaskPluginParam=OOMKillStep` to kill the step as a whole * Add `TaskPluginParam=OOMKillStep` to kill the step as a whole
when one task OOMs. when one task OOMs.
* Fix `scontrol` show conf not showing all `TaskPluginParam` * Fix `scontrol` show conf not showing all `TaskPluginParam`
elements. elements.
* `slurmrestd` - Add fields `.job.oom_kill_step` * `slurmrestd` - Add fields `.job.oom_kill_step`
`.jobs[].oom_kill_step` to `POST /slurm/v0.0.42/job/submit` `.jobs[].oom_kill_step` to `POST /slurm/v0.0.42/job/submit`
and `POST /slurm/v0.0.42/job/allocate`. and `POST /slurm/v0.0.42/job/allocate`.
* Improve performance for `_will_run_test()`. * Improve performance for `_will_run_test()`.
* Add `SchedulerParameters=bf_topopt_enable` option to enable * Add `SchedulerParameters=bf_topopt_enable` option to enable
experimental hook to control backfill. experimental hook to control backfill.
* If a step fails to launch under certain conditions, set the * If a step fails to launch under certain conditions, set the
step's state to `NODE_FAIL`. step's state to `NODE_FAIL`.
* `sched/backfill` - Fix certain situations where a job would * `sched/backfill` - Fix certain situations where a job would
not get a planned time, which could lead to it being delayed not get a planned time, which could lead to it being delayed
by lower priority jobs. by lower priority jobs.
* `slurmrestd` - Dump JSON `null` instead of `{}` (empty object) * `slurmrestd` - Dump JSON `null` instead of `{}` (empty object)
for non-required fields in objects to avoid client for non-required fields in objects to avoid client
compatibility issues for v0.0.42 version tagged endpoints. compatibility issues for v0.0.42 version tagged endpoints.
* `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`, * `sacct`,`sacctmgr`,`scontrol`,`sdiag`,`sinfo`,`squeue`,
`sshare` - Dump `null` instead of `{}` (empty object) for `sshare` - Dump `null` instead of `{}` (empty object) for
non-required fields in objects to avoid client compatibility non-required fields in objects to avoid client compatibility
issues when run with `--json` or `--yaml`. issues when run with `--json` or `--yaml`.
------------------------------------------------------------------- -------------------------------------------------------------------
Fri Nov 1 12:50:27 UTC 2024 - Egbert Eich <eich@suse.com> Fri Nov 1 12:50:27 UTC 2024 - Egbert Eich <eich@suse.com>

View File

@ -174,6 +174,7 @@ Source12: slurmdbd.xml
# create: tar --owner=nobody --group=nogroup --exclude=*~ -cvzf test_setup.tar.gz test_setup # create: tar --owner=nobody --group=nogroup --exclude=*~ -cvzf test_setup.tar.gz test_setup
Source20: test_setup.tar.gz Source20: test_setup.tar.gz
Source21: README_Testsuite.md Source21: README_Testsuite.md
Source22: regression.py.sle12
Patch0: Remove-rpath-from-build.patch Patch0: Remove-rpath-from-build.patch
Patch2: pam_slurm-Initialize-arrays-and-pass-sizes.patch Patch2: pam_slurm-Initialize-arrays-and-pass-sizes.patch
Patch15: Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch Patch15: Fix-test7.2-to-find-libpmix-under-lib64-as-well.patch
@ -581,7 +582,9 @@ Requires: %{name}-lua = %version
Requires: %{name}-munge = %version Requires: %{name}-munge = %version
Requires: %{name}-node = %version Requires: %{name}-node = %version
Requires: %{name}-openlava = %version Requires: %{name}-openlava = %version
%if 0%{?build_slurmrestd}
Requires: %{name}-rest = %version Requires: %{name}-rest = %version
%endif
Requires: %{name}-seff = %version Requires: %{name}-seff = %version
Requires: %{name}-sjstat = %version Requires: %{name}-sjstat = %version
Requires: %{name}-slurmdbd = %version Requires: %{name}-slurmdbd = %version
@ -598,6 +601,7 @@ Requires: libnuma-devel
Requires: pam Requires: pam
Requires: pdsh Requires: pdsh
Requires: perl-%{name} = %version Requires: perl-%{name} = %version
Requires: readline-devel
Requires: sudo Requires: sudo
Requires: tar Requires: tar
BuildRequires: sudo BuildRequires: sudo
@ -890,6 +894,10 @@ find -type f -name "*.[ao]" -print | while read f; do
# drop non-deterministic lto bits from .o files # drop non-deterministic lto bits from .o files
strip -p --discard-locals -R .gnu.lto_* -R .gnu.debuglto_* -N __gnu_lto_v1 $f strip -p --discard-locals -R .gnu.lto_* -R .gnu.debuglto_* -N __gnu_lto_v1 $f
done done
# on versions < SLE15 replace regression.py with one compatible with py 3.4
%if 0%{?sle_version:1} && 0%{?sle_version} < 150000
install -m 755 %{S:22} %{buildroot}/srv/slurm-testsuite/testsuite/expect/regression.py
%endif
%if 0%{?suse_version} >= 1500 %if 0%{?suse_version} >= 1500
%define tar_sort --sort=name %define tar_sort --sort=name
%endif %endif
@ -922,6 +930,12 @@ fi
sed -i -e '/ExecStart/aExecStartPre=/bin/bash -c "for i in 0 1 2 3; do test -e /dev/nvidia$i || mknod /dev/nvidia$i c 10 $((i+2)); done"' $SLURMD_SERVICE sed -i -e '/ExecStart/aExecStartPre=/bin/bash -c "for i in 0 1 2 3; do test -e /dev/nvidia$i || mknod /dev/nvidia$i c 10 $((i+2)); done"' $SLURMD_SERVICE
tar -xzf %{S:20} tar -xzf %{S:20}
# on versions < SLE15 turn off AcctGatherProfileType and pmix
%if 0%{?sle_version:1} && 0%{?sle_version} < 150000
sed -i -e "/AcctGatherProfileType/s@^@#@" \
-e "/MpiDefault/s@pmix_v3@pmi2@" test_setup/slurm.conf
sed -i -e "/ProfileHDF5Dir/s@^@#@" test_setup/acct_gather.conf
%endif
mkdir -p %{buildroot}%{_pam_secconfdir}/limits.d mkdir -p %{buildroot}%{_pam_secconfdir}/limits.d
mv test_setup/slurm.conf.limits %{buildroot}%_pam_secconfdir/limits.d/slurm.conf mv test_setup/slurm.conf.limits %{buildroot}%_pam_secconfdir/limits.d/slurm.conf
%if 0%{?sle_version} < 150200 %if 0%{?sle_version} < 150200

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1 version https://git-lfs.github.com/spec/v1
oid sha256:7a45706911924b06a2ec7d436d4e991d84dc459a505cbdfca244ac5fad2b9b60 oid sha256:3c2249601135c2d6c2e6a8d7aa7318d50d354015ecf8a56fc467b43aa0059288
size 3165 size 3201