From 840527985f03a4327fc0fe78e45d889742601698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 11 Aug 2023 13:51:20 +0200 Subject: [PATCH 5006/5010] cgroup: Add EffectiveMemoryMax=, EffectiveMemoryHigh= and EffectiveTasksMax= properties Users become perplexed when they run their workload in a unit with no explicit limits configured (moreover, listing the limit property would even show it's infinity) but they experience unexpected resource limitation. The memory and pid limits come as the most visible, therefore add new unit read-only properties: - EffectiveMemoryMax=, - EffectiveMemoryHigh=, - EffectiveTasksMax=. These properties represent the most stringent limit systemd is aware of for the given unit -- and that is typically(*) the effective value. Implement the properties by simply traversing all parents in the leaf-slice tree and picking the minimum value. Note that effective limits are thus defined even for units that don't enable explicit accounting (because of the hierarchy). (*) The evasive case is when systemd runs in a cgroupns and cannot reason about outer setup. Complete solution would need kernel support. 
(cherry picked from commit 4fb0d2dc140c9a2c01c236d2a8dc09a44157e896) [mkoutny: fixes jsc#PED-5659] --- man/org.freedesktop.systemd1.xml | 126 ++++++++++++++++++++++++++++++ man/systemd.resource-control.xml | 11 ++- src/core/cgroup.c | 48 ++++++++++++ src/core/cgroup.h | 13 +++ src/core/dbus-unit.c | 25 ++++++ src/shared/bus-print-properties.c | 6 +- 6 files changed, 224 insertions(+), 5 deletions(-) diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index a1bcbba02f..59733c0039 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2786,6 +2786,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryAvailable = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryHigh = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUUsageNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly ay EffectiveCPUs = [...]; @@ -2794,6 +2798,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t TasksCurrent = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveTasksMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressBytes = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressPackets = ...; @@ -3419,6 +3425,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -3427,6 +3437,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4061,6 +4073,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -4069,6 +4085,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4865,6 
+4883,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryAvailable = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryHigh = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUUsageNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly ay EffectiveCPUs = [...]; @@ -4873,6 +4895,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t TasksCurrent = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveTasksMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressBytes = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressPackets = ...; @@ -5508,6 +5532,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -5516,6 +5544,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6132,6 +6162,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -6140,6 +6174,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6810,6 +6846,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryAvailable = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryHigh = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUUsageNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly ay EffectiveCPUs = [...]; @@ -6818,6 +6858,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { 
@org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t TasksCurrent = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveTasksMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressBytes = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressPackets = ...; @@ -7381,6 +7423,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -7389,6 +7435,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -7919,6 +7967,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -7927,6 +7979,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8720,6 +8774,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryAvailable = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryHigh = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUUsageNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly ay EffectiveCPUs = [...]; @@ -8728,6 +8786,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t TasksCurrent = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveTasksMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressBytes = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressPackets = ...; @@ -9277,6 +9337,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -9285,6 +9349,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -9801,6 +9867,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -9809,6 +9879,8 @@ node 
/org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10461,6 +10533,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryAvailable = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryHigh = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUUsageNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly ay EffectiveCPUs = [...]; @@ -10469,6 +10545,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t TasksCurrent = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveTasksMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressBytes = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressPackets = ...; @@ -10644,6 +10722,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -10652,6 +10734,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -10832,6 +10916,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -10840,6 +10928,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -11046,6 +11136,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t MemoryAvailable = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveMemoryHigh = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t CPUUsageNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly ay EffectiveCPUs = [...]; @@ -11054,6 +11148,8 @@ node 
/org/freedesktop/systemd1/unit/session_2d1_2escope { @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t TasksCurrent = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t EffectiveTasksMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressBytes = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t IPIngressPackets = ...; @@ -11249,6 +11345,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -11257,6 +11357,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -11467,6 +11569,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -11475,6 +11581,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -11866,6 +11974,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ MemorySwapCurrent, MemorySwapPeak, and MemoryZSwapCurrent were added in version 255. + EffectiveMemoryHigh, + EffectiveMemoryMax, + EffectiveTasksMax were added in version 256. Socket Unit Objects @@ -11897,6 +12008,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ MemorySwapCurrent, MemorySwapPeak, and MemoryZSwapCurrent were added in version 255. + EffectiveMemoryHigh, + EffectiveMemoryMax, + EffectiveTasksMax were added in version 256. Mount Unit Objects @@ -11926,6 +12040,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ MemorySwapCurrent, MemorySwapPeak, and MemoryZSwapCurrent were added in version 255. + EffectiveMemoryHigh, + EffectiveMemoryMax, + EffectiveTasksMax were added in version 256. Swap Unit Objects @@ -11955,6 +12072,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ MemorySwapCurrent, MemorySwapPeak, and MemoryZSwapCurrent were added in version 255. + EffectiveMemoryHigh, + EffectiveMemoryMax, + EffectiveTasksMax were added in version 256. 
Slice Unit Objects @@ -11975,6 +12095,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ MemorySwapCurrent, MemorySwapPeak, and MemoryZSwapCurrent were added in version 255. + EffectiveMemoryHigh, + EffectiveMemoryMax, + EffectiveTasksMax were added in version 256. Scope Unit Objects @@ -11996,6 +12119,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ MemorySwapCurrent, MemorySwapPeak, and MemoryZSwapCurrent were added in version 255. + EffectiveMemoryHigh, + EffectiveMemoryMax, + EffectiveTasksMax were added in version 256. Job Objects diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 42f265c950..bd8b6a5719 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -406,7 +406,9 @@ CPUWeight=20 DisableControllers=cpu / \ system. If assigned the special value infinity, no memory throttling is applied. This controls the memory.high control group attribute. For details about this control group attribute, see - Memory Interface Files. + Memory Interface Files. + The effective configuration is reported as EffectiveMemoryHigh= + (see also EffectiveMemoryMax=). While StartupMemoryHigh= applies to the startup and shutdown phases of the system, MemoryHigh= applies to normal runtime of the system, and if the former is not set also to @@ -434,7 +436,9 @@ CPUWeight=20 DisableControllers=cpu / \ percentage value may be specified, which is taken relative to the installed physical memory on the system. If assigned the special value infinity, no memory limit is applied. This controls the memory.max control group attribute. For details about this control group attribute, see - Memory Interface Files. + Memory Interface Files. + The effective configuration is reported as EffectiveMemoryMax= (the value is + the most stringent limit of the unit and parent slices). 
While StartupMemoryMax= applies to the startup and shutdown phases of the system, MemoryMax= applies to normal runtime of the system, and if the former is not set also to @@ -560,7 +564,8 @@ CPUWeight=20 DisableControllers=cpu / \ limit is applied. This controls the pids.max control group attribute. For details about this control group attribute, the pids controller - . + . + The effective configuration is reported as EffectiveTasksMax=. The system default for this setting may be controlled with DefaultTasksMax= in diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 61ac4df1a6..78ca67216a 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -4243,6 +4243,46 @@ int unit_get_ip_accounting( return r; } +/* Returns the limit of the requested type as stored in this unit's own CGroupContext (tasks_max goes through cgroup_tasks_max_resolve()). Parent slices are not consulted here; the caller must pass a unit that has a cgroup context. */ static uint64_t unit_get_effective_limit_one(Unit *u, CGroupLimitType type) { + CGroupContext *cc; + + assert(u); + assert(UNIT_HAS_CGROUP_CONTEXT(u)); + + cc = unit_get_cgroup_context(u); + switch (type) { + /* Note: on legacy/hybrid hierarchies memory_max stays CGROUP_LIMIT_MAX unless configured + * explicitly. Effective value of MemoryLimit= (cgroup v1) is not implemented.
*/ + case CGROUP_LIMIT_MEMORY_MAX: + return cc->memory_max; + case CGROUP_LIMIT_MEMORY_HIGH: + return cc->memory_high; + case CGROUP_LIMIT_TASKS_MAX: + return cgroup_tasks_max_resolve(&cc->tasks_max); + default: + assert_not_reached(); + } +} + +/* Computes the effective limit of the given type for u: the minimum of the unit's own value and the value of every slice reached by following UNIT_GET_SLICE() repeatedly until it yields NULL. On success stores the result in *ret and returns 0; returns -EINVAL without touching *ret if u has no cgroup context. */ int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret) { + uint64_t infimum; + + assert(u); + assert(ret); + assert(type >= 0); + assert(type < _CGROUP_LIMIT_TYPE_MAX); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + infimum = unit_get_effective_limit_one(u, type); /* start from the unit's own value, then tighten it with each ancestor slice */ + for (Unit *slice = UNIT_GET_SLICE(u); slice; slice = UNIT_GET_SLICE(slice)) + infimum = MIN(infimum, unit_get_effective_limit_one(slice, type)); + + *ret = infimum; + return 0; +} + static int unit_get_io_accounting_raw(Unit *u, uint64_t ret[static _CGROUP_IO_ACCOUNTING_METRIC_MAX]) { static const char *const field_names[_CGROUP_IO_ACCOUNTING_METRIC_MAX] = { [CGROUP_IO_READ_BYTES] = "rbytes=", @@ -4663,3 +4703,11 @@ static const char* const cgroup_memory_accounting_metric_table[_CGROUP_MEMORY_AC }; DEFINE_STRING_TABLE_LOOKUP(cgroup_memory_accounting_metric, CGroupMemoryAccountingMetric); + +/* The strings are the D-Bus property names: property_get_effective_limit() in dbus-unit.c passes the property name to cgroup_limit_type_from_string() to select the limit type. */ static const char *const cgroup_limit_type_table[_CGROUP_LIMIT_TYPE_MAX] = { + [CGROUP_LIMIT_MEMORY_MAX] = "EffectiveMemoryMax", + [CGROUP_LIMIT_MEMORY_HIGH] = "EffectiveMemoryHigh", + [CGROUP_LIMIT_TASKS_MAX] = "EffectiveTasksMax", +}; + +DEFINE_STRING_TABLE_LOOKUP(cgroup_limit_type, CGroupLimitType); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index f1b674b4b7..54bce91ea1 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -276,6 +276,15 @@ typedef enum CGroupMemoryAccountingMetric { _CGROUP_MEMORY_ACCOUNTING_METRIC_INVALID = -EINVAL, } CGroupMemoryAccountingMetric; +/* Used for limits whose value sets have infimum, i.e. limits for which unit_get_effective_limit() reports the minimum over the unit and its parent slices. _CGROUP_LIMIT_TYPE_MAX is the upper bound for valid types and sizes cgroup_limit_type_table. */ +typedef enum CGroupLimitType { + CGROUP_LIMIT_MEMORY_MAX, + CGROUP_LIMIT_MEMORY_HIGH, + CGROUP_LIMIT_TASKS_MAX, + _CGROUP_LIMIT_TYPE_MAX, + _CGROUP_LIMIT_INVALID = -EINVAL, +} CGroupLimitType; + typedef struct Unit
Unit; typedef struct Manager Manager; typedef enum ManagerState ManagerState; @@ -374,6 +383,7 @@ int unit_get_tasks_current(Unit *u, uint64_t *ret); int unit_get_cpu_usage(Unit *u, nsec_t *ret); int unit_get_io_accounting(Unit *u, CGroupIOAccountingMetric metric, bool allow_cache, uint64_t *ret); int unit_get_ip_accounting(Unit *u, CGroupIPAccountingMetric metric, uint64_t *ret); +int unit_get_effective_limit(Unit *u, CGroupLimitType type, uint64_t *ret); int unit_reset_cpu_accounting(Unit *u); void unit_reset_memory_accounting_last(Unit *u); @@ -425,5 +435,8 @@ CGroupIPAccountingMetric cgroup_ip_accounting_metric_from_string(const char *s) const char* cgroup_io_accounting_metric_to_string(CGroupIOAccountingMetric m) _const_; CGroupIOAccountingMetric cgroup_io_accounting_metric_from_string(const char *s) _pure_; +const char* cgroup_limit_type_to_string(CGroupLimitType m) _const_; +CGroupLimitType cgroup_limit_type_from_string(const char *s) _pure_; + const char* cgroup_memory_accounting_metric_to_string(CGroupMemoryAccountingMetric m) _const_; CGroupMemoryAccountingMetric cgroup_memory_accounting_metric_from_string(const char *s) _pure_; diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 1a037b7035..ac6add4700 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -1441,6 +1441,28 @@ static int property_get_io_counter( return sd_bus_message_append(reply, "t", value); } +/* D-Bus property getter shared by EffectiveMemoryMax, EffectiveMemoryHigh and EffectiveTasksMax: the property name is mapped to a CGroupLimitType, and the effective limit of the unit passed as userdata is appended to the reply as "t". */ static int property_get_effective_limit( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + uint64_t value = CGROUP_LIMIT_MAX; + Unit *u = ASSERT_PTR(userdata); + ssize_t type; + + assert(bus); + assert(reply); + assert(property); + + assert_se((type = cgroup_limit_type_from_string(property)) >= 0); + (void) unit_get_effective_limit(u, type, &value); /* result deliberately ignored: on failure value keeps its CGROUP_LIMIT_MAX initializer and that is what gets reported */ + return sd_bus_message_append(reply, "t", value); +} + int bus_unit_method_attach_processes(sd_bus_message *message, void
*userdata, sd_bus_error *error) { _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; _cleanup_set_free_ Set *pids = NULL; @@ -1562,10 +1584,13 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_PROPERTY("MemorySwapPeak", "t", property_get_memory_accounting, 0, 0), SD_BUS_PROPERTY("MemoryZSwapCurrent", "t", property_get_memory_accounting, 0, 0), SD_BUS_PROPERTY("MemoryAvailable", "t", property_get_available_memory, 0, 0), + SD_BUS_PROPERTY("EffectiveMemoryMax", "t", property_get_effective_limit, 0, 0), + SD_BUS_PROPERTY("EffectiveMemoryHigh", "t", property_get_effective_limit, 0, 0), SD_BUS_PROPERTY("CPUUsageNSec", "t", property_get_cpu_usage, 0, 0), SD_BUS_PROPERTY("EffectiveCPUs", "ay", property_get_cpuset_cpus, 0, 0), SD_BUS_PROPERTY("EffectiveMemoryNodes", "ay", property_get_cpuset_mems, 0, 0), SD_BUS_PROPERTY("TasksCurrent", "t", property_get_current_tasks, 0, 0), + SD_BUS_PROPERTY("EffectiveTasksMax", "t", property_get_effective_limit, 0, 0), SD_BUS_PROPERTY("IPIngressBytes", "t", property_get_ip_counter, 0, 0), SD_BUS_PROPERTY("IPIngressPackets", "t", property_get_ip_counter, 0, 0), SD_BUS_PROPERTY("IPEgressBytes", "t", property_get_ip_counter, 0, 0), diff --git a/src/shared/bus-print-properties.c b/src/shared/bus-print-properties.c index 6704e1ef3d..99b1cc7c70 100644 --- a/src/shared/bus-print-properties.c +++ b/src/shared/bus-print-properties.c @@ -164,9 +164,11 @@ static int bus_print_property(const char *name, const char *expected_value, sd_b bus_print_property_value(name, expected_value, flags, "[not set]"); - else if ((ENDSWITH_SET(name, "MemoryLow", "MemoryMin", "MemoryHigh", "MemoryMax", "MemorySwapMax", "MemoryZSwapMax", "MemoryLimit") && + else if ((ENDSWITH_SET(name, "MemoryLow", "MemoryMin", + "MemoryHigh", "MemoryMax", + "MemorySwapMax", "MemoryZSwapMax", "MemoryLimit") && u == CGROUP_LIMIT_MAX) || - (STR_IN_SET(name, "TasksMax", "DefaultTasksMax") && u == UINT64_MAX) || + (endswith(name, "TasksMax") && u == UINT64_MAX) || 
(startswith(name, "Limit") && u == UINT64_MAX) || (startswith(name, "DefaultLimit") && u == UINT64_MAX)) -- 2.35.3