137 lines
4.7 KiB
Diff
137 lines
4.7 KiB
Diff
From 0f1f8e303cf1919c33952f4938e5637d8f77f907 Mon Sep 17 00:00:00 2001
|
|
From: Aleksa Sarai <cyphar@cyphar.com>
|
|
Date: Fri, 7 Jul 2023 22:45:44 +1000
|
|
Subject: [PATCH 4/4] bsc1214960: nsenter: cloned_binary: remove bindfd logic
|
|
entirely
|
|
|
|
(This is a cherry-pick of b999376fb237195265081a8b8ba3fd3bd6ef8c2c.)
|
|
|
|
While the ro-bind-mount trick did eliminate the memory overhead of
|
|
copying the runc binary for each "runc init" invocation, on machines
|
|
with very significant container churn, creating a temporary mount
|
|
namespace on every container invocation can trigger severe lock
|
|
contention on namespace_sem that makes containers fail to spawn.
|
|
|
|
The only reason we added bindfd in commit 16612d74de5f ("nsenter:
|
|
cloned_binary: try to ro-bind /proc/self/exe before copying") was due to
|
|
a Kubernetes e2e test failure where they had a ridiculously small memory
|
|
limit. It seems incredibly unlikely that real workloads are running
|
|
without 10MB to spare for the very short time that runc is interacting
|
|
with the container.
|
|
|
|
In addition, since the original cloned_binary implementation, cgroupv2
|
|
is now almost universally used on modern systems. Unlike cgroupv1, the
|
|
cgroupv2 memcg implementation does not migrate memory usage when
|
|
processes change cgroups (even cgroupv1 only did this if you had
|
|
memory.move_charge_at_immigrate enabled). In addition, because we do the
|
|
/proc/self/exe clone before synchronising the bootstrap data read, we
|
|
are guaranteed to do the clone before "runc init" is moved into the
|
|
container cgroup -- meaning that the memory used by the /proc/self/exe
|
|
clone is charged against the root cgroup, and thus container workloads
|
|
should not be affected at all with memfd cloning.
|
|
|
|
The long-term fix for this problem is to block the /proc/self/exe
|
|
re-opening attack entirely in-kernel, which is something I'm working
|
|
on[1]. Though it should also be noted that because the memfd is
|
|
completely separate to the host binary, even attacks like Dirty COW
|
|
against the runc binary can be defended against with the memfd approach.
|
|
Of course, once we have in-kernel protection against the /proc/self/exe
|
|
re-opening attack, we won't have that protection anymore...
|
|
|
|
[1]: https://lwn.net/Articles/934460/
|
|
|
|
SUSE-Bugs: https://bugzilla.suse.com/show_bug.cgi?id=1214960
|
|
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
|
---
|
|
libcontainer/nsenter/cloned_binary.c | 67 ----------------------------
|
|
1 file changed, 67 deletions(-)
|
|
|
|
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
|
|
index d1b2d4c546f1..565748b13a4e 100644
|
|
--- a/libcontainer/nsenter/cloned_binary.c
|
|
+++ b/libcontainer/nsenter/cloned_binary.c
|
|
@@ -396,61 +396,6 @@ static int seal_execfd(int *fd, int fdtype)
|
|
return -1;
|
|
}
|
|
|
|
-static int try_bindfd(void)
|
|
-{
|
|
- int fd, ret = -1;
|
|
- char template[PATH_MAX] = { 0 };
|
|
- char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
|
-
|
|
- if (!prefix || *prefix != '/')
|
|
- prefix = "/tmp";
|
|
- if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
|
|
- return ret;
|
|
-
|
|
- /*
|
|
- * We need somewhere to mount it, mounting anything over /proc/self is a
|
|
- * BAD idea on the host -- even if we do it temporarily.
|
|
- */
|
|
- fd = mkstemp(template);
|
|
- if (fd < 0)
|
|
- return ret;
|
|
- close(fd);
|
|
-
|
|
- /*
|
|
- * For obvious reasons this won't work in rootless mode because we haven't
|
|
- * created a userns+mntns -- but getting that to work will be a bit
|
|
- * complicated and it's only worth doing if someone actually needs it.
|
|
- */
|
|
- ret = -EPERM;
|
|
- if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
|
|
- goto out;
|
|
- if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
|
|
- goto out_umount;
|
|
-
|
|
- /* Get read-only handle that we're sure can't be made read-write. */
|
|
- ret = open(template, O_PATH | O_CLOEXEC);
|
|
-
|
|
-out_umount:
|
|
- /*
|
|
- * Make sure the MNT_DETACH works, otherwise we could get remounted
|
|
- * read-write and that would be quite bad (the fd would be made read-write
|
|
- * too, invalidating the protection).
|
|
- */
|
|
- if (umount2(template, MNT_DETACH) < 0) {
|
|
- if (ret >= 0)
|
|
- close(ret);
|
|
- ret = -ENOTRECOVERABLE;
|
|
- }
|
|
-
|
|
-out:
|
|
- /*
|
|
- * We don't care about unlink errors, the worst that happens is that
|
|
- * there's an empty file left around in STATEDIR.
|
|
- */
|
|
- unlink(template);
|
|
- return ret;
|
|
-}
|
|
-
|
|
static ssize_t fd_to_fd(int outfd, int infd)
|
|
{
|
|
ssize_t total = 0;
|
|
@@ -485,18 +430,6 @@ static int clone_binary(void)
|
|
size_t sent = 0;
|
|
int fdtype = EFD_NONE;
|
|
|
|
- /*
|
|
- * Before we resort to copying, let's try creating an ro-binfd in one shot
|
|
- * by getting a handle for a read-only bind-mount of the execfd.
|
|
- */
|
|
- execfd = try_bindfd();
|
|
- if (execfd >= 0)
|
|
- return execfd;
|
|
-
|
|
- /*
|
|
- * Dammit, that didn't work -- time to copy the binary to a safe place we
|
|
- * can seal the contents.
|
|
- */
|
|
execfd = make_execfd(&fdtype);
|
|
if (execfd < 0 || fdtype == EFD_NONE)
|
|
return -ENOTRECOVERABLE;
|
|
--
|
|
2.46.0
|
|
|