From ff7b94c76f343931463b5916fb3fbd2610869a1a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 15 Oct 2017 17:06:20 +1100 Subject: [PATCH] daemon: oci: obey CL_UNPRIVILEGED for user namespaced daemon When runc is bind-mounting a particular path "with options", it has to do so by first creating a bind-mount and then modifying the options of said bind-mount via remount. However, in a user namespace, there are restrictions on which flags you can change with a remount (due to CL_UNPRIVILEGED being set in this instance). Docker historically has ignored this, and as a result, internal Docker mounts (such as secrets) haven't worked with --userns-remap. Fix this by preserving CL_UNPRIVILEGED mount flags when Docker is spawning containers with user namespaces enabled. SUSE-Bug: https://bugzilla.suse.com/show_bug.cgi?id=1055676 Signed-off-by: Aleksa Sarai --- components/engine/daemon/oci_linux.go | 46 +++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/components/engine/daemon/oci_linux.go b/components/engine/daemon/oci_linux.go index 6917b4841429..936cb8f998ca 100644 --- a/components/engine/daemon/oci_linux.go +++ b/components/engine/daemon/oci_linux.go @@ -27,6 +27,7 @@ import ( "github.com/opencontainers/runc/libcontainer/user" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) var ( @@ -469,6 +470,38 @@ func ensureSharedOrSlave(path string) error { return nil } +// Get the set of mount flags that are set on the mount that contains the given +// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that +// bind-mounting "with options" will not fail with user namespaces, due to +// kernel restrictions that require user namespace mounts to preserve +// CL_UNPRIVILEGED locked flags.
+func getUnprivilegedMountFlags(path string) ([]string, error) {
+	var statfs unix.Statfs_t
+	if err := unix.Statfs(path, &statfs); err != nil {
+		return nil, err
+	}
+
+	// Flag set: https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
+	// statfs(2) reports these as ST_* bits, numerically equal to MS_* except for
+	// relatime (ST_RELATIME is 0x1000). A slice keeps the option order stable.
+	unprivilegedFlags := []struct{ mask uint64; name string }{
+		{unix.MS_RDONLY, "ro"},
+		{unix.MS_NODEV, "nodev"},
+		{unix.MS_NOEXEC, "noexec"},
+		{unix.MS_NOSUID, "nosuid"},
+		{unix.MS_NOATIME, "noatime"},
+		{0x1000, "relatime"}, // ST_RELATIME; MS_RELATIME (1<<21) never appears in f_flags
+		{unix.MS_NODIRATIME, "nodiratime"},
+	}
+	var flags []string
+	for _, f := range unprivilegedFlags {
+		if uint64(statfs.Flags)&f.mask == f.mask {
+			flags = append(flags, f.name)
+		}
+	}
+	return flags, nil
+}
+
 var (
 	mountPropagationMap = map[string]int{
 		"private":  mount.PRIVATE,
@@ -586,6 +619,19 @@ func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []c
 			opts = append(opts, mountPropagationReverseMap[pFlag])
 		}
 
+		// If we are using user namespaces, then we must make sure that we
+		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
+		// "mount" when we bind-mount. The reason for this is that at the point
+		// when runc sets up the root filesystem, it is already inside a user
+		// namespace, and thus cannot change any flags that are locked.
+		if daemon.configStore.RemappedRoot != "" {
+			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
+			if err != nil {
+				return err
+			}
+			opts = append(opts, unprivOpts...)
+		}
+
 		mt.Options = opts
 		s.Mounts = append(s.Mounts, mt)
 	}
-- 
2.16.1