From 9729fc3127c5e27d59a0126f52e91c61037173ae Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 29 Nov 2016 01:25:06 +1100 Subject: [PATCH] libcontainer: init: only pass stateDirFd when creating a container If we pass a file descriptor to the host filesystem while joining a container, there is a race condition where a process inside the container can ptrace(2) the joining process and stop it from closing its file descriptor to the stateDirFd. Then the process can access the *host* filesystem from that file descriptor. To fix this, don't open or pass the stateDirFd to the init process unless we're creating a new container. A proper fix for this would be to remove the need for even passing around directory file descriptors (which are quite dangerous in the context of mount namespaces). SUSE: This is an additional patch which was the original "fix" for the CVE. It was agreed that the patch was too complicated to be used as a fix for a security issue -- but it will be upstreamed once the embargo passes. Fixes: CVE-2016-9962 Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 30 ++++++++++++++++++------------ libcontainer/factory_linux.go | 37 ++++++++++++++++++++++--------------- libcontainer/init_linux.go | 3 +-- libcontainer/process_linux.go | 2 -- libcontainer/setns_init_linux.go | 7 +------ 5 files changed, 42 insertions(+), 37 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 82c6d8e..369b5d5 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -297,21 +297,29 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") } - rootDir, err := os.Open(c.root) - if err != nil { - return nil, err - } - cmd, err := c.commandTemplate(p, childPipe, rootDir) + cmd, err := c.commandTemplate(p, childPipe) if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } if !doInit { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) + return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } + + // We only set up rootDir if we're not doing a `runc exec`. The reason for + // this is to avoid cases where a racing, unprivileged process inside the + // container can get access to the statedir file descriptor (which would + // allow for container rootfs escape). + rootDir, err := os.Open(c.root) + if err != nil { + return nil, err + } + cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) } -func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { +func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) cmd.Stdin = p.Stdin cmd.Stdout = p.Stdout @@ -320,10 +328,9 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } - cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) + cmd.ExtraFiles = append(p.ExtraFiles, childPipe) cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), - fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // even with the parent still running. @@ -360,7 +367,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c }, nil } -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { @@ -381,7 +388,6 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, config: c.newInitConfig(p), process: p, bootstrapData: data, - rootDir: rootDir, }, nil } diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index cc336c147fb9..e3478d5b50e3 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -222,27 +222,34 @@ func (l *LinuxFactory) Type() string { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { - var pipefd, rootfd int - for _, pair := range []struct { - k string - v *int - }{ - {"_LIBCONTAINER_INITPIPE", &pipefd}, - {"_LIBCONTAINER_STATEDIR", &rootfd}, - } { - - s := os.Getenv(pair.k) + var ( + pipefd, rootfd int + envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") + envStateDir = os.Getenv("_LIBCONTAINER_STATEDIR") + ) - i, err := strconv.Atoi(s) - if err != nil { - return fmt.Errorf("unable to convert %s=%s to int", pair.k, s) - } - *pair.v = i + // Get the INITPIPE. + pipefd, err = strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) } + var ( pipe = os.NewFile(uintptr(pipefd), "pipe") it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) ) + + // Only get the STATEDIR if we're an init process. It's a bit late to do + // anything about this if we've already brought in an fd (a racing process + // could've opened the fd by now because we're in the PID namespace). + rootfd = -1 + if it == initStandard { + rootfd, err = strconv.Atoi(envStateDir) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err) + } + } + // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 4043d51c0bd0..b1e6762ecdf3 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -77,8 +77,7 @@ func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) switch t { case initSetns: return &linuxSetnsInit{ - config: config, - stateDirFD: stateDirFD, + config: config, }, nil case initStandard: return &linuxStandardInit{ diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 5b81317fd711..7b90c6b8faae 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -51,7 +51,6 @@ type setnsProcess struct { fds []string process *Process bootstrapData io.Reader - rootDir *os.File } func (p *setnsProcess) startTime() (string, error) { @@ -70,7 +69,6 @@ func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() - p.rootDir.Close() if err != nil { return newSystemErrorWithCause(err, "starting setns process") } diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 7f5f182402cc..2a8f34528179 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -5,7 +5,6 @@ package libcontainer import ( "fmt" "os" - "syscall" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/keys" @@ -17,8 +16,7 @@ import ( // linuxSetnsInit performs the container's initialization for running a new process // inside an existing container. type linuxSetnsInit struct { - config *initConfig - stateDirFD int + config *initConfig } func (l *linuxSetnsInit) getSessionRingName() string { @@ -51,8 +49,5 @@ func (l *linuxSetnsInit) Init() error { if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { return err } - // close the statedir fd before exec because the kernel resets dumpable in the wrong order - // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 - syscall.Close(l.stateDirFD) return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } -- 2.11.0