From 9729fc3127c5e27d59a0126f52e91c61037173ae Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.de>
Date: Tue, 29 Nov 2016 01:25:06 +1100
Subject: [PATCH] libcontainer: init: only pass stateDirFd when creating a
 container

If we pass a file descriptor to the host filesystem while joining a
container, there is a race condition where a process inside the
container can ptrace(2) the joining process and stop it from closing its
file descriptor to the stateDirFd. Then the process can access the *host*
filesystem from that file descriptor.

To fix this, don't open or pass the stateDirFd to the init process
unless we're creating a new container. A proper fix for this would be to
remove the need for even passing around directory file descriptors
(which are quite dangerous in the context of mount namespaces).

SUSE: This is an additional patch which was the original "fix" for the
      CVE. It was agreed that the patch was too complicated to be used
      as a fix for a security issue -- but it will be upstreamed once
      the embargo passes.

Fixes: CVE-2016-9962
Signed-off-by: Aleksa Sarai <asarai@suse.de>
---
 libcontainer/container_linux.go  | 30 ++++++++++++++++++------------
 libcontainer/factory_linux.go    | 37 ++++++++++++++++++++++---------------
 libcontainer/init_linux.go       |  3 +--
 libcontainer/process_linux.go    |  2 --
 libcontainer/setns_init_linux.go |  7 +------
 5 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 82c6d8e..369b5d5 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -297,21 +297,29 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces
 	if err != nil {
 		return nil, newSystemErrorWithCause(err, "creating new init pipe")
 	}
-	rootDir, err := os.Open(c.root)
-	if err != nil {
-		return nil, err
-	}
-	cmd, err := c.commandTemplate(p, childPipe, rootDir)
+	cmd, err := c.commandTemplate(p, childPipe)
 	if err != nil {
 		return nil, newSystemErrorWithCause(err, "creating new command template")
 	}
 	if !doInit {
-		return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
+		return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
 	}
+
+	// We only set up rootDir if we're not doing a `runc exec`. The reason for
+	// this is to avoid cases where a racing, unprivileged process inside the
+	// container can get access to the statedir file descriptor (which would
+	// allow for container rootfs escape).
+	rootDir, err := os.Open(c.root)
+	if err != nil {
+		return nil, err
+	}
+	cmd.ExtraFiles = append(cmd.ExtraFiles, rootDir)
+	cmd.Env = append(cmd.Env,
+		fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
 	return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
 }
 
-func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
+func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
 	cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
 	cmd.Stdin = p.Stdin
 	cmd.Stdout = p.Stdout
@@ -320,10 +328,9 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File
 	if cmd.SysProcAttr == nil {
 		cmd.SysProcAttr = &syscall.SysProcAttr{}
 	}
-	cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
+	cmd.ExtraFiles = append(p.ExtraFiles, childPipe)
 	cmd.Env = append(cmd.Env,
-		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
-		fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
+		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
 	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
 	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
 	// even with the parent still running.
@@ -360,7 +367,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
 	}, nil
 }
 
-func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
+func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
 	state, err := c.currentState()
 	if err != nil {
@@ -381,7 +388,6 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
 		config:        c.newInitConfig(p),
 		process:       p,
 		bootstrapData: data,
-		rootDir:       rootDir,
 	}, nil
 }
 
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
index cc336c147fb9..e3478d5b50e3 100644
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@@ -222,27 +222,34 @@ func (l *LinuxFactory) Type() string {
 // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
 // This is a low level implementation detail of the reexec and should not be consumed externally
 func (l *LinuxFactory) StartInitialization() (err error) {
-	var pipefd, rootfd int
-	for _, pair := range []struct {
-		k string
-		v *int
-	}{
-		{"_LIBCONTAINER_INITPIPE", &pipefd},
-		{"_LIBCONTAINER_STATEDIR", &rootfd},
-	} {
-
-		s := os.Getenv(pair.k)
+	var (
+		pipefd, rootfd int
+		envInitPipe    = os.Getenv("_LIBCONTAINER_INITPIPE")
+		envStateDir    = os.Getenv("_LIBCONTAINER_STATEDIR")
+	)
 
-		i, err := strconv.Atoi(s)
-		if err != nil {
-			return fmt.Errorf("unable to convert %s=%s to int", pair.k, s)
-		}
-		*pair.v = i
+	// Get the INITPIPE.
+	pipefd, err = strconv.Atoi(envInitPipe)
+	if err != nil {
+		return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
 	}
+
 	var (
 		pipe = os.NewFile(uintptr(pipefd), "pipe")
 		it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
 	)
+
+	// Only get the STATEDIR if we're an init process. It's a bit late to do
+	// anything about this if we've already brought in an fd (a racing process
+	// could've opened the fd by now because we're in the PID namespace).
+	rootfd = -1
+	if it == initStandard {
+		rootfd, err = strconv.Atoi(envStateDir)
+		if err != nil {
+			return fmt.Errorf("unable to convert _LIBCONTAINER_STATEDIR=%s to int: %s", envStateDir, err)
+		}
+	}
+
 	// clear the current process's environment to clean any libcontainer
 	// specific env vars.
 	os.Clearenv()
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 4043d51c0bd0..b1e6762ecdf3 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -77,8 +77,7 @@ func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error)
 	switch t {
 	case initSetns:
 		return &linuxSetnsInit{
-			config:     config,
-			stateDirFD: stateDirFD,
+			config: config,
 		}, nil
 	case initStandard:
 		return &linuxStandardInit{
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 5b81317fd711..7b90c6b8faae 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -51,7 +51,6 @@ type setnsProcess struct {
 	fds           []string
 	process       *Process
 	bootstrapData io.Reader
-	rootDir       *os.File
 }
 
 func (p *setnsProcess) startTime() (string, error) {
@@ -70,7 +69,6 @@ func (p *setnsProcess) start() (err error) {
 	defer p.parentPipe.Close()
 	err = p.cmd.Start()
 	p.childPipe.Close()
-	p.rootDir.Close()
 	if err != nil {
 		return newSystemErrorWithCause(err, "starting setns process")
 	}
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 7f5f182402cc..2a8f34528179 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -5,7 +5,6 @@ package libcontainer
 import (
 	"fmt"
 	"os"
-	"syscall"
 
 	"github.com/opencontainers/runc/libcontainer/apparmor"
 	"github.com/opencontainers/runc/libcontainer/keys"
@@ -17,8 +16,7 @@ import (
 // linuxSetnsInit performs the container's initialization for running a new process
 // inside an existing container.
 type linuxSetnsInit struct {
-	config     *initConfig
-	stateDirFD int
+	config *initConfig
 }
 
 func (l *linuxSetnsInit) getSessionRingName() string {
@@ -51,8 +49,5 @@ func (l *linuxSetnsInit) Init() error {
 	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
 		return err
 	}
-	// close the statedir fd before exec because the kernel resets dumpable in the wrong order
-	// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
-	syscall.Close(l.stateDirFD)
 	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
 }
-- 
2.11.0