Sync from SUSE:SLFO:Main runc revision 2133c931be9a58b516e6632c4b6eabdc

This commit is contained in:
Adrian Schröter 2024-09-06 15:31:00 +02:00
parent 863c656407
commit c92af6636d
10 changed files with 691 additions and 23 deletions

View File

@ -0,0 +1,44 @@
From 22eb87a32dc1c685425b685e96e8472b9ac1b5ca Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Fri, 14 Oct 2022 18:37:00 -0700
Subject: [PATCH 1/4] bsc1221050: libct/seccomp/patchbpf: rm duplicated code
(This is a cherry-pick of 2cd05e44b662fb79c46d5ebfd6c71e9ebc98d40c.)
In findLastSyscalls, we convert libseccomp.ArchNative to the real
libseccomp architecture, but archToNative already does that, so
this code is redundant.
Remove the redundant code, and move its comment to archToNative.
Fixes: 7a8d7162f
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/seccomp/patchbpf/enosys_linux.go | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index efe6dca58b21..c9c1d4ccb685 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -233,16 +233,6 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
- // Map native architecture to a real architecture value to avoid
- // doubling-up the lastSyscall mapping.
- if arch == libseccomp.ArchNative {
- nativeArch, err := libseccomp.GetNativeArch()
- if err != nil {
- return nil, fmt.Errorf("unable to get native architecture: %w", err)
- }
- arch = nativeArch
- }
-
// Figure out native architecture representation of the architecture.
nativeArch, err := archToNative(arch)
if err != nil {
--
2.46.0

View File

@ -0,0 +1,289 @@
From 558c5ecf487a40001ba854cfcbd5c94223167501 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 13 Mar 2024 13:40:16 +1100
Subject: [PATCH 2/4] bsc1221050: seccomp: patchbpf: rename nativeArch ->
linuxAuditArch
(This is a backport of 6167f5ffc3e3fd53e6a41a2effa592a4873ad046.)
Calling the Linux AUDIT_* architecture constants "native" leads to
confusing code when we are getting the actual native architecture of the
running system.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/seccomp/patchbpf/enosys_linux.go | 81 ++++++++++---------
.../seccomp/patchbpf/enosys_linux_test.go | 16 ++--
2 files changed, 49 insertions(+), 48 deletions(-)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index c9c1d4ccb685..1b67fda85c64 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -164,11 +164,11 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
return program, nil
}
-type nativeArch uint32
+type linuxAuditArch uint32
-const invalidArch nativeArch = 0
+const invalidArch linuxAuditArch = 0
-func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
+func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
switch arch {
case libseccomp.ArchNative:
// Convert to actual native architecture.
@@ -176,48 +176,48 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
if err != nil {
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
}
- return archToNative(arch)
+ return scmpArchToAuditArch(arch)
case libseccomp.ArchX86:
- return nativeArch(C.C_AUDIT_ARCH_I386), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
case libseccomp.ArchAMD64, libseccomp.ArchX32:
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
// 30th bit of the syscall number set to indicate that it's not a
// normal x86_64 syscall.
- return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
case libseccomp.ArchARM:
- return nativeArch(C.C_AUDIT_ARCH_ARM), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
case libseccomp.ArchARM64:
- return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
case libseccomp.ArchMIPS:
- return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
case libseccomp.ArchMIPS64:
- return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
case libseccomp.ArchMIPS64N32:
- return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
case libseccomp.ArchMIPSEL:
- return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
case libseccomp.ArchMIPSEL64:
- return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
case libseccomp.ArchMIPSEL64N32:
- return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
case libseccomp.ArchPPC:
- return nativeArch(C.C_AUDIT_ARCH_PPC), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
case libseccomp.ArchPPC64:
- return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
case libseccomp.ArchPPC64LE:
- return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
case libseccomp.ArchS390:
- return nativeArch(C.C_AUDIT_ARCH_S390), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
- return nativeArch(C.C_AUDIT_ARCH_S390X), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
case libseccomp.ArchRISCV64:
- return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
default:
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
}
-type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
+type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
@@ -234,17 +234,17 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
}
// Figure out native architecture representation of the architecture.
- nativeArch, err := archToNative(arch)
+ auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
}
- if _, ok := lastSyscalls[nativeArch]; !ok {
- lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
+ if _, ok := lastSyscalls[auditArch]; !ok {
+ lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
}
- if _, ok := lastSyscalls[nativeArch][arch]; ok {
+ if _, ok := lastSyscalls[auditArch][arch]; ok {
// Because of ArchNative we may hit the same entry multiple times.
- // Just skip it if we've seen this (nativeArch, ScmpArch)
+ // Just skip it if we've seen this (linuxAuditArch, ScmpArch)
// combination before.
continue
}
@@ -262,10 +262,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
}
}
if largestSyscall != 0 {
- lastSyscalls[nativeArch][arch] = largestSyscall
+ logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
+ lastSyscalls[auditArch][arch] = largestSyscall
} else {
- logrus.Warnf("could not find any syscalls for arch %s", ociArch)
- delete(lastSyscalls[nativeArch], arch)
+ logrus.Warnf("could not find any syscalls for arch %v", arch)
+ delete(lastSyscalls[auditArch], arch)
}
}
return lastSyscalls, nil
@@ -283,10 +284,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
- // A jump-table for each nativeArch used to generate the initial
+ // A jump-table for each linuxAuditArch used to generate the initial
// conditional jumps -- measured from the *END* of the program so they
// remain valid after prepending to the tail.
- archJumpTable := map[nativeArch]uint32{}
+ archJumpTable := map[linuxAuditArch]uint32{}
// Generate our own -ENOSYS rules for each architecture. They have to be
// generated in reverse (prepended to the tail of the program) because the
@@ -299,7 +300,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
}
// Generate the syscall -ENOSYS rules.
- for nativeArch, maxSyscalls := range lastSyscalls {
+ for auditArch, maxSyscalls := range lastSyscalls {
// The number of instructions from the tail of this section which need
// to be jumped in order to reach the -ENOSYS return. If the section
// does not jump, it will fall through to the actual filter.
@@ -380,7 +381,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
- if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
+ if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
@@ -409,8 +410,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
section = append(section, sectionTail...)
case 2:
// x32 and x86_64 are a unique case, we can't handle any others.
- if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
- return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
+ if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
+ return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
}
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
@@ -487,7 +488,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
programTail = append(section, programTail...)
// Update jump table.
- archJumpTable[nativeArch] = uint32(len(programTail))
+ archJumpTable[auditArch] = uint32(len(programTail))
}
// Add a dummy "jump to filter" for any architecture we might miss below.
@@ -507,9 +508,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// architectures based on how large the jumps are going to be, or
// re-sort the candidate architectures each time to make sure that we
// pick the largest jump which is going to be smaller than 255.
- for nativeArch := range lastSyscalls {
+ for auditArch := range lastSyscalls {
// We jump forwards but the jump table is calculated from the *END*.
- jump := uint32(len(programTail)) - archJumpTable[nativeArch]
+ jump := uint32(len(programTail)) - archJumpTable[auditArch]
// Same routine as above -- this is a basic jeq check, complicated
// slightly if it turns out that we need to do a long jump.
@@ -518,7 +519,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jeq [arch],[jump]
bpf.JumpIf{
Cond: bpf.JumpEqual,
- Val: uint32(nativeArch),
+ Val: uint32(auditArch),
SkipTrue: uint8(jump),
},
}, programTail...)
@@ -527,7 +528,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jne [arch],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
- Val: uint32(nativeArch),
+ Val: uint32(auditArch),
SkipTrue: 1,
},
// ja [jump]
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
index e2d363a43bd3..bdfeff68adb3 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
@@ -23,7 +23,7 @@ type seccompData struct {
}
// mockSyscallPayload creates a fake seccomp_data struct with the given data.
-func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte {
+func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte {
var buf bytes.Buffer
data := seccompData{
@@ -150,8 +150,8 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
for _, arch := range testArches {
type syscallTest struct {
- syscall string
sysno libseccomp.ScmpSyscall
+ syscall string
expected uint32
}
@@ -160,7 +160,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
}
- nativeArch, err := archToNative(scmpArch)
+ auditArch, err := scmpArchToAuditArch(scmpArch)
if err != nil {
t.Fatalf("unknown audit architecture %q: %v", arch, err)
}
@@ -179,9 +179,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err)
}
syscallTests = append(syscallTests, syscallTest{
- syscall,
- sysno,
- expected,
+ sysno: sysno,
+ syscall: syscall,
+ expected: expected,
})
}
@@ -233,7 +233,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
test.expected = retFallthrough
}
- payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5)
+ payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5)
// NOTE: golang.org/x/net/bpf returns int here rather
// than uint32.
rawRet, err := filter.Run(payload)
@@ -247,7 +247,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Logf(" [%4.1d] %s", idx, insn)
}
t.Logf("payload: %#v", payload)
- t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected)
+ t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected)
}
}
}
--
2.46.0

View File

@ -0,0 +1,162 @@
From a1e9b2e4015a6b548a0d3e004bf27dd2e3f2cf35 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 13 Mar 2024 16:12:51 +1100
Subject: [PATCH 3/4] bsc1221050: seccomp: patchbpf: always include native
architecture in stub
(This is a backport of 376417ba7646f05ddb1efa8fe30e2a3b53cf673b.)
It turns out that on ppc64le (at least), Docker doesn't include any
architectures in the list of allowed architectures. libseccomp
interprets this as "just include the default architecture" but patchbpf
would return a no-op ENOSYS stub, which would lead to the exact issues
that commit 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all
filters") fixed for other architectures.
So, just always include the running architecture in the list. There's
no real downside.
SUSE-Bugs: 1192051 1221050
Ref: https://bugzilla.suse.com/show_bug.cgi?id=1192051#c6
Reported-by: Fabian Vogt <fvogt@suse.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/seccomp/patchbpf/enosys_linux.go | 22 +++++++--
.../seccomp/patchbpf/enosys_linux_test.go | 47 +++++++++++++++++--
2 files changed, 61 insertions(+), 8 deletions(-)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index 1b67fda85c64..d459ba8792ca 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -224,16 +224,30 @@ type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSy
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
- lastSyscalls := make(lastSyscallMap)
- // Only loop over architectures which are present in the filter. Any other
- // architectures will get the libseccomp bad architecture action anyway.
+ scmpArchs := make(map[libseccomp.ScmpArch]struct{})
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
+ scmpArchs[arch] = struct{}{}
+ }
+ // On architectures like ppc64le, Docker inexplicably doesn't include the
+ // native architecture in the architecture list which results in no
+ // architectures being present in the list at all (rendering the ENOSYS
+ // stub a no-op). So, always include the native architecture.
+ if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
+ return nil, fmt.Errorf("unable to get native arch: %w", err)
+ } else if _, ok := scmpArchs[nativeScmpArch]; !ok {
+ logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
+ scmpArchs[nativeScmpArch] = struct{}{}
+ }
+ logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)
- // Figure out native architecture representation of the architecture.
+ // Only loop over architectures which are present in the filter. Any other
+ // architectures will get the libseccomp bad architecture action anyway.
+ lastSyscalls := make(lastSyscallMap)
+ for arch := range scmpArchs {
auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
index bdfeff68adb3..3d442e1daa66 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
@@ -12,6 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
+ "github.com/sirupsen/logrus"
"golang.org/x/net/bpf"
)
@@ -105,6 +106,18 @@ var testArches = []string{
"ppc64le",
"s390",
"s390x",
+ // Dummy value to indicate a configuration with no architecture specified.
+ "native",
+}
+
+var nativeArch string
+
+func init() {
+ scmpNativeArch, err := libseccomp.GetNativeArch()
+ if err != nil {
+ logrus.Panicf("get native arch: %v", err)
+ }
+ nativeArch = scmpNativeArch.String()
}
func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) {
@@ -155,6 +168,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
expected uint32
}
+ if arch == "native" {
+ arch = nativeArch
+ }
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
@@ -228,8 +244,15 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
// Test syscalls in the explicit list.
for _, test := range syscallTests {
- // Override the expected value in the two special cases.
- if !archSet[arch] || isAllowAction(defaultAction) {
+ // Override the expected value in the two special cases:
+ // 1. If the default action is allow, the filter won't have
+ // the stub prepended so we expect a fallthrough.
+ // 2. If the executing architecture is not in the architecture
+ // set, then the architecture is not handled by the stub --
+ // *except* in the case of the native architecture (which
+ // is always included in the stub).
+ if isAllowAction(defaultAction) ||
+ (!archSet[arch] && arch != nativeArch) {
test.expected = retFallthrough
}
@@ -263,7 +286,14 @@ var testActions = map[string]configs.Action{
func TestEnosysStub_SingleArch(t *testing.T) {
for _, arch := range testArches {
- arches := []string{arch}
+ var arches []string
+ // "native" indicates a blank architecture field for seccomp, to test
+ // the case where the running architecture was not included in the
+ // architecture list. Docker doesn't always set the architecture for
+ // some reason (namely for ppc64le).
+ if arch != "native" {
+ arches = append(arches, arch)
+ }
t.Run("arch="+arch, func(t *testing.T) {
for name, action := range testActions {
t.Run("action="+name, func(t *testing.T) {
@@ -277,7 +307,16 @@ func TestEnosysStub_SingleArch(t *testing.T) {
func TestEnosysStub_MultiArch(t *testing.T) {
for end := 0; end < len(testArches); end++ {
for start := 0; start < end; start++ {
- arches := testArches[start:end]
+ var arches []string
+ for _, arch := range testArches[start:end] {
+ // "native" indicates a blank architecture field for seccomp, to test
+ // the case where the running architecture was not included in the
+ // architecture list. Docker doesn't always set the architecture for
+ // some reason (namely for ppc64le).
+ if arch != "native" {
+ arches = append(arches, arch)
+ }
+ }
if len(arches) <= 1 {
continue
}
--
2.46.0

View File

@ -0,0 +1,136 @@
From 0f1f8e303cf1919c33952f4938e5637d8f77f907 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 7 Jul 2023 22:45:44 +1000
Subject: [PATCH 4/4] bsc1214960: nsenter: cloned_binary: remove bindfd logic
entirely
(This is a cherry-pick of b999376fb237195265081a8b8ba3fd3bd6ef8c2c.)
While the ro-bind-mount trick did eliminate the memory overhead of
copying the runc binary for each "runc init" invocation, on machines
with very significant container churn, creating a temporary mount
namespace on every container invocation can trigger severe lock
contention on namespace_sem that makes containers fail to spawn.
The only reason we added bindfd in commit 16612d74de5f ("nsenter:
cloned_binary: try to ro-bind /proc/self/exe before copying") was due to
a Kubernetes e2e test failure where they had a ridiculously small memory
limit. It seems incredibly unlikely that real workloads are running
without 10MB to spare for the very short time that runc is interacting
with the container.
In addition, since the original cloned_binary implementation, cgroupv2
is now almost universally used on modern systems. Unlike cgroupv1, the
cgroupv2 memcg implementation does not migrate memory usage when
processes change cgroups (even cgroupv1 only did this if you had
memory.move_charge_at_immigrate enabled). In addition, because we do the
/proc/self/exe clone before synchronising the bootstrap data read, we
are guaranteed to do the clone before "runc init" is moved into the
container cgroup -- meaning that the memory used by the /proc/self/exe
clone is charged against the root cgroup, and thus container workloads
should not be affected at all with memfd cloning.
The long-term fix for this problem is to block the /proc/self/exe
re-opening attack entirely in-kernel, which is something I'm working
on[1]. Though it should also be noted that because the memfd is
completely separate to the host binary, even attacks like Dirty COW
against the runc binary can be defended against with the memfd approach.
Of course, once we have in-kernel protection against the /proc/self/exe
re-opening attack, we won't have that protection anymore...
[1]: https://lwn.net/Articles/934460/
SUSE-Bugs: https://bugzilla.suse.com/show_bug.cgi?id=1214960
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/nsenter/cloned_binary.c | 67 ----------------------------
1 file changed, 67 deletions(-)
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
index d1b2d4c546f1..565748b13a4e 100644
--- a/libcontainer/nsenter/cloned_binary.c
+++ b/libcontainer/nsenter/cloned_binary.c
@@ -396,61 +396,6 @@ static int seal_execfd(int *fd, int fdtype)
return -1;
}
-static int try_bindfd(void)
-{
- int fd, ret = -1;
- char template[PATH_MAX] = { 0 };
- char *prefix = getenv("_LIBCONTAINER_STATEDIR");
-
- if (!prefix || *prefix != '/')
- prefix = "/tmp";
- if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
- return ret;
-
- /*
- * We need somewhere to mount it, mounting anything over /proc/self is a
- * BAD idea on the host -- even if we do it temporarily.
- */
- fd = mkstemp(template);
- if (fd < 0)
- return ret;
- close(fd);
-
- /*
- * For obvious reasons this won't work in rootless mode because we haven't
- * created a userns+mntns -- but getting that to work will be a bit
- * complicated and it's only worth doing if someone actually needs it.
- */
- ret = -EPERM;
- if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
- goto out;
- if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
- goto out_umount;
-
- /* Get read-only handle that we're sure can't be made read-write. */
- ret = open(template, O_PATH | O_CLOEXEC);
-
-out_umount:
- /*
- * Make sure the MNT_DETACH works, otherwise we could get remounted
- * read-write and that would be quite bad (the fd would be made read-write
- * too, invalidating the protection).
- */
- if (umount2(template, MNT_DETACH) < 0) {
- if (ret >= 0)
- close(ret);
- ret = -ENOTRECOVERABLE;
- }
-
-out:
- /*
- * We don't care about unlink errors, the worst that happens is that
- * there's an empty file left around in STATEDIR.
- */
- unlink(template);
- return ret;
-}
-
static ssize_t fd_to_fd(int outfd, int infd)
{
ssize_t total = 0;
@@ -485,18 +430,6 @@ static int clone_binary(void)
size_t sent = 0;
int fdtype = EFD_NONE;
- /*
- * Before we resort to copying, let's try creating an ro-binfd in one shot
- * by getting a handle for a read-only bind-mount of the execfd.
- */
- execfd = try_bindfd();
- if (execfd >= 0)
- return execfd;
-
- /*
- * Dammit, that didn't work -- time to copy the binary to a safe place we
- * can seal the contents.
- */
execfd = make_execfd(&fdtype);
if (execfd < 0 || fdtype == EFD_NONE)
return -ENOTRECOVERABLE;
--
2.46.0

BIN
runc-1.1.12.tar.xz (Stored with Git LFS)

Binary file not shown.

View File

@ -1,17 +0,0 @@
-----BEGIN PGP SIGNATURE-----
iQJEBAABCAAuFiEEXzbGxhtUYBJKdfWmnhiqJn3bjbQFAmWvvCcQHGFzYXJhaUBz
dXNlLmNvbQAKCRCeGKomfduNtG2oD/9yLwYdfbx4GU31kCuvTS3odH8XyplL4QLl
TszoLO/50z/Y9r0QBNuLsDDvAWtsJAYTsRIwEwDgUuziHnbkbHCnE2C+6P7OWUKp
7VS1mqWzWeVibt0hYBWcooJb8inA/ctwfppZlH8EnTdoyqp0bAuQKtj2muA+LTvN
n/19qZ0/zAvErya5ugZCfnpJngOM0W//F5OSE/DKI3ct6o3AilxlzlhZuwkiYQud
nwS5j4CvQp7GkJeuwDluUHGmsT8AW6P3McptS/BcT4wUKWhxcntJG1cdiZOFTW84
3CLdwMPGQR0SVK5yPMbKogRtglODEW82Ytp4S8BB9sG5PS5rBsvnApSQxFluRMQT
oaQsEKwPS+VSUwf44QR42iF3fB8dxmmmcautr5yaUiSx4DdFGj9jjrbMa9YCk2da
J/5ExwJv5nP5R+uwOiH3ziZuFuuH1afbGLrT2ouv61/SMGiYiLEAyiegF94Zg2nu
5RvMUz33LpEckLrlNN5u9q+/jbfJmZAUtdVafKQQTBRFKPCyHjOroKM11PzoHX6l
3dsyEPbEfowZ+uM2z9wCfub529fNF8t9k9sUAIQsma5p7+l7xJMbOua2kd1kGiQU
ec19+KD6ka4NHyDRwxe0iM6/AuFlKKUUTVGZjg2bD+ap0qgDjZ3R5lTmI1pJ8Win
wfoEKZCm+A==
=Sl8m
-----END PGP SIGNATURE-----

BIN
runc-1.1.14.tar.xz (Stored with Git LFS) Normal file

Binary file not shown.

7
runc-1.1.14.tar.xz.asc Normal file
View File

@ -0,0 +1,7 @@
-----BEGIN PGP SIGNATURE-----
iHUEABYKAB0WIQS2TklVsp+j1GPyqQYol/rSt+lEbwUCZtZk+AAKCRAol/rSt+lE
b0TGAQC6tc59nCVnmViX22aKK6fuV++saYQgQKKhIkqiyBs97wD/a49dqcnjgHIf
OKO+WjeCGwFIwmHIsAeD3bdCb+XTqQI=
=E21y
-----END PGP SIGNATURE-----

View File

@ -1,3 +1,43 @@
-------------------------------------------------------------------
Tue Sep 3 01:57:20 UTC 2024 - Aleksa Sarai <asarai@suse.com>
[ This was only ever released for SLES and Leap. ]
- Update to runc v1.1.14. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.1.14>.
Includes the patch for CVE-2024-45310. bsc#1230092
- Rebase patches:
* 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch
* 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch
* 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch
* 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch
-------------------------------------------------------------------
Mon Jul 22 13:08:06 UTC 2024 - Aleksa Sarai <asarai@suse.com>
[ This was only ever released for SLES and Leap. ]
- Update to runc v1.1.13. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.1.13>.
- Rebase patches:
* 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch
* 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch
* 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch
- Backport <https://github.com/opencontainers/runc/pull/3931> to fix a
performance issue when running lots of containers, caused by systemd getting
too many mount notifications. bsc#1214960
+ 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch
-------------------------------------------------------------------
Thu Mar 21 03:46:48 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Add upstream patch <https://github.com/opencontainers/runc/pull/4219> to
properly fix -ENOSYS stub on ppc64le. bsc#1192051 bsc#1221050
+ 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch
+ 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch
+ 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch
-------------------------------------------------------------------
Wed Jan 31 00:00:33 UTC 2024 - Aleksa Sarai <asarai@suse.com>

View File

@ -18,13 +18,13 @@
# MANUAL: Make sure you update this each time you update runc.
%define git_version 51d5e94601ceffbbd85688df1c928ecccbfa4685
%define git_short 51d5e94601ce
%define git_version 2c9f5602f0ba3d9da1c2596322dfc4e156844890
%define git_short 2c9f5602f0ba
%define project github.com/opencontainers/runc
Name: runc
Version: 1.1.12
Version: 1.1.14
Release: 0
Summary: Tool for spawning and running OCI containers
License: Apache-2.0
@ -33,6 +33,12 @@ URL: https://github.com/opencontainers/runc
Source0: https://github.com/opencontainers/runc/releases/download/v%{version}/runc.tar.xz#/runc-%{version}.tar.xz
Source1: https://github.com/opencontainers/runc/releases/download/v%{version}/runc.tar.xz.asc#/runc-%{version}.tar.xz.asc
Source2: runc.keyring
# SUSE-FIX-UPSTREAM: Backport of <https://github.com/opencontainers/runc/pull/4219>. bsc#1221050
Patch10: 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch
Patch11: 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch
Patch12: 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch
# SUSE-FIX-UPSTREAM: Partial backport of <https://github.com/opencontainers/runc/pull/3931>. bsc#1214960
Patch20: 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch
BuildRequires: diffutils
BuildRequires: fdupes
BuildRequires: go
@ -64,6 +70,7 @@ and has grown to become a separate project entirely.
%prep
%setup -q -n %{name}-%{version}
%autopatch -p1
%build
# build runc