Compare commits

4 Commits
main ... 1.1

11 changed files with 177 additions and 672 deletions

View File

@@ -1,44 +0,0 @@
From 22eb87a32dc1c685425b685e96e8472b9ac1b5ca Mon Sep 17 00:00:00 2001
From: Kir Kolyshkin <kolyshkin@gmail.com>
Date: Fri, 14 Oct 2022 18:37:00 -0700
Subject: [PATCH 1/4] bsc1221050: libct/seccomp/patchbpf: rm duplicated code
(This is a cherry-pick of 2cd05e44b662fb79c46d5ebfd6c71e9ebc98d40c.)
In findLastSyscalls, we convert libseccomp.ArchNative to the real
libseccomp architecture, but archToNative already does that, so
this code is redundant.
Remove the redundant code, and move its comment to archToNative.
Fixes: 7a8d7162f
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/seccomp/patchbpf/enosys_linux.go | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index efe6dca58b21..c9c1d4ccb685 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -233,16 +233,6 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
- // Map native architecture to a real architecture value to avoid
- // doubling-up the lastSyscall mapping.
- if arch == libseccomp.ArchNative {
- nativeArch, err := libseccomp.GetNativeArch()
- if err != nil {
- return nil, fmt.Errorf("unable to get native architecture: %w", err)
- }
- arch = nativeArch
- }
-
// Figure out native architecture representation of the architecture.
nativeArch, err := archToNative(arch)
if err != nil {
--
2.46.0

View File

@@ -1,289 +0,0 @@
From 558c5ecf487a40001ba854cfcbd5c94223167501 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 13 Mar 2024 13:40:16 +1100
Subject: [PATCH 2/4] bsc1221050: seccomp: patchbpf: rename nativeArch ->
linuxAuditArch
(This is a backport of 6167f5ffc3e3fd53e6a41a2effa592a4873ad046.)
Calling the Linux AUDIT_* architecture constants "native" leads to
confusing code when we are getting the actual native architecture of the
running system.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/seccomp/patchbpf/enosys_linux.go | 81 ++++++++++---------
.../seccomp/patchbpf/enosys_linux_test.go | 16 ++--
2 files changed, 49 insertions(+), 48 deletions(-)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index c9c1d4ccb685..1b67fda85c64 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -164,11 +164,11 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
return program, nil
}
-type nativeArch uint32
+type linuxAuditArch uint32
-const invalidArch nativeArch = 0
+const invalidArch linuxAuditArch = 0
-func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
+func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
switch arch {
case libseccomp.ArchNative:
// Convert to actual native architecture.
@@ -176,48 +176,48 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
if err != nil {
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
}
- return archToNative(arch)
+ return scmpArchToAuditArch(arch)
case libseccomp.ArchX86:
- return nativeArch(C.C_AUDIT_ARCH_I386), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
case libseccomp.ArchAMD64, libseccomp.ArchX32:
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
// 30th bit of the syscall number set to indicate that it's not a
// normal x86_64 syscall.
- return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
case libseccomp.ArchARM:
- return nativeArch(C.C_AUDIT_ARCH_ARM), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
case libseccomp.ArchARM64:
- return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
case libseccomp.ArchMIPS:
- return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
case libseccomp.ArchMIPS64:
- return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
case libseccomp.ArchMIPS64N32:
- return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
case libseccomp.ArchMIPSEL:
- return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
case libseccomp.ArchMIPSEL64:
- return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
case libseccomp.ArchMIPSEL64N32:
- return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
case libseccomp.ArchPPC:
- return nativeArch(C.C_AUDIT_ARCH_PPC), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
case libseccomp.ArchPPC64:
- return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
case libseccomp.ArchPPC64LE:
- return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
case libseccomp.ArchS390:
- return nativeArch(C.C_AUDIT_ARCH_S390), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
- return nativeArch(C.C_AUDIT_ARCH_S390X), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
case libseccomp.ArchRISCV64:
- return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
+ return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
default:
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
}
-type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
+type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
@@ -234,17 +234,17 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
}
// Figure out native architecture representation of the architecture.
- nativeArch, err := archToNative(arch)
+ auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
}
- if _, ok := lastSyscalls[nativeArch]; !ok {
- lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
+ if _, ok := lastSyscalls[auditArch]; !ok {
+ lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
}
- if _, ok := lastSyscalls[nativeArch][arch]; ok {
+ if _, ok := lastSyscalls[auditArch][arch]; ok {
// Because of ArchNative we may hit the same entry multiple times.
- // Just skip it if we've seen this (nativeArch, ScmpArch)
+ // Just skip it if we've seen this (linuxAuditArch, ScmpArch)
// combination before.
continue
}
@@ -262,10 +262,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
}
}
if largestSyscall != 0 {
- lastSyscalls[nativeArch][arch] = largestSyscall
+ logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
+ lastSyscalls[auditArch][arch] = largestSyscall
} else {
- logrus.Warnf("could not find any syscalls for arch %s", ociArch)
- delete(lastSyscalls[nativeArch], arch)
+ logrus.Warnf("could not find any syscalls for arch %v", arch)
+ delete(lastSyscalls[auditArch], arch)
}
}
return lastSyscalls, nil
@@ -283,10 +284,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
- // A jump-table for each nativeArch used to generate the initial
+ // A jump-table for each linuxAuditArch used to generate the initial
// conditional jumps -- measured from the *END* of the program so they
// remain valid after prepending to the tail.
- archJumpTable := map[nativeArch]uint32{}
+ archJumpTable := map[linuxAuditArch]uint32{}
// Generate our own -ENOSYS rules for each architecture. They have to be
// generated in reverse (prepended to the tail of the program) because the
@@ -299,7 +300,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
}
// Generate the syscall -ENOSYS rules.
- for nativeArch, maxSyscalls := range lastSyscalls {
+ for auditArch, maxSyscalls := range lastSyscalls {
// The number of instructions from the tail of this section which need
// to be jumped in order to reach the -ENOSYS return. If the section
// does not jump, it will fall through to the actual filter.
@@ -380,7 +381,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
- if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
+ if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
@@ -409,8 +410,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
section = append(section, sectionTail...)
case 2:
// x32 and x86_64 are a unique case, we can't handle any others.
- if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
- return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
+ if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
+ return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
}
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
@@ -487,7 +488,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
programTail = append(section, programTail...)
// Update jump table.
- archJumpTable[nativeArch] = uint32(len(programTail))
+ archJumpTable[auditArch] = uint32(len(programTail))
}
// Add a dummy "jump to filter" for any architecture we might miss below.
@@ -507,9 +508,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// architectures based on how large the jumps are going to be, or
// re-sort the candidate architectures each time to make sure that we
// pick the largest jump which is going to be smaller than 255.
- for nativeArch := range lastSyscalls {
+ for auditArch := range lastSyscalls {
// We jump forwards but the jump table is calculated from the *END*.
- jump := uint32(len(programTail)) - archJumpTable[nativeArch]
+ jump := uint32(len(programTail)) - archJumpTable[auditArch]
// Same routine as above -- this is a basic jeq check, complicated
// slightly if it turns out that we need to do a long jump.
@@ -518,7 +519,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jeq [arch],[jump]
bpf.JumpIf{
Cond: bpf.JumpEqual,
- Val: uint32(nativeArch),
+ Val: uint32(auditArch),
SkipTrue: uint8(jump),
},
}, programTail...)
@@ -527,7 +528,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jne [arch],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
- Val: uint32(nativeArch),
+ Val: uint32(auditArch),
SkipTrue: 1,
},
// ja [jump]
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
index e2d363a43bd3..bdfeff68adb3 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
@@ -23,7 +23,7 @@ type seccompData struct {
}
// mockSyscallPayload creates a fake seccomp_data struct with the given data.
-func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte {
+func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte {
var buf bytes.Buffer
data := seccompData{
@@ -150,8 +150,8 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
for _, arch := range testArches {
type syscallTest struct {
- syscall string
sysno libseccomp.ScmpSyscall
+ syscall string
expected uint32
}
@@ -160,7 +160,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
}
- nativeArch, err := archToNative(scmpArch)
+ auditArch, err := scmpArchToAuditArch(scmpArch)
if err != nil {
t.Fatalf("unknown audit architecture %q: %v", arch, err)
}
@@ -179,9 +179,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err)
}
syscallTests = append(syscallTests, syscallTest{
- syscall,
- sysno,
- expected,
+ sysno: sysno,
+ syscall: syscall,
+ expected: expected,
})
}
@@ -233,7 +233,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
test.expected = retFallthrough
}
- payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5)
+ payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5)
// NOTE: golang.org/x/net/bpf returns int here rather
// than uint32.
rawRet, err := filter.Run(payload)
@@ -247,7 +247,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Logf(" [%4.1d] %s", idx, insn)
}
t.Logf("payload: %#v", payload)
- t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected)
+ t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected)
}
}
}
--
2.46.0

View File

@@ -1,162 +0,0 @@
From a1e9b2e4015a6b548a0d3e004bf27dd2e3f2cf35 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 13 Mar 2024 16:12:51 +1100
Subject: [PATCH 3/4] bsc1221050: seccomp: patchbpf: always include native
architecture in stub
(This is a backport of 376417ba7646f05ddb1efa8fe30e2a3b53cf673b.)
It turns out that on ppc64le (at least), Docker doesn't include any
architectures in the list of allowed architectures. libseccomp
interprets this as "just include the default architecture" but patchbpf
would return a no-op ENOSYS stub, which would lead to the exact issues
that commit 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all
filters") fixed for other architectures.
So, just always include the running architecture in the list. There's
no real downside.
SUSE-Bugs: 1192051 1221050
Ref: https://bugzilla.suse.com/show_bug.cgi?id=1192051#c6
Reported-by: Fabian Vogt <fvogt@suse.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/seccomp/patchbpf/enosys_linux.go | 22 +++++++--
.../seccomp/patchbpf/enosys_linux_test.go | 47 +++++++++++++++++--
2 files changed, 61 insertions(+), 8 deletions(-)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
index 1b67fda85c64..d459ba8792ca 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@@ -224,16 +224,30 @@ type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSy
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
- lastSyscalls := make(lastSyscallMap)
- // Only loop over architectures which are present in the filter. Any other
- // architectures will get the libseccomp bad architecture action anyway.
+ scmpArchs := make(map[libseccomp.ScmpArch]struct{})
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
+ scmpArchs[arch] = struct{}{}
+ }
+ // On architectures like ppc64le, Docker inexplicably doesn't include the
+ // native architecture in the architecture list which results in no
+ // architectures being present in the list at all (rendering the ENOSYS
+ // stub a no-op). So, always include the native architecture.
+ if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
+ return nil, fmt.Errorf("unable to get native arch: %w", err)
+ } else if _, ok := scmpArchs[nativeScmpArch]; !ok {
+ logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
+ scmpArchs[nativeScmpArch] = struct{}{}
+ }
+ logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)
- // Figure out native architecture representation of the architecture.
+ // Only loop over architectures which are present in the filter. Any other
+ // architectures will get the libseccomp bad architecture action anyway.
+ lastSyscalls := make(lastSyscallMap)
+ for arch := range scmpArchs {
auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
index bdfeff68adb3..3d442e1daa66 100644
--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
@@ -12,6 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
+ "github.com/sirupsen/logrus"
"golang.org/x/net/bpf"
)
@@ -105,6 +106,18 @@ var testArches = []string{
"ppc64le",
"s390",
"s390x",
+ // Dummy value to indicate a configuration with no architecture specified.
+ "native",
+}
+
+var nativeArch string
+
+func init() {
+ scmpNativeArch, err := libseccomp.GetNativeArch()
+ if err != nil {
+ logrus.Panicf("get native arch: %v", err)
+ }
+ nativeArch = scmpNativeArch.String()
}
func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) {
@@ -155,6 +168,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
expected uint32
}
+ if arch == "native" {
+ arch = nativeArch
+ }
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
@@ -228,8 +244,15 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
// Test syscalls in the explicit list.
for _, test := range syscallTests {
- // Override the expected value in the two special cases.
- if !archSet[arch] || isAllowAction(defaultAction) {
+ // Override the expected value in the two special cases:
+ // 1. If the default action is allow, the filter won't have
+ // the stub prepended so we expect a fallthrough.
+ // 2. If the executing architecture is not in the architecture
+ // set, then the architecture is not handled by the stub --
+ // *except* in the case of the native architecture (which
+ // is always included in the stub).
+ if isAllowAction(defaultAction) ||
+ (!archSet[arch] && arch != nativeArch) {
test.expected = retFallthrough
}
@@ -263,7 +286,14 @@ var testActions = map[string]configs.Action{
func TestEnosysStub_SingleArch(t *testing.T) {
for _, arch := range testArches {
- arches := []string{arch}
+ var arches []string
+ // "native" indicates a blank architecture field for seccomp, to test
+ // the case where the running architecture was not included in the
+ // architecture. Docker doesn't always set the architecture for some
+ // reason (namely for ppc64le).
+ if arch != "native" {
+ arches = append(arches, arch)
+ }
t.Run("arch="+arch, func(t *testing.T) {
for name, action := range testActions {
t.Run("action="+name, func(t *testing.T) {
@@ -277,7 +307,16 @@ func TestEnosysStub_SingleArch(t *testing.T) {
func TestEnosysStub_MultiArch(t *testing.T) {
for end := 0; end < len(testArches); end++ {
for start := 0; start < end; start++ {
- arches := testArches[start:end]
+ var arches []string
+ for _, arch := range testArches[start:end] {
+ // "native" indicates a blank architecture field for seccomp, to test
+ // the case where the running architecture was not included in the
+ // architecture. Docker doesn't always set the architecture for some
+ // reason (namely for ppc64le).
+ if arch != "native" {
+ arches = append(arches, arch)
+ }
+ }
if len(arches) <= 1 {
continue
}
--
2.46.0

View File

@@ -1,136 +0,0 @@
From 0f1f8e303cf1919c33952f4938e5637d8f77f907 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 7 Jul 2023 22:45:44 +1000
Subject: [PATCH 4/4] bsc1214960: nsenter: cloned_binary: remove bindfd logic
entirely
(This is a cherry-pick of b999376fb237195265081a8b8ba3fd3bd6ef8c2c.)
While the ro-bind-mount trick did eliminate the memory overhead of
copying the runc binary for each "runc init" invocation, on machines
with very significant container churn, creating a temporary mount
namespace on every container invocation can trigger severe lock
contention on namespace_sem that makes containers fail to spawn.
The only reason we added bindfd in commit 16612d74de5f ("nsenter:
cloned_binary: try to ro-bind /proc/self/exe before copying") was due to
a Kubernetes e2e test failure where they had a ridiculously small memory
limit. It seems incredibly unlikely that real workloads are running
without 10MB to spare for the very short time that runc is interacting
with the container.
In addition, since the original cloned_binary implementation, cgroupv2
is now almost universally used on modern systems. Unlike cgroupv1, the
cgroupv2 memcg implementation does not migrate memory usage when
processes change cgroups (even cgroupv1 only did this if you had
memory.move_charge_at_immigrate enabled). In addition, because we do the
/proc/self/exe clone before synchronising the bootstrap data read, we
are guaranteed to do the clone before "runc init" is moved into the
container cgroup -- meaning that the memory used by the /proc/self/exe
clone is charged against the root cgroup, and thus container workloads
should not be affected at all with memfd cloning.
The long-term fix for this problem is to block the /proc/self/exe
re-opening attack entirely in-kernel, which is something I'm working
on[1]. Though it should also be noted that because the memfd is
completely separate to the host binary, even attacks like Dirty COW
against the runc binary can be defended against with the memfd approach.
Of course, once we have in-kernel protection against the /proc/self/exe
re-opening attack, we won't have that protection anymore...
[1]: https://lwn.net/Articles/934460/
SUSE-Bugs: https://bugzilla.suse.com/show_bug.cgi?id=1214960
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
libcontainer/nsenter/cloned_binary.c | 67 ----------------------------
1 file changed, 67 deletions(-)
diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c
index d1b2d4c546f1..565748b13a4e 100644
--- a/libcontainer/nsenter/cloned_binary.c
+++ b/libcontainer/nsenter/cloned_binary.c
@@ -396,61 +396,6 @@ static int seal_execfd(int *fd, int fdtype)
return -1;
}
-static int try_bindfd(void)
-{
- int fd, ret = -1;
- char template[PATH_MAX] = { 0 };
- char *prefix = getenv("_LIBCONTAINER_STATEDIR");
-
- if (!prefix || *prefix != '/')
- prefix = "/tmp";
- if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
- return ret;
-
- /*
- * We need somewhere to mount it, mounting anything over /proc/self is a
- * BAD idea on the host -- even if we do it temporarily.
- */
- fd = mkstemp(template);
- if (fd < 0)
- return ret;
- close(fd);
-
- /*
- * For obvious reasons this won't work in rootless mode because we haven't
- * created a userns+mntns -- but getting that to work will be a bit
- * complicated and it's only worth doing if someone actually needs it.
- */
- ret = -EPERM;
- if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
- goto out;
- if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
- goto out_umount;
-
- /* Get read-only handle that we're sure can't be made read-write. */
- ret = open(template, O_PATH | O_CLOEXEC);
-
-out_umount:
- /*
- * Make sure the MNT_DETACH works, otherwise we could get remounted
- * read-write and that would be quite bad (the fd would be made read-write
- * too, invalidating the protection).
- */
- if (umount2(template, MNT_DETACH) < 0) {
- if (ret >= 0)
- close(ret);
- ret = -ENOTRECOVERABLE;
- }
-
-out:
- /*
- * We don't care about unlink errors, the worst that happens is that
- * there's an empty file left around in STATEDIR.
- */
- unlink(template);
- return ret;
-}
-
static ssize_t fd_to_fd(int outfd, int infd)
{
ssize_t total = 0;
@@ -485,18 +430,6 @@ static int clone_binary(void)
size_t sent = 0;
int fdtype = EFD_NONE;
- /*
- * Before we resort to copying, let's try creating an ro-binfd in one shot
- * by getting a handle for a read-only bind-mount of the execfd.
- */
- execfd = try_bindfd();
- if (execfd >= 0)
- return execfd;
-
- /*
- * Dammit, that didn't work -- time to copy the binary to a safe place we
- * can seal the contents.
- */
execfd = make_execfd(&fdtype);
if (execfd < 0 || fdtype == EFD_NONE)
return -ENOTRECOVERABLE;
--
2.46.0

Binary file not shown.

View File

@@ -1,7 +0,0 @@
-----BEGIN PGP SIGNATURE-----
iHUEABYKAB0WIQS2TklVsp+j1GPyqQYol/rSt+lEbwUCZtZk+AAKCRAol/rSt+lE
b0TGAQC6tc59nCVnmViX22aKK6fuV++saYQgQKKhIkqiyBs97wD/a49dqcnjgHIf
OKO+WjeCGwFIwmHIsAeD3bdCb+XTqQI=
=E21y
-----END PGP SIGNATURE-----

BIN
runc-1.3.4.tar.xz LFS Normal file

Binary file not shown.

8
runc-1.3.4.tar.xz.asc Normal file
View File

@@ -0,0 +1,8 @@
-----BEGIN PGP SIGNATURE-----
iJEEABYKADkWIQS2TklVsp+j1GPyqQYol/rSt+lEbwUCaSjevxsUgAAAAAAEAA5t
YW51MiwyLjUrMS4xMSwyLDIACgkQKJf60rfpRG8DqgEAgQBUL0dOg31PIjBq03oW
5dLKfrM4KQS4tDfj36Ol7y0A/jmlAoMzn32VfL2UnEh1DUBHFDxhiXvNEA3lNf0O
G3gC
=Q/Xl
-----END PGP SIGNATURE-----

View File

@@ -1,3 +1,116 @@
-------------------------------------------------------------------
Fri Nov 28 00:20:13 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.3.4. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.3.4>. bsc#1254362
-------------------------------------------------------------------
Wed Nov 5 10:05:32 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.3.3. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.3.3>. bsc#1252232
* CVE-2025-31133
* CVE-2025-52565
* CVE-2025-52881
- Remove upstreamed patches for bsc#1252232:
- 2025-11-05-CVEs.patch
-------------------------------------------------------------------
Thu Oct 16 02:16:12 UTC 2025 - Aleksa Sarai <asarai@suse.com>
[ This update was only released for SLE 12 and 15. ]
- Backport patches for three CVEs. All three vulnerabilities ultimately allow
(through different methods) for full container breakouts by bypassing runc's
restrictions for writing to arbitrary /proc files. bsc#1252232
* CVE-2025-31133
* CVE-2025-52565
* CVE-2025-52881
+ 2025-11-05-CVEs.patch
-------------------------------------------------------------------
Fri Oct 10 14:10:23 UTC 2025 - Aleksa Sarai <asarai@suse.com>
[ This update was only released for SLE 12 and 15. ]
- Update to runc v1.2.7. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.7>.
-------------------------------------------------------------------
Sat Oct 4 05:01:50 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.3.2. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.3.2> bsc#1252110
- Includes an important fix for the CPUSet translation for cgroupv2.
-------------------------------------------------------------------
Thu Sep 4 15:29:15 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.3.1. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.3.1>
- Fix runc 1.3.x builds on SLE-12 by enabling --std=gnu11.
-------------------------------------------------------------------
Tue Apr 29 15:23:32 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.3.0. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.3.0>
-------------------------------------------------------------------
Thu Apr 10 03:52:03 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.6. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.6>.
-------------------------------------------------------------------
Fri Feb 14 01:31:56 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.5. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.5>.
-------------------------------------------------------------------
Tue Jan 7 06:31:57 UTC 2025 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.4. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.4>.
- Update runc.keyring to match upstream.
-------------------------------------------------------------------
Wed Dec 11 02:01:52 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.3. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.3>.
-------------------------------------------------------------------
Sat Nov 16 01:55:06 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.2. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.2>.
-------------------------------------------------------------------
Fri Nov 1 22:26:11 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.1. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.1>.
-------------------------------------------------------------------
Mon Oct 21 22:42:50 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.0. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.0>.
- Remove upstreamed patches.
- 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch
- 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch
- 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch
- 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch
-------------------------------------------------------------------
Tue Sep 3 02:01:16 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.0~rc3. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.0-rc.3>.
Includes the patch for CVE-2024-45310. bsc#1230092
-------------------------------------------------------------------
Tue Sep 3 01:57:20 UTC 2024 - Aleksa Sarai <asarai@suse.com>
@@ -29,6 +142,29 @@ Mon Jul 22 13:08:06 UTC 2024 - Aleksa Sarai <asarai@suse.com>
too many mount notifications. bsc#1214960
+ 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch
-------------------------------------------------------------------
Fri Jul 12 08:33:22 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.0~rc2. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.0-rc.2>.
- Re-allow Go 1.22 builds for >= 1.22.4.
-------------------------------------------------------------------
Thu Apr 25 08:23:43 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Build with Go 1.21 until the upstream Go 1.22 compatibility issue gets fixed.
<https://github.com/opencontainers/runc/issues/4233>
-------------------------------------------------------------------
Thu Apr 4 05:04:27 UTC 2024 - Aleksa Sarai <asarai@suse.com>
- Update to runc v1.2.0~rc1. Upstream changelog is available from
<https://github.com/opencontainers/runc/releases/tag/v1.2.0-rc.1>.
- Remove upstreamed patches.
- 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch
- 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch
- 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch
-------------------------------------------------------------------
Thu Mar 21 03:46:48 UTC 2024 - Aleksa Sarai <asarai@suse.com>

View File

@@ -122,10 +122,10 @@ lxxclgJYU604APsFzpoLD0oUlfMn5Fh75ftkKPrwiHpTj4rRU6oIQu1/Bg==
=Ab7w
-----END PGP PUBLIC KEY BLOCK-----
pub rsa2048 2020-04-28 [SC] [expires: 2025-04-18]
pub rsa2048 2020-04-28 [SC] [expires: 2028-04-18]
C2428CD75720FACDCF76B6EA17DE5ECB75A1100E
uid [ultimate] Kir Kolyshkin <kolyshkin@gmail.com>
sub rsa2048 2020-04-28 [E] [expires: 2025-04-18]
sub rsa2048 2020-04-28 [E] [expires: 2028-04-18]
-----BEGIN PGP PUBLIC KEY BLOCK-----
Comment: github=kolyshkin
@@ -137,26 +137,26 @@ ppTSiCl8/x/gKoXiJ+7MyvOZozUavkVHdim1NKCzwD014VOB8RXz+heUjS+HDXY9
SbTL4jCsN/x0bq+ZNp4lunihVY5WqX+BGLcx7xPnJ0Rp9Ju1mAhKrbKUmOG3rkWu
DIJuVP8HQfCoffsBLUKQ0V4fh18kfq1bo3JvABEBAAG0I0tpciBLb2x5c2hraW4g
PGtvbHlzaGtpbkBnbWFpbC5jb20+iQFUBBMBCAA+AhsDBQsJCAcCBhUKCQgLAgQW
AgMBAh4BAheAFiEEwkKM11cg+s3PdrbqF95ey3WhEA4FAmRAbOgFCQlaGGoACgkQ
F95ey3WhEA6dRQf+P+OHI3QiZu3TnrNBTsf+V8HhFBWKqafrjKbIE1A5HOHzcK2F
t2afYG+MZQILwSuCQOObgr3o7hGlqkwMwGtHt5nqG6/Z0bmkowG4JJmYIg9FhvQW
JEm/7lSBtxvFkw05H90UlzCM7AigD+PrLs96Zb0+FqdzEDWTMJeU7yYUFRNbXEu3
wqpOZpHlYCJGKzFJBbGxYphlmljexRlWdZPwACKg7lBsVkM8JDPGxmmEe7/5tXPt
Oa1yS13SleLv4muHH3KO3cgJGqBfY/XIExZUQUF0GdL0yppBDbn0oZ/wvRuibCR0
1P7rW88csSjAjhNjja4v/zWleSIpyWVi8IvYLLkBDQReqLt+AQgAtKUDLyUFxQ9k
AgMBAh4BAheAFiEEwkKM11cg+s3PdrbqF95ey3WhEA4FAmdcs+gFCQ7+0bIACgkQ
F95ey3WhEA6rRwf8CxnbLB/uqPZfmmiTzTk7luWaIo6YxtnNz3bn2rTByEo+rBgO
gbgtKaV4REYeKhtbdstkMTX3zr+zlqwuqaPaag/Cz20HLkD04bI+JCPoRH/dPadd
3nOdbdRfdWZeDDSFKjVunVpXlLxwvZ1WaaYKCfF06U3F7/z7MTAuKHrHTG9SrNPJ
UPJTy63dNnuiPpVNNtOyftLGEGgD1JH2tcosVEwEpAlXpIpJy4Lad9ajaRVoYNtT
qZr26sRFYNOQqWgl25QM8LyLFyYry9HfEXkbilW0OpkAkUvv0yAe97UPZ0beP8D+
d5rMbZps6Ph1TtosdE/Gx8xWs7ALNDmXyCI/F7kBDQReqLt+AQgAtKUDLyUFxQ9k
p8OwI/MsPTLLoYfjilJaXnmtzQjGYFrEuU3lt7omRUBldNChkjGghEukGTq0RD7Z
s6Qv5PM5dtOypPJM0lmz2j7seun3AfDV44h/bjOFwTUjab3Nr9fQ52qESmRS03ik
6+5YNwq2D/+2kHVJ2vkUoo6KvioA1vPU311oW/Yfky8dLS5NguikE3to6YElWW38
oqFUVdMScCbf9a6CPXSQEz/rH4TgAhwyTo6oegv+8L/szGFy5ToNGiA0D45HcFDc
yXs1d+b3bYRuGfC1l/z+WZWwbeHt1fKEQ8pCLDLRre5y0hPRHeN2CG4U7iyI5B5h
8LITPcZ66wARAQABiQE8BBgBCAAmAhsMFiEEwkKM11cg+s3PdrbqF95ey3WhEA4F
AmRAbRQFCQlaGJYACgkQF95ey3WhEA7vywf9FFTeRgNji8ZIPMM2vIlns+CMkP5R
uXakU6Q0O6Wmbb/ULOkobTqJ/Jcze8OuembuU3V6MiOQKgUIDrN7itjnJPQBneKT
iqJdPK8KOiGIzqa0aRekvOu2nCz9n87Bf48pviH922yfs8gXYRCUnSV/i7/p+N8r
5Fy7dJen5SXksN2/rUCEgU9FD17l2uMAoQbRqZg74/GwSDLnhrZ9eMrbPnguSQF4
S1NPMeS7+G/gPN9Ze9qFmOF2p57cmEa+8mriZCYY3BcUBOiMOV5HSBKJwqA2M8au
2dAKmFWb/G+K/dgBdkAulQ/BfCpwgFmmgJ5dAeaS3y8Xd86aBE0/eLCrhQ==
=GkpD
AmdctAIFCQ7+0bIACgkQF95ey3WhEA7PDggAlZxK7mCYThh7Z75mWftIaT3ms5jR
cuQcCQYy2Z7qCaNxJtRklhsaAwpO0NQdNdQEfVXlNYLXRuFDq+hemhZKMu4lzQbZ
3atm5swWcB8+9q+aCMP5nppwUXxCxHdhp4VxIYEv+wNjTF/6Fxu66fYPQPDKVacS
H9NLjHsVoDFSi9rvtAy/Bs2aVn0hZkwpxzHJNVPnNcMAEnYXfM+kXu3761J61FAr
o8zT9XXXnUYRuxHRAsrpa3atQj7jDHvFlcc3VfPmUFPs0aLRy19/44xRE1FZOSur
f7jJ1HOKSJA9zx0xWaURRTRkMTIVuMnQKZofxC96GavBDVTtZlgLzeWVnQ==
=eHgH
-----END PGP PUBLIC KEY BLOCK-----
pub rsa3072 2019-07-25 [SC] [expires: 2025-07-27]

View File

@@ -1,7 +1,7 @@
#
# spec file for package runc
#
# Copyright (c) 2024 SUSE LLC
# Copyright (c) 2025 SUSE LLC and contributors
#
# All modifications and additions to the file contributed by third parties
# remain the property of their copyright owners, unless otherwise agreed
@@ -18,30 +18,25 @@
# MANUAL: Make sure you update this each time you update runc.
%define git_version 2c9f5602f0ba3d9da1c2596322dfc4e156844890
%define git_short 2c9f5602f0ba
%define git_version d6d73eb8c60246978da649ffe75ce5c8bca8f856
%define git_short d6d73eb8c602
%define project github.com/opencontainers/runc
Name: runc
Version: 1.1.14
Version: 1.3.4
%define upstream_version %{version}
Release: 0
Summary: Tool for spawning and running OCI containers
License: Apache-2.0
Group: System/Management
URL: https://github.com/opencontainers/runc
Source0: https://github.com/opencontainers/runc/releases/download/v%{version}/runc.tar.xz#/runc-%{version}.tar.xz
Source1: https://github.com/opencontainers/runc/releases/download/v%{version}/runc.tar.xz.asc#/runc-%{version}.tar.xz.asc
Source0: https://github.com/opencontainers/runc/releases/download/v%{upstream_version}/runc.tar.xz#/runc-%{upstream_version}.tar.xz
Source1: https://github.com/opencontainers/runc/releases/download/v%{upstream_version}/runc.tar.xz.asc#/runc-%{upstream_version}.tar.xz.asc
Source2: runc.keyring
# SUSE-FIX-UPSTREAM: Backport of <https://github.com/opencontainers/runc/pull/4219>. bsc#1221050
Patch10: 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch
Patch11: 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch
Patch12: 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch
# SUSE-FIX-UPSTREAM: Partial backport of <https://github.com/opencontainers/runc/pull/3931>. bsc#1214960
Patch20: 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch
BuildRequires: diffutils
BuildRequires: fdupes
BuildRequires: go
BuildRequires: go >= 1.23
BuildRequires: go-go-md2man
BuildRequires: libseccomp-devel
BuildRequires: libselinux-devel
@@ -60,7 +55,7 @@ Obsoletes: docker-runc_50a19c6
ExcludeArch: s390
# Construct "git describe --dirty --long --always".
%define git_describe v%{version}-0-g%{git_short}
%define git_describe v%{upstream_version}-0-g%{git_short}
%description
runc is a CLI tool for spawning and running containers according to the OCI
@@ -69,10 +64,14 @@ of Docker. It was originally designed to be a replacement for LXC within Docker,
and has grown to become a separate project entirely.
%prep
%setup -q -n %{name}-%{version}
%setup -q -n %{name}-%{upstream_version}
%autopatch -p1
%build
%if 0%{?sle_version} == 120000
# Fix nsenter builds on SLE12.
export CGO_CFLAGS="--std=gnu11"
%endif
# build runc
make BUILDTAGS="seccomp" COMMIT="%{git_describe}" runc
# build man pages