From c92af6636dc23f5b465e6eefeca535db865248aea483d87dbef3337a8827c4fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Schr=C3=B6ter?= Date: Fri, 6 Sep 2024 15:31:00 +0200 Subject: [PATCH] Sync from SUSE:SLFO:Main runc revision 2133c931be9a58b516e6632c4b6eabdc --- ...-seccomp-patchbpf-rm-duplicated-code.patch | 44 +++ ...mp-patchbpf-rename-nativeArch-linuxA.patch | 289 ++++++++++++++++++ ...mp-patchbpf-always-include-native-ar.patch | 162 ++++++++++ ...er-cloned_binary-remove-bindfd-logic.patch | 136 +++++++++ runc-1.1.12.tar.xz | 3 - runc-1.1.12.tar.xz.asc | 17 -- runc-1.1.14.tar.xz | 3 + runc-1.1.14.tar.xz.asc | 7 + runc.changes | 40 +++ runc.spec | 13 +- 10 files changed, 691 insertions(+), 23 deletions(-) create mode 100644 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch create mode 100644 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch create mode 100644 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch create mode 100644 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch delete mode 100644 runc-1.1.12.tar.xz delete mode 100644 runc-1.1.12.tar.xz.asc create mode 100644 runc-1.1.14.tar.xz create mode 100644 runc-1.1.14.tar.xz.asc diff --git a/0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch b/0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch new file mode 100644 index 0000000..9d462b4 --- /dev/null +++ b/0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch @@ -0,0 +1,44 @@ +From 22eb87a32dc1c685425b685e96e8472b9ac1b5ca Mon Sep 17 00:00:00 2001 +From: Kir Kolyshkin +Date: Fri, 14 Oct 2022 18:37:00 -0700 +Subject: [PATCH 1/4] bsc1221050: libct/seccomp/patchbpf: rm duplicated code + +(This is a cherry-pick of 2cd05e44b662fb79c46d5ebfd6c71e9ebc98d40c.) + +In findLastSyscalls, we convert libseccomp.ArchNative to the real +libseccomp architecture, but archToNative already does that, so +this code is redundant. 
+ +Remove the redundant code, and move its comment to archToNative. + +Fixes: 7a8d7162f +Signed-off-by: Kir Kolyshkin +Signed-off-by: Aleksa Sarai +--- + libcontainer/seccomp/patchbpf/enosys_linux.go | 10 ---------- + 1 file changed, 10 deletions(-) + +diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go +index efe6dca58b21..c9c1d4ccb685 100644 +--- a/libcontainer/seccomp/patchbpf/enosys_linux.go ++++ b/libcontainer/seccomp/patchbpf/enosys_linux.go +@@ -233,16 +233,6 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { + return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) + } + +- // Map native architecture to a real architecture value to avoid +- // doubling-up the lastSyscall mapping. +- if arch == libseccomp.ArchNative { +- nativeArch, err := libseccomp.GetNativeArch() +- if err != nil { +- return nil, fmt.Errorf("unable to get native architecture: %w", err) +- } +- arch = nativeArch +- } +- + // Figure out native architecture representation of the architecture. + nativeArch, err := archToNative(arch) + if err != nil { +-- +2.46.0 + diff --git a/0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch b/0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch new file mode 100644 index 0000000..1b7e411 --- /dev/null +++ b/0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch @@ -0,0 +1,289 @@ +From 558c5ecf487a40001ba854cfcbd5c94223167501 Mon Sep 17 00:00:00 2001 +From: Aleksa Sarai +Date: Wed, 13 Mar 2024 13:40:16 +1100 +Subject: [PATCH 2/4] bsc1221050: seccomp: patchbpf: rename nativeArch -> + linuxAuditArch + +(This is a backport of 6167f5ffc3e3fd53e6a41a2effa592a4873ad046.) + +Calling the Linux AUDIT_* architecture constants "native" leads to +confusing code when we are getting the actual native architecture of the +running system. 
+ +Signed-off-by: Aleksa Sarai +--- + libcontainer/seccomp/patchbpf/enosys_linux.go | 81 ++++++++++--------- + .../seccomp/patchbpf/enosys_linux_test.go | 16 ++-- + 2 files changed, 49 insertions(+), 48 deletions(-) + +diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go +index c9c1d4ccb685..1b67fda85c64 100644 +--- a/libcontainer/seccomp/patchbpf/enosys_linux.go ++++ b/libcontainer/seccomp/patchbpf/enosys_linux.go +@@ -164,11 +164,11 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) + return program, nil + } + +-type nativeArch uint32 ++type linuxAuditArch uint32 + +-const invalidArch nativeArch = 0 ++const invalidArch linuxAuditArch = 0 + +-func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { ++func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) { + switch arch { + case libseccomp.ArchNative: + // Convert to actual native architecture. +@@ -176,48 +176,48 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { + if err != nil { + return invalidArch, fmt.Errorf("unable to get native arch: %w", err) + } +- return archToNative(arch) ++ return scmpArchToAuditArch(arch) + case libseccomp.ArchX86: +- return nativeArch(C.C_AUDIT_ARCH_I386), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil + case libseccomp.ArchAMD64, libseccomp.ArchX32: + // NOTE: x32 is treated like x86_64 except all x32 syscalls have the + // 30th bit of the syscall number set to indicate that it's not a + // normal x86_64 syscall. 
+- return nativeArch(C.C_AUDIT_ARCH_X86_64), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil + case libseccomp.ArchARM: +- return nativeArch(C.C_AUDIT_ARCH_ARM), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil + case libseccomp.ArchARM64: +- return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil + case libseccomp.ArchMIPS: +- return nativeArch(C.C_AUDIT_ARCH_MIPS), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil + case libseccomp.ArchMIPS64: +- return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil + case libseccomp.ArchMIPS64N32: +- return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil + case libseccomp.ArchMIPSEL: +- return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil + case libseccomp.ArchMIPSEL64: +- return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil + case libseccomp.ArchMIPSEL64N32: +- return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil + case libseccomp.ArchPPC: +- return nativeArch(C.C_AUDIT_ARCH_PPC), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil + case libseccomp.ArchPPC64: +- return nativeArch(C.C_AUDIT_ARCH_PPC64), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil + case libseccomp.ArchPPC64LE: +- return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil + case libseccomp.ArchS390: +- return nativeArch(C.C_AUDIT_ARCH_S390), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil + case libseccomp.ArchS390X: +- return nativeArch(C.C_AUDIT_ARCH_S390X), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil + case libseccomp.ArchRISCV64: +- return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil ++ return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil + default: + return invalidArch, 
fmt.Errorf("unknown architecture: %v", arch) + } + } + +-type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall ++type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall + + // Figure out largest syscall number referenced in the filter for each + // architecture. We will be generating code based on the native architecture +@@ -234,17 +234,17 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { + } + + // Figure out native architecture representation of the architecture. +- nativeArch, err := archToNative(arch) ++ auditArch, err := scmpArchToAuditArch(arch) + if err != nil { + return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) + } + +- if _, ok := lastSyscalls[nativeArch]; !ok { +- lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} ++ if _, ok := lastSyscalls[auditArch]; !ok { ++ lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} + } +- if _, ok := lastSyscalls[nativeArch][arch]; ok { ++ if _, ok := lastSyscalls[auditArch][arch]; ok { + // Because of ArchNative we may hit the same entry multiple times. +- // Just skip it if we've seen this (nativeArch, ScmpArch) ++ // Just skip it if we've seen this (linuxAuditArch, ScmpArch) + // combination before. 
+ continue + } +@@ -262,10 +262,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { + } + } + if largestSyscall != 0 { +- lastSyscalls[nativeArch][arch] = largestSyscall ++ logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall) ++ lastSyscalls[auditArch][arch] = largestSyscall + } else { +- logrus.Warnf("could not find any syscalls for arch %s", ociArch) +- delete(lastSyscalls[nativeArch], arch) ++ logrus.Warnf("could not find any syscalls for arch %v", arch) ++ delete(lastSyscalls[auditArch], arch) + } + } + return lastSyscalls, nil +@@ -283,10 +284,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { + // close_range(2) which were added out-of-order in the syscall table between + // kernel releases. + func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) { +- // A jump-table for each nativeArch used to generate the initial ++ // A jump-table for each linuxAuditArch used to generate the initial + // conditional jumps -- measured from the *END* of the program so they + // remain valid after prepending to the tail. +- archJumpTable := map[nativeArch]uint32{} ++ archJumpTable := map[linuxAuditArch]uint32{} + + // Generate our own -ENOSYS rules for each architecture. They have to be + // generated in reverse (prepended to the tail of the program) because the +@@ -299,7 +300,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) + } + + // Generate the syscall -ENOSYS rules. +- for nativeArch, maxSyscalls := range lastSyscalls { ++ for auditArch, maxSyscalls := range lastSyscalls { + // The number of instructions from the tail of this section which need + // to be jumped in order to reach the -ENOSYS return. If the section + // does not jump, it will fall through to the actual filter. 
+@@ -380,7 +381,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) + + // If we're on x86 we need to add a check for x32 and if we're in + // the wrong mode we jump over the section. +- if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) { ++ if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) { + // Generate a prefix to check the mode. + switch scmpArch { + case libseccomp.ArchAMD64: +@@ -409,8 +410,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) + section = append(section, sectionTail...) + case 2: + // x32 and x86_64 are a unique case, we can't handle any others. +- if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) { +- return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch) ++ if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) { ++ return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch) + } + + x32sysno, ok := maxSyscalls[libseccomp.ArchX32] +@@ -487,7 +488,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) + programTail = append(section, programTail...) + + // Update jump table. +- archJumpTable[nativeArch] = uint32(len(programTail)) ++ archJumpTable[auditArch] = uint32(len(programTail)) + } + + // Add a dummy "jump to filter" for any architecture we might miss below. +@@ -507,9 +508,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) + // architectures based on how large the jumps are going to be, or + // re-sort the candidate architectures each time to make sure that we + // pick the largest jump which is going to be smaller than 255. +- for nativeArch := range lastSyscalls { ++ for auditArch := range lastSyscalls { + // We jump forwards but the jump table is calculated from the *END*. 
+- jump := uint32(len(programTail)) - archJumpTable[nativeArch] ++ jump := uint32(len(programTail)) - archJumpTable[auditArch] + + // Same routine as above -- this is a basic jeq check, complicated + // slightly if it turns out that we need to do a long jump. +@@ -518,7 +519,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) + // jeq [arch],[jump] + bpf.JumpIf{ + Cond: bpf.JumpEqual, +- Val: uint32(nativeArch), ++ Val: uint32(auditArch), + SkipTrue: uint8(jump), + }, + }, programTail...) +@@ -527,7 +528,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) + // jne [arch],1 + bpf.JumpIf{ + Cond: bpf.JumpNotEqual, +- Val: uint32(nativeArch), ++ Val: uint32(auditArch), + SkipTrue: 1, + }, + // ja [jump] +diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go +index e2d363a43bd3..bdfeff68adb3 100644 +--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go ++++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go +@@ -23,7 +23,7 @@ type seccompData struct { + } + + // mockSyscallPayload creates a fake seccomp_data struct with the given data. 
+-func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte { ++func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte { + var buf bytes.Buffer + + data := seccompData{ +@@ -150,8 +150,8 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) + + for _, arch := range testArches { + type syscallTest struct { +- syscall string + sysno libseccomp.ScmpSyscall ++ syscall string + expected uint32 + } + +@@ -160,7 +160,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) + t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) + } + +- nativeArch, err := archToNative(scmpArch) ++ auditArch, err := scmpArchToAuditArch(scmpArch) + if err != nil { + t.Fatalf("unknown audit architecture %q: %v", arch, err) + } +@@ -179,9 +179,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) + t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err) + } + syscallTests = append(syscallTests, syscallTest{ +- syscall, +- sysno, +- expected, ++ sysno: sysno, ++ syscall: syscall, ++ expected: expected, + }) + } + +@@ -233,7 +233,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) + test.expected = retFallthrough + } + +- payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5) ++ payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5) + // NOTE: golang.org/x/net/bpf returns int here rather + // than uint32. 
+ rawRet, err := filter.Run(payload) +@@ -247,7 +247,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) + t.Logf(" [%4.1d] %s", idx, insn) + } + t.Logf("payload: %#v", payload) +- t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected) ++ t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected) + } + } + } +-- +2.46.0 + diff --git a/0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch b/0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch new file mode 100644 index 0000000..31c6d0c --- /dev/null +++ b/0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch @@ -0,0 +1,162 @@ +From a1e9b2e4015a6b548a0d3e004bf27dd2e3f2cf35 Mon Sep 17 00:00:00 2001 +From: Aleksa Sarai +Date: Wed, 13 Mar 2024 16:12:51 +1100 +Subject: [PATCH 3/4] bsc1221050: seccomp: patchbpf: always include native + architecture in stub + +(This is a backport of 376417ba7646f05ddb1efa8fe30e2a3b53cf673b.) + +It turns out that on ppc64le (at least), Docker doesn't include any +architectures in the list of allowed architectures. libseccomp +interprets this as "just include the default architecture" but patchbpf +would return a no-op ENOSYS stub, which would lead to the exact issues +that commit 7a8d7162f9d7 ("seccomp: prepend -ENOSYS stub to all +filters") fixed for other architectures. + +So, just always include the running architecture in the list. There's +no real downside. 
+ +SUSE-Bugs: 1192051 1221050 +Ref: https://bugzilla.suse.com/show_bug.cgi?id=1192051#c6 +Reported-by: Fabian Vogt +Signed-off-by: Aleksa Sarai +--- + libcontainer/seccomp/patchbpf/enosys_linux.go | 22 +++++++-- + .../seccomp/patchbpf/enosys_linux_test.go | 47 +++++++++++++++++-- + 2 files changed, 61 insertions(+), 8 deletions(-) + +diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go +index 1b67fda85c64..d459ba8792ca 100644 +--- a/libcontainer/seccomp/patchbpf/enosys_linux.go ++++ b/libcontainer/seccomp/patchbpf/enosys_linux.go +@@ -224,16 +224,30 @@ type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSy + // representation, but SCMP_ARCH_X32 means we have to track cases where the + // same architecture has different largest syscalls based on the mode. + func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { +- lastSyscalls := make(lastSyscallMap) +- // Only loop over architectures which are present in the filter. Any other +- // architectures will get the libseccomp bad architecture action anyway. ++ scmpArchs := make(map[libseccomp.ScmpArch]struct{}) + for _, ociArch := range config.Architectures { + arch, err := libseccomp.GetArchFromString(ociArch) + if err != nil { + return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) + } ++ scmpArchs[arch] = struct{}{} ++ } ++ // On architectures like ppc64le, Docker inexplicably doesn't include the ++ // native architecture in the architecture list which results in no ++ // architectures being present in the list at all (rendering the ENOSYS ++ // stub a no-op). So, always include the native architecture. 
++ if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil { ++ return nil, fmt.Errorf("unable to get native arch: %w", err) ++ } else if _, ok := scmpArchs[nativeScmpArch]; !ok { ++ logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch) ++ scmpArchs[nativeScmpArch] = struct{}{} ++ } ++ logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs) + +- // Figure out native architecture representation of the architecture. ++ // Only loop over architectures which are present in the filter. Any other ++ // architectures will get the libseccomp bad architecture action anyway. ++ lastSyscalls := make(lastSyscallMap) ++ for arch := range scmpArchs { + auditArch, err := scmpArchToAuditArch(arch) + if err != nil { + return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) +diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go +index bdfeff68adb3..3d442e1daa66 100644 +--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go ++++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go +@@ -12,6 +12,7 @@ import ( + "github.com/opencontainers/runc/libcontainer/configs" + + libseccomp "github.com/seccomp/libseccomp-golang" ++ "github.com/sirupsen/logrus" + "golang.org/x/net/bpf" + ) + +@@ -105,6 +106,18 @@ var testArches = []string{ + "ppc64le", + "s390", + "s390x", ++ // Dummy value to indicate a configuration with no architecture specified. 
++ "native", ++} ++ ++var nativeArch string ++ ++func init() { ++ scmpNativeArch, err := libseccomp.GetNativeArch() ++ if err != nil { ++ logrus.Panicf("get native arch: %v", err) ++ } ++ nativeArch = scmpNativeArch.String() + } + + func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) { +@@ -155,6 +168,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) + expected uint32 + } + ++ if arch == "native" { ++ arch = nativeArch ++ } + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) +@@ -228,8 +244,15 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) + + // Test syscalls in the explicit list. + for _, test := range syscallTests { +- // Override the expected value in the two special cases. +- if !archSet[arch] || isAllowAction(defaultAction) { ++ // Override the expected value in the two special cases: ++ // 1. If the default action is allow, the filter won't have ++ // the stub prepended so we expect a fallthrough. ++ // 2. If the executing architecture is not in the architecture ++ // set, then the architecture is not handled by the stub -- ++ // *except* in the case of the native architecture (which ++ // is always included in the stub). ++ if isAllowAction(defaultAction) || ++ (!archSet[arch] && arch != nativeArch) { + test.expected = retFallthrough + } + +@@ -263,7 +286,14 @@ var testActions = map[string]configs.Action{ + + func TestEnosysStub_SingleArch(t *testing.T) { + for _, arch := range testArches { +- arches := []string{arch} ++ var arches []string ++ // "native" indicates a blank architecture field for seccomp, to test ++ // the case where the running architecture was not included in the ++ // architecture. Docker doesn't always set the architecture for some ++ // reason (namely for ppc64le). 
++ if arch != "native" { ++ arches = append(arches, arch) ++ } + t.Run("arch="+arch, func(t *testing.T) { + for name, action := range testActions { + t.Run("action="+name, func(t *testing.T) { +@@ -277,7 +307,16 @@ func TestEnosysStub_SingleArch(t *testing.T) { + func TestEnosysStub_MultiArch(t *testing.T) { + for end := 0; end < len(testArches); end++ { + for start := 0; start < end; start++ { +- arches := testArches[start:end] ++ var arches []string ++ for _, arch := range testArches[start:end] { ++ // "native" indicates a blank architecture field for seccomp, to test ++ // the case where the running architecture was not included in the ++ // architecture. Docker doesn't always set the architecture for some ++ // reason (namely for ppc64le). ++ if arch != "native" { ++ arches = append(arches, arch) ++ } ++ } + if len(arches) <= 1 { + continue + } +-- +2.46.0 + diff --git a/0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch b/0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch new file mode 100644 index 0000000..c017ced --- /dev/null +++ b/0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch @@ -0,0 +1,136 @@ +From 0f1f8e303cf1919c33952f4938e5637d8f77f907 Mon Sep 17 00:00:00 2001 +From: Aleksa Sarai +Date: Fri, 7 Jul 2023 22:45:44 +1000 +Subject: [PATCH 4/4] bsc1214960: nsenter: cloned_binary: remove bindfd logic + entirely + +(This is a cherry-pick of b999376fb237195265081a8b8ba3fd3bd6ef8c2c.) + +While the ro-bind-mount trick did eliminate the memory overhead of +copying the runc binary for each "runc init" invocation, on machines +with very significant container churn, creating a temporary mount +namespace on every container invocation can trigger severe lock +contention on namespace_sem that makes containers fail to spawn. 
+ +The only reason we added bindfd in commit 16612d74de5f ("nsenter: +cloned_binary: try to ro-bind /proc/self/exe before copying") was due to +a Kubernetes e2e test failure where they had a ridiculously small memory +limit. It seems incredibly unlikely that real workloads are running +without 10MB to spare for the very short time that runc is interacting +with the container. + +In addition, since the original cloned_binary implementation, cgroupv2 +is now almost universally used on modern systems. Unlike cgroupv1, the +cgroupv2 memcg implementation does not migrate memory usage when +processes change cgroups (even cgroupv1 only did this if you had +memory.move_charge_at_immigrate enabled). In addition, because we do the +/proc/self/exe clone before synchronising the bootstrap data read, we +are guaranteed to do the clone before "runc init" is moved into the +container cgroup -- meaning that the memory used by the /proc/self/exe +clone is charged against the root cgroup, and thus container workloads +should not be affected at all with memfd cloning. + +The long-term fix for this problem is to block the /proc/self/exe +re-opening attack entirely in-kernel, which is something I'm working +on[1]. Though it should also be noted that because the memfd is +completely separate to the host binary, even attacks like Dirty COW +against the runc binary can be defended against with the memfd approach. +Of course, once we have in-kernel protection against the /proc/self/exe +re-opening attack, we won't have that protection anymore... 
+ +[1]: https://lwn.net/Articles/934460/ + +SUSE-Bugs: https://bugzilla.suse.com/show_bug.cgi?id=1214960 +Signed-off-by: Aleksa Sarai +--- + libcontainer/nsenter/cloned_binary.c | 67 ---------------------------- + 1 file changed, 67 deletions(-) + +diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c +index d1b2d4c546f1..565748b13a4e 100644 +--- a/libcontainer/nsenter/cloned_binary.c ++++ b/libcontainer/nsenter/cloned_binary.c +@@ -396,61 +396,6 @@ static int seal_execfd(int *fd, int fdtype) + return -1; + } + +-static int try_bindfd(void) +-{ +- int fd, ret = -1; +- char template[PATH_MAX] = { 0 }; +- char *prefix = getenv("_LIBCONTAINER_STATEDIR"); +- +- if (!prefix || *prefix != '/') +- prefix = "/tmp"; +- if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) +- return ret; +- +- /* +- * We need somewhere to mount it, mounting anything over /proc/self is a +- * BAD idea on the host -- even if we do it temporarily. +- */ +- fd = mkstemp(template); +- if (fd < 0) +- return ret; +- close(fd); +- +- /* +- * For obvious reasons this won't work in rootless mode because we haven't +- * created a userns+mntns -- but getting that to work will be a bit +- * complicated and it's only worth doing if someone actually needs it. +- */ +- ret = -EPERM; +- if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) +- goto out; +- if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) +- goto out_umount; +- +- /* Get read-only handle that we're sure can't be made read-write. */ +- ret = open(template, O_PATH | O_CLOEXEC); +- +-out_umount: +- /* +- * Make sure the MNT_DETACH works, otherwise we could get remounted +- * read-write and that would be quite bad (the fd would be made read-write +- * too, invalidating the protection). 
+- */ +- if (umount2(template, MNT_DETACH) < 0) { +- if (ret >= 0) +- close(ret); +- ret = -ENOTRECOVERABLE; +- } +- +-out: +- /* +- * We don't care about unlink errors, the worst that happens is that +- * there's an empty file left around in STATEDIR. +- */ +- unlink(template); +- return ret; +-} +- + static ssize_t fd_to_fd(int outfd, int infd) + { + ssize_t total = 0; +@@ -485,18 +430,6 @@ static int clone_binary(void) + size_t sent = 0; + int fdtype = EFD_NONE; + +- /* +- * Before we resort to copying, let's try creating an ro-binfd in one shot +- * by getting a handle for a read-only bind-mount of the execfd. +- */ +- execfd = try_bindfd(); +- if (execfd >= 0) +- return execfd; +- +- /* +- * Dammit, that didn't work -- time to copy the binary to a safe place we +- * can seal the contents. +- */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) + return -ENOTRECOVERABLE; +-- +2.46.0 + diff --git a/runc-1.1.12.tar.xz b/runc-1.1.12.tar.xz deleted file mode 100644 index ded755c..0000000 --- a/runc-1.1.12.tar.xz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47d9e34500e478d860512b3b646724ee4b9e638692122ddaa82af417668ca4d7 -size 1473936 diff --git a/runc-1.1.12.tar.xz.asc b/runc-1.1.12.tar.xz.asc deleted file mode 100644 index 4cdcf53..0000000 --- a/runc-1.1.12.tar.xz.asc +++ /dev/null @@ -1,17 +0,0 @@ ------BEGIN PGP SIGNATURE----- - -iQJEBAABCAAuFiEEXzbGxhtUYBJKdfWmnhiqJn3bjbQFAmWvvCcQHGFzYXJhaUBz -dXNlLmNvbQAKCRCeGKomfduNtG2oD/9yLwYdfbx4GU31kCuvTS3odH8XyplL4QLl -TszoLO/50z/Y9r0QBNuLsDDvAWtsJAYTsRIwEwDgUuziHnbkbHCnE2C+6P7OWUKp -7VS1mqWzWeVibt0hYBWcooJb8inA/ctwfppZlH8EnTdoyqp0bAuQKtj2muA+LTvN -n/19qZ0/zAvErya5ugZCfnpJngOM0W//F5OSE/DKI3ct6o3AilxlzlhZuwkiYQud -nwS5j4CvQp7GkJeuwDluUHGmsT8AW6P3McptS/BcT4wUKWhxcntJG1cdiZOFTW84 -3CLdwMPGQR0SVK5yPMbKogRtglODEW82Ytp4S8BB9sG5PS5rBsvnApSQxFluRMQT -oaQsEKwPS+VSUwf44QR42iF3fB8dxmmmcautr5yaUiSx4DdFGj9jjrbMa9YCk2da 
-J/5ExwJv5nP5R+uwOiH3ziZuFuuH1afbGLrT2ouv61/SMGiYiLEAyiegF94Zg2nu -5RvMUz33LpEckLrlNN5u9q+/jbfJmZAUtdVafKQQTBRFKPCyHjOroKM11PzoHX6l -3dsyEPbEfowZ+uM2z9wCfub529fNF8t9k9sUAIQsma5p7+l7xJMbOua2kd1kGiQU -ec19+KD6ka4NHyDRwxe0iM6/AuFlKKUUTVGZjg2bD+ap0qgDjZ3R5lTmI1pJ8Win -wfoEKZCm+A== -=Sl8m ------END PGP SIGNATURE----- diff --git a/runc-1.1.14.tar.xz b/runc-1.1.14.tar.xz new file mode 100644 index 0000000..1b74b65 --- /dev/null +++ b/runc-1.1.14.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea6f31203ab3bd667d354c54f480c66a40765cf6f84d79c65e3bab1fc67d7db +size 1485648 diff --git a/runc-1.1.14.tar.xz.asc b/runc-1.1.14.tar.xz.asc new file mode 100644 index 0000000..d49f265 --- /dev/null +++ b/runc-1.1.14.tar.xz.asc @@ -0,0 +1,7 @@ +-----BEGIN PGP SIGNATURE----- + +iHUEABYKAB0WIQS2TklVsp+j1GPyqQYol/rSt+lEbwUCZtZk+AAKCRAol/rSt+lE +b0TGAQC6tc59nCVnmViX22aKK6fuV++saYQgQKKhIkqiyBs97wD/a49dqcnjgHIf +OKO+WjeCGwFIwmHIsAeD3bdCb+XTqQI= +=E21y +-----END PGP SIGNATURE----- diff --git a/runc.changes b/runc.changes index 11da0d9..cafa9de 100644 --- a/runc.changes +++ b/runc.changes @@ -1,3 +1,43 @@ +------------------------------------------------------------------- +Tue Sep 3 01:57:20 UTC 2024 - Aleksa Sarai + +[ This was only ever released for SLES and Leap. ] + +- Update to runc v1.1.14. Upstream changelog is available from + . + Includes the patch for CVE-2024-45310. bsc#1230092 + +- Rebase patches: + * 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch + * 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch + * 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch + * 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch + +------------------------------------------------------------------- +Mon Jul 22 13:08:06 UTC 2024 - Aleksa Sarai + +[ This was only ever released for SLES and Leap. ] + +- Update to runc v1.1.13. Upstream changelog is available from + . 
+- Rebase patches: + * 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch + * 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch + * 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch +- Backport https://github.com/opencontainers/runc/commit/b999376fb237195265081a8b8ba3fd3bd6ef8c2c to fix a + performance issue when running lots of containers, caused by systemd getting + too many mount notifications. bsc#1214960 + + 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch + +------------------------------------------------------------------- +Thu Mar 21 03:46:48 UTC 2024 - Aleksa Sarai + +- Add upstream patch https://github.com/opencontainers/runc/commit/376417ba7646f05ddb1efa8fe30e2a3b53cf673b to + properly fix -ENOSYS stub on ppc64le. bsc#1192051 bsc#1221050 + + 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch + + 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch + + 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch + ------------------------------------------------------------------- Wed Jan 31 00:00:33 UTC 2024 - Aleksa Sarai diff --git a/runc.spec b/runc.spec index b4a7484..f886876 100644 --- a/runc.spec +++ b/runc.spec @@ -18,13 +18,13 @@ # MANUAL: Make sure you update this each time you update runc. -%define git_version 51d5e94601ceffbbd85688df1c928ecccbfa4685 -%define git_short 51d5e94601ce +%define git_version 2c9f5602f0ba3d9da1c2596322dfc4e156844890 +%define git_short 2c9f5602f0ba %define project github.com/opencontainers/runc Name: runc -Version: 1.1.12 +Version: 1.1.14 Release: 0 Summary: Tool for spawning and running OCI containers License: Apache-2.0 @@ -33,6 +33,12 @@ URL: https://github.com/opencontainers/runc Source0: https://github.com/opencontainers/runc/releases/download/v%{version}/runc.tar.xz#/runc-%{version}.tar.xz Source1: https://github.com/opencontainers/runc/releases/download/v%{version}/runc.tar.xz.asc#/runc-%{version}.tar.xz.asc Source2: runc.keyring +# SUSE-FIX-UPSTREAM: Backport of https://github.com/opencontainers/runc/commit/376417ba7646f05ddb1efa8fe30e2a3b53cf673b.
bsc#1221050 +Patch10: 0001-bsc1221050-libct-seccomp-patchbpf-rm-duplicated-code.patch +Patch11: 0002-bsc1221050-seccomp-patchbpf-rename-nativeArch-linuxA.patch +Patch12: 0003-bsc1221050-seccomp-patchbpf-always-include-native-ar.patch +# SUSE-FIX-UPSTREAM: Partial backport of . bsc#1214960 +Patch20: 0004-bsc1214960-nsenter-cloned_binary-remove-bindfd-logic.patch BuildRequires: diffutils BuildRequires: fdupes BuildRequires: go @@ -64,6 +70,7 @@ and has grown to become a separate project entirely. %prep %setup -q -n %{name}-%{version} +%autopatch -p1 %build # build runc