165 lines
6.4 KiB
Diff
165 lines
6.4 KiB
Diff
|
From dcc3dc305307f530f8faf394c84d1dede29443ab Mon Sep 17 00:00:00 2001
|
||
|
From: Aleksa Sarai <cyphar@cyphar.com>
|
||
|
Date: Fri, 20 May 2022 10:39:41 +1000
|
||
|
Subject: [PATCH] seccomp: enosys: always return -ENOSYS for setup(2) on
|
||
|
s390(x)
|
||
|
|
||
|
On s390x, syscalls above 255 are multiplexed using the (now otherwise
|
||
|
unused) setup(2) syscall (syscall number 0). If the kernel supports the
|
||
|
syscall then it will correctly translate the syscall number such that
|
||
|
seccomp will correctly detect it -- however, for unknown syscalls the
|
||
|
syscall number remains unchanged. This can be verified by running the
|
||
|
following program under strace:
|
||
|
|
||
|
int main(void)
|
||
|
{
|
||
|
scmp_filter_ctx ctx = seccomp_init(SCMP_ACT_TRAP);
|
||
|
seccomp_load(ctx);
|
||
|
|
||
|
return syscall(439, AT_FDCWD, "asdf", X_OK, 0);
|
||
|
}
|
||
|
|
||
|
Which will then die with the following signal (on pre-5.8 kernels):
|
||
|
|
||
|
--- SIGSYS {si_signo=SIGSYS, si_code=SYS_SECCOMP,
|
||
|
si_call_addr=0x3ffb3006c22, si_syscall=__NR_setup,
|
||
|
si_arch=AUDIT_ARCH_S390X} ---
|
||
|
|
||
|
(Note that the si_syscall is __NR_setup, not __NR_faccessat2.)
|
||
|
|
||
|
As a result, the -ENOSYS handling we had previously did not work
|
||
|
completely correctly on s390x because any syscall not supported by the
|
||
|
kernel would be treated as syscall number 0 rather than the actual
|
||
|
syscall number.
|
||
|
|
||
|
Always returning -ENOSYS will not cause any issues because in all of the
|
||
|
cases where this multiplexing occurs, seccomp will see the remapped
|
||
|
syscall number -- and no userspace program will call setup(2)
|
||
|
intentionally (the syscall has not existed in Linux for decades and was
|
||
|
originally a hack used early in Linux init prior to spawning pid1 -- so
|
||
|
you will get -ENOSYS from the kernel anyway).
|
||
|
|
||
|
SUSE-Bugs: bsc#1192051 bsc#1199565
|
||
|
Backport: <https://github.com/opencontainers/runc/pull/3474>
|
||
|
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
|
||
|
---
|
||
|
libcontainer/seccomp/patchbpf/enosys_linux.go | 48 ++++++++++++++-----
|
||
|
.../seccomp/patchbpf/enosys_linux_test.go | 13 +++++
|
||
|
2 files changed, 50 insertions(+), 11 deletions(-)
|
||
|
|
||
|
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go b/libcontainer/seccomp/patchbpf/enosys_linux.go
|
||
|
index 095fba7fd91f..6376512b086f 100644
|
||
|
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
|
||
|
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
|
||
|
@@ -80,6 +80,11 @@ import "C"
|
||
|
|
||
|
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
|
||
|
|
||
|
+// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
|
||
|
+// syscalls will end up with this syscall number, so we need to explicitly
|
||
|
+// return -ENOSYS for this syscall on those architectures.
|
||
|
+const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
|
||
|
+
|
||
|
func isAllowAction(action configs.Action) bool {
|
||
|
switch action {
|
||
|
// Trace is considered an "allow" action because a good tracer should
|
||
|
@@ -315,7 +320,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
||
|
// directly from the arch code so we need to do it here. Sadly we can't
|
||
|
// share this code between architecture branches.
|
||
|
section := []bpf.Instruction{
|
||
|
- // load [0]
|
||
|
+ // load [0] (syscall number)
|
||
|
bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
|
||
|
}
|
||
|
|
||
|
@@ -324,10 +329,37 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
||
|
// No syscalls found for this arch -- skip it and move on.
|
||
|
continue
|
||
|
case 1:
|
||
|
- // Get the only syscall in the map.
|
||
|
- var sysno libseccomp.ScmpSyscall
|
||
|
- for _, no := range maxSyscalls {
|
||
|
+ // Get the only syscall and scmpArch in the map.
|
||
|
+ var (
|
||
|
+ scmpArch libseccomp.ScmpArch
|
||
|
+ sysno libseccomp.ScmpSyscall
|
||
|
+ )
|
||
|
+ for arch, no := range maxSyscalls {
|
||
|
sysno = no
|
||
|
+ scmpArch = arch
|
||
|
+ }
|
||
|
+
|
||
|
+ switch scmpArch {
|
||
|
+ // Return -ENOSYS for setup(2) on s390(x). This syscall is used for
|
||
|
+ // multiplexing "large syscall number" syscalls, but if the syscall
|
||
|
+ // number is not known to the kernel then the syscall number is
|
||
|
+ // left unchanged (and because it is sysno=0, you'll end up with
|
||
|
+ // EPERM for syscalls the kernel doesn't know about).
|
||
|
+ //
|
||
|
+ // The actual setup(2) syscall is never used by userspace anymore
|
||
|
+ // (and hasn't existed for decades) outside of this multiplexing
|
||
|
+ // scheme so returning -ENOSYS is fine.
|
||
|
+ case libseccomp.ArchS390, libseccomp.ArchS390X:
|
||
|
+ section = append(section, []bpf.Instruction{
|
||
|
+ // jne [setup=0],1
|
||
|
+ bpf.JumpIf{
|
||
|
+ Cond: bpf.JumpNotEqual,
|
||
|
+ Val: uint32(s390xMultiplexSyscall),
|
||
|
+ SkipTrue: 1,
|
||
|
+ },
|
||
|
+ // ret [ENOSYS]
|
||
|
+ bpf.RetConstant{Val: retErrnoEnosys},
|
||
|
+ }...)
|
||
|
}
|
||
|
|
||
|
// The simplest case just boils down to a single jgt instruction,
|
||
|
@@ -359,12 +391,6 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
||
|
// If we're on x86 we need to add a check for x32 and if we're in
|
||
|
// the wrong mode we jump over the section.
|
||
|
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
|
||
|
- // Grab the only architecture in the map.
|
||
|
- var scmpArch libseccomp.ScmpArch
|
||
|
- for arch := range maxSyscalls {
|
||
|
- scmpArch = arch
|
||
|
- }
|
||
|
-
|
||
|
// Generate a prefix to check the mode.
|
||
|
switch scmpArch {
|
||
|
case libseccomp.ArchAMD64:
|
||
|
@@ -522,7 +548,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
||
|
|
||
|
// Prepend the load instruction for the architecture.
|
||
|
programTail = append([]bpf.Instruction{
|
||
|
- // load [4]
|
||
|
+ // load [4] (architecture)
|
||
|
bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
|
||
|
}, programTail...)
|
||
|
|
||
|
diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
|
||
|
index 727800aa50cd..e2d363a43bd3 100644
|
||
|
--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go
|
||
|
+++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
|
||
|
@@ -213,6 +213,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
|
||
|
})
|
||
|
}
|
||
|
|
||
|
+ // If we're on s390(x) make sure you get -ENOSYS for the "setup"
|
||
|
+ // syscall (this is done to work around an issue with s390x's
|
||
|
+ // syscall multiplexing which results in unknown syscalls being a
|
||
|
+ // setup(2) invocation).
|
||
|
+ switch scmpArch {
|
||
|
+ case libseccomp.ArchS390, libseccomp.ArchS390X:
|
||
|
+ syscallTests = append(syscallTests, syscallTest{
|
||
|
+ sysno: s390xMultiplexSyscall,
|
||
|
+ syscall: "setup",
|
||
|
+ expected: retErrnoEnosys,
|
||
|
+ })
|
||
|
+ }
|
||
|
+
|
||
|
// Test syscalls in the explicit list.
|
||
|
for _, test := range syscallTests {
|
||
|
// Override the expected value in the two special cases.
|
||
|
--
|
||
|
2.36.1
|
||
|
|