author     Benjamin Berg <benjamin.berg@intel.com>   2025-06-02 15:00:52 +0200
committer  Johannes Berg <johannes.berg@intel.com>   2025-06-02 16:20:10 +0200
commit     e92e2552858142b60238b9828d802f128e4acccd
tree       57e6a51acfc7397215131df99d1a67fc58d18349 /arch/um/kernel
parent     beddc9fb1cb161e1bf779b180750b648ff9690c7
um: pass FD for memory operations when needed

Instead of always sharing the FDs with the userspace process, only hand
over the FDs needed for mmap when required. The idea is that userspace
might be able to force the stub into executing an mmap syscall; however,
it will not be able to manipulate the control flow sufficiently to gain
access to an FD that would allow mapping arbitrary memory.

Security-wise, we need to be sure that only the expected syscalls are
executed after the kernel sends FDs through the socket. This is
currently not the case, as userspace can trivially jump to the
rt_sigreturn syscall instruction to execute any syscall that the stub is
permitted to do. With this, it can trick the kernel into sending the FD,
which in turn allows userspace to freely map any physical memory.

As such, this is currently *not* secure. However, in principle the
approach should be fine with a stricter SECCOMP filter and a careful
review of the stub control flow (as userspace can prepare a stack). With
some care, it is likely possible to extend the security model to SMP if
desired.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-8-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
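The send side of this FD handover lives in the host part of UML and is
not shown in this diff. As background, the stub's recvmsg below only
works if the sender follows the standard SCM_RIGHTS pattern; a minimal
sketch of that pattern, where send_stub_fds() and MAX_FDS are
illustrative names rather than UML helpers:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define MAX_FDS 2	/* stands in for STUB_MAX_FDS */

static int send_stub_fds(int sock, const int *fds, int num_fds)
{
	char dummy = 0;
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		char data[CMSG_SPACE(sizeof(int) * MAX_FDS)];
		struct cmsghdr align;
	} ctrl = {};
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = &ctrl,
		.msg_controllen = CMSG_SPACE(sizeof(int) * num_fds),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;	/* kernel duplicates the FDs */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * num_fds);
	memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * num_fds);

	/* one dummy byte must ride along so recvmsg returns 1 */
	return sendmsg(sock, &msg, 0);
}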
Diffstat (limited to 'arch/um/kernel')
-rw-r--r--  arch/um/kernel/skas/mmu.c       3
-rw-r--r--  arch/um/kernel/skas/stub.c     87
-rw-r--r--  arch/um/kernel/skas/stub_exe.c 40
3 files changed, 111 insertions(+), 19 deletions(-)
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 87a18ae4da19..849fafa4b54f 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -78,6 +78,9 @@ void destroy_context(struct mm_struct *mm)
mmu->id.pid = -1;
}
+ if (using_seccomp && mmu->id.sock)
+ os_close_file(mmu->id.sock);
+
free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));
guard(spinlock_irqsave)(&mm_list_lock);
diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c
index 9041f6b6e28b..67cab46a602c 100644
--- a/arch/um/kernel/skas/stub.c
+++ b/arch/um/kernel/skas/stub.c
@@ -6,23 +6,53 @@
#include <sysdep/stub.h>
#include <linux/futex.h>
+#include <sys/socket.h>
#include <errno.h>
-static __always_inline int syscall_handler(struct stub_data *d)
+/*
+ * Known security issues
+ *
+ * Userspace can jump to this address to execute *any* syscall that is
+ * permitted by the stub. As we will return afterwards, it can do
+ * whatever it likes, including:
+ * - Tricking the kernel into handing out the memory FD
+ * - Using this memory FD to read/write all physical memory
+ * - Running in parallel to the kernel processing a syscall
+ * (possibly creating data races?)
+ * - Blocking e.g. SIGALRM to avoid time based scheduling
+ *
+ * To avoid this, the permitted location for each syscall needs to be
+ * checked for in the SECCOMP filter (which is reasonably simple). Also,
+ * more care will need to go into considerations how the code might be
+ * tricked by using a prepared stack (or even modifying the stack from
+ * another thread in case SMP support is added).
+ *
+ * As for the SIGALRM, the best counter measure will be to check in the
+ * kernel that the process is reporting back the SIGALRM in a timely
+ * fashion.
+ */
+static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
{
+ struct stub_data *d = get_stub_data();
int i;
unsigned long res;
+ int fd;
for (i = 0; i < d->syscall_data_len; i++) {
struct stub_syscall *sc = &d->syscall_data[i];
switch (sc->syscall) {
case STUB_SYSCALL_MMAP:
+ if (fd_map)
+ fd = fd_map[sc->mem.fd];
+ else
+ fd = sc->mem.fd;
+
res = stub_syscall6(STUB_MMAP_NR,
sc->mem.addr, sc->mem.length,
sc->mem.prot,
MAP_SHARED | MAP_FIXED,
- sc->mem.fd, sc->mem.offset);
+ fd, sc->mem.offset);
if (res != sc->mem.addr) {
d->err = res;
d->syscall_data_len = i;
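The comment above notes that each syscall's permitted location needs to
be checked in the SECCOMP filter; nothing in this patch implements that
yet. A hypothetical classic-BPF fragment for one syscall could look as
follows, where STUB_FUTEX_IP is an assumed constant for the expected
call site (seccomp_data really does expose instruction_pointer, though
only its low 32 bits are compared in this fragment):

/* Hypothetical fragment: reject __NR_futex unless entered from the
 * expected stub address. A full filter would also compare the high
 * 32 bits of instruction_pointer on 64-bit architectures. */
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
	 offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, 0, 3),
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
	 offsetof(struct seccomp_data, instruction_pointer)),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (__u32)STUB_FUTEX_IP, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
/* fall through: remaining syscall checks, ending in KILL_PROCESS */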
@@ -54,9 +84,7 @@ static __always_inline int syscall_handler(struct stub_data *d)
void __section(".__syscall_stub")
stub_syscall_handler(void)
{
- struct stub_data *d = get_stub_data();
-
- syscall_handler(d);
+ syscall_handler(NULL);
trap_myself();
}
@@ -65,7 +93,25 @@ void __section(".__syscall_stub")
stub_signal_interrupt(int sig, siginfo_t *info, void *p)
{
struct stub_data *d = get_stub_data();
+ char rcv_data;
+ union {
+ char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
+ struct cmsghdr align;
+ } ctrl = {};
+ struct iovec iov = {
+ .iov_base = &rcv_data,
+ .iov_len = 1,
+ };
+ struct msghdr msghdr = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = &ctrl,
+ .msg_controllen = sizeof(ctrl),
+ };
ucontext_t *uc = p;
+ struct cmsghdr *fd_msg;
+ int *fd_map;
+ int num_fds;
long res;
d->signal = sig;
@@ -78,6 +124,7 @@ restart_wait:
res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
FUTEX_WAKE, 1);
} while (res == -EINTR);
+
do {
res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
FUTEX_WAIT, FUTEX_IN_KERN, 0);
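In the loop above, -EINTR is simply retried, and (just below) -EAGAIN is
treated as success: FUTEX_WAIT returns -EAGAIN when the futex word no
longer holds FUTEX_IN_KERN, i.e. the kernel has already flipped it and
there is nothing left to wait for. For reference, the same wait
expressed through libc's syscall() (the stub cannot link against libc,
hence the raw stub_syscall wrappers):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* returns 0 on wakeup; -1 with errno EAGAIN if *uaddr != expected */
static long futex_wait(unsigned int *uaddr, unsigned int expected)
{
	return syscall(__NR_futex, uaddr, FUTEX_WAIT, expected,
		       NULL);	/* NULL timeout: wait indefinitely */
}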
@@ -86,11 +133,37 @@ restart_wait:
if (res < 0 && res != -EAGAIN)
stub_syscall1(__NR_exit_group, 1);
- /* Try running queued syscalls. */
- if (syscall_handler(d) < 0 || d->restart_wait) {
+ if (d->syscall_data_len) {
+ /* Read passed FDs (if any) */
+ do {
+ res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
+ } while (res == -EINTR);
+
+ /* We should never have a receive error (other than -EAGAIN) */
+ if (res < 0 && res != -EAGAIN)
+ stub_syscall1(__NR_exit_group, 1);
+
+ /* Receive the FDs */
+ num_fds = 0;
+ fd_msg = msghdr.msg_control;
+ fd_map = (void *)CMSG_DATA(fd_msg);
+ if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
+ num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+ /* Try running queued syscalls. */
+ res = syscall_handler(fd_map);
+
+ while (num_fds)
+ stub_syscall2(__NR_close, fd_map[--num_fds], 0);
+ } else {
+ res = 0;
+ }
+
+ if (res < 0 || d->restart_wait) {
/* Report SIGSYS if we restart. */
d->signal = SIGSYS;
d->restart_wait = 0;
+
goto restart_wait;
}
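Note that the stub tolerates -EAGAIN from recvmsg: stub_exe.c below
switches FD 0 to O_NONBLOCK, so an empty socket is not an error when no
FDs accompany a syscall batch. For readers less familiar with the CMSG
macros, a sketch of the same receive path written against libc
(recv_stub_fds() and MAX_FDS are made-up names; the stub itself must use
the raw stub_syscall wrappers):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define MAX_FDS 2	/* stands in for STUB_MAX_FDS */

static int recv_stub_fds(int sock, int fd_map[MAX_FDS])
{
	char rcv_data;
	union {
		char data[CMSG_SPACE(sizeof(int) * MAX_FDS)];
		struct cmsghdr align;
	} ctrl = {};
	struct iovec iov = { .iov_base = &rcv_data, .iov_len = 1 };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = &ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cmsg;
	int num_fds = 0;

	if (recvmsg(sock, &msg, 0) != 1)
		return -1;	/* nothing arrived */

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_RIGHTS) {
		/* same arithmetic as the stub: payload bytes / sizeof(int) */
		num_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
		memcpy(fd_map, CMSG_DATA(cmsg), num_fds * sizeof(int));
	}
	return num_fds;
}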
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
index f40f2332b676..cbafaa684e66 100644
--- a/arch/um/kernel/skas/stub_exe.c
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -1,5 +1,6 @@
#include <sys/ptrace.h>
#include <sys/prctl.h>
+#include <sys/fcntl.h>
#include <asm/unistd.h>
#include <sysdep/stub.h>
#include <stub-data.h>
@@ -45,7 +46,11 @@ noinline static void real_init(void)
if (res != sizeof(init_data))
stub_syscall1(__NR_exit, 10);
- stub_syscall1(__NR_close, 0);
+ /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */
+ if (!init_data.seccomp)
+ stub_syscall1(__NR_close, 0);
+ else
+ stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK);
/* map stub code + data */
res = stub_syscall6(STUB_MMAP_NR,
@@ -63,6 +68,13 @@ noinline static void real_init(void)
if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
stub_syscall1(__NR_exit, 12);
+ /* In SECCOMP mode, we only need the signalling FD from now on */
+ if (init_data.seccomp) {
+ res = stub_syscall3(__NR_close_range, 1, ~0U, 0);
+ if (res != 0)
+ stub_syscall1(__NR_exit, 13);
+ }
+
/* setup signal stack inside stub data */
stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
@@ -77,7 +89,7 @@ noinline static void real_init(void)
res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
if (res != 0)
- stub_syscall1(__NR_exit, 13);
+ stub_syscall1(__NR_exit, 14);
} else {
/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
sa.sa_mask = ~0ULL;
@@ -85,32 +97,32 @@ noinline static void real_init(void)
res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
if (res != 0)
- stub_syscall1(__NR_exit, 14);
+ stub_syscall1(__NR_exit, 15);
res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
if (res != 0)
- stub_syscall1(__NR_exit, 15);
+ stub_syscall1(__NR_exit, 16);
res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
if (res != 0)
- stub_syscall1(__NR_exit, 16);
+ stub_syscall1(__NR_exit, 17);
res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
if (res != 0)
- stub_syscall1(__NR_exit, 17);
+ stub_syscall1(__NR_exit, 18);
res = stub_syscall4(__NR_rt_sigaction, SIGILL,
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
if (res != 0)
- stub_syscall1(__NR_exit, 18);
+ stub_syscall1(__NR_exit, 19);
res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
(unsigned long)&sa, 0, sizeof(sa.sa_mask));
if (res != 0)
- stub_syscall1(__NR_exit, 19);
+ stub_syscall1(__NR_exit, 20);
}
/*
@@ -153,8 +165,12 @@ noinline static void real_init(void)
BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
offsetof(struct seccomp_data, nr)),
- /* [10-14] Check against permitted syscalls */
+ /* [10-16] Check against permitted syscalls */
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
+ 7, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_recvmsg,
+ 6, 0),
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_close,
5, 0),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
4, 0),
@@ -170,10 +186,10 @@ noinline static void real_init(void)
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
1, 0),
- /* [15] Not one of the permitted syscalls */
+ /* [17] Not one of the permitted syscalls */
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
- /* [16] Permitted call for the stub */
+ /* [18] Permitted call for the stub */
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
@@ -184,7 +200,7 @@ noinline static void real_init(void)
if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
SECCOMP_FILTER_FLAG_TSYNC,
(unsigned long)&prog) != 0)
- stub_syscall1(__NR_exit, 20);
+ stub_syscall1(__NR_exit, 21);
/* Fall through, the exit syscall will cause SIGSYS */
} else {
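The bracketed numbers in the filter comments ([10-16], [17], [18]) are
classic-BPF instruction indices, and every allow-list entry's jump
target counts forward to the final ALLOW; inserting recvmsg and close is
what bumped the futex offset from 5 to 7. A self-contained sketch of the
same allow-list pattern, shrunk to two syscalls so the offsets are easy
to follow (illustrative only; a production filter must also validate
seccomp_data.arch):

#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int install_tiny_filter(void)
{
	struct sock_filter filter[] = {
		/* [0] load the syscall number */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* [1-2] permitted syscalls; jt counts forward to [4] */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, 2, 0),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit, 1, 0),
		/* [3] anything else */
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
		/* [4] permitted */
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog);
}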