diff options
| author | Thomas Gleixner <tglx@linutronix.de> | 2025-10-27 09:44:28 +0100 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2025-11-04 08:30:23 +0100 |
| commit | 83409986f49f17b14a675f9c598ad50d4c60191b (patch) | |
| tree | 59efe6d5d76c099ea7914d015a7f7e2e1185350e | |
| parent | d923739e2e356424cc566143a3323c62cd6ed067 (diff) | |
rseq, virt: Retrigger RSEQ after vcpu_run()
Hypervisors invoke resume_user_mode_work() before entering the guest, which
clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user
space context available to them, so the rseq notify handler skips
inspecting the critical section, but updates the CPU/MM CID values
unconditionally so that the eventual pending rseq event is not lost on the
way to user space.
This is a pointless exercise as the task might be rescheduled before
actually returning to user space and it creates unnecessary work in the
vcpu_run() loops.
It's way more efficient to ignore that invocation based on @regs == NULL
and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the
vcpu_run() loop before returning from the ioctl().
This ensures that a pending RSEQ update is not lost and the IDs are updated
before returning to user space.
Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into
a NOOP.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Sean Christopherson <seanjc@google.com>
Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de
| -rw-r--r-- | drivers/hv/mshv_root_main.c | 3 | ||||
| -rw-r--r-- | include/linux/rseq.h | 17 | ||||
| -rw-r--r-- | kernel/rseq.c | 78 | ||||
| -rw-r--r-- | virt/kvm/kvm_main.c | 7 |
4 files changed, 68 insertions, 37 deletions
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e3b2bd417c46..a21a0eb0f5be 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -29,6 +29,7 @@ #include <linux/crash_dump.h> #include <linux/panic_notifier.h> #include <linux/vmalloc.h> +#include <linux/rseq.h> #include "mshv_eventfd.h" #include "mshv.h" @@ -560,6 +561,8 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) } } while (!vp->run.flags.intercept_suspend); + rseq_virt_userspace_exit(); + return ret; } diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 241067bf20db..c6267f70c746 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -38,6 +38,22 @@ static __always_inline void rseq_exit_to_user_mode(void) } /* + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, + * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in + * that case just to do it eventually again before returning to user space, + * the entry resume_user_mode_work() invocation is ignored as the register + * argument is NULL. + * + * After returning from guest mode, they have to invoke this function to + * re-raise TIF_NOTIFY_RESUME if necessary. + */ +static inline void rseq_virt_userspace_exit(void) +{ + if (current->rseq_event_pending) + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); +} + +/* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ @@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t) static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } diff --git a/kernel/rseq.c b/kernel/rseq.c index 59adc1a7183b..01e711383e05 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -422,50 +422,54 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; + bool event; + + /* + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. + */ + if (!regs) + return; if (unlikely(t->flags & PF_EXITING)) return; /* - * If invoked from hypervisors or IO-URING, then @regs is a NULL - * pointer, so fixup cannot be done. If the syscall which led to - * this invocation was invoked inside a critical section, then it - * will either end up in this code again or a possible violation of - * a syscall inside a critical region can only be detected by the - * debug code in rseq_syscall() in a debug enabled kernel. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (regs) { - /* - * Read and clear the event pending bit first. If the task - * was not preempted or migrated or a signal is on the way, - * there is no point in doing any of the heavy lifting here - * on production kernels. In that case TIF_NOTIFY_RESUME - * was raised by some other functionality. - * - * This is correct because the read/clear operation is - * guarded against scheduler preemption, which makes it CPU - * local atomic. If the task is preempted right after - * re-enabling preemption then TIF_NOTIFY_RESUME is set - * again and this function is invoked another time _before_ - * the task is able to return to user mode. - * - * On a debug kernel, invoke the fixup code unconditionally - * with the result handed in to allow the detection of - * inconsistencies. - */ - bool event; - - scoped_guard(RSEQ_EVENT_GUARD) { - event = t->rseq_event_pending; - t->rseq_event_pending = false; - } - - if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { - ret = rseq_ip_fixup(regs, event); - if (unlikely(ret < 0)) - goto error; - } + scoped_guard(RSEQ_EVENT_GUARD) { + event = t->rseq_event_pending; + t->rseq_event_pending = false; } + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event) { + ret = rseq_ip_fixup(regs, event); + if (unlikely(ret < 0)) + goto error; + } + if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..4255fcf9c6e5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -49,6 +49,7 @@ #include <linux/lockdep.h> #include <linux/kthread.h> #include <linux/suspend.h> +#include <linux/rseq.h> #include <asm/processor.h> #include <asm/ioctl.h> @@ -4476,6 +4477,12 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_run(vcpu); vcpu->wants_to_run = false; + /* + * FIXME: Remove this hack once all KVM architectures + * support the generic TIF bits, i.e. a dedicated TIF_RSEQ. + */ + rseq_virt_userspace_exit(); + trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } |