From 3ca55ca225d7627e76fcfe2894740c4f615ff2e0 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Wed, 2 Apr 2025 11:09:25 -0700
Subject: exit: move and extend sched_process_exit() tracepoint

It is useful to be able to access current->mm at task exit to, say,
record a bunch of VMA information right before the task exits (e.g., for
stack symbolization reasons when dealing with short-lived processes that
exit in the middle of a profiling session). Currently,
trace_sched_process_exit() is triggered after exit_mm(), which resets
current->mm to NULL, making this tracepoint unsuitable for inspecting
and recording a task's mm_struct-related data when tracing process
lifetimes.

There is a particularly suitable place, though, right after
taskstats_exit() is called, but before we do exit_mm() and other
exit_*() resource teardowns. taskstats performs a similar kind of
accounting to what some applications do with BPF, so co-locating them
seems like a good fit. That is where trace_sched_process_exit() is
moved with this patch.

Also, the existing trace_sched_process_exit() tracepoint is notoriously
missing the `group_dead` flag, which is certainly useful in practice;
some of our production applications have to work around its absence. So
plumb `group_dead` through while at it, to have a richer and more
complete tracepoint.

Note that we can't use sched_process_template anymore, so we switch to a
TRACE_EVENT()-based tracepoint definition. All the field names and their
order, as well as the assign and output logic, remain intact; we just
add one extra field at the end in a backwards-compatible way.

[andrii@kernel.org: document sched_process_exit and sched_process_template relation]
  Link: https://lkml.kernel.org/r/20250403174120.4087794-1-andrii@kernel.org
Link: https://lkml.kernel.org/r/20250402180925.90914-1-andrii@kernel.org
Signed-off-by: Andrii Nakryiko
Acked-by: Steven Rostedt (Google)
Acked-by: Oleg Nesterov
Suggested-by: Ingo Molnar
Cc: Alexander Potapenko
Cc: Christian Brauner
Cc: "Masami Hiramatsu (Google)"
Cc: Mathieu Desnoyers
Cc: Michal Hocko
Signed-off-by: Andrew Morton
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 1b51dc099f1e..f1db86dcbeb1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -936,12 +936,12 @@ void __noreturn do_exit(long code)
 
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
+	trace_sched_process_exit(tsk, group_dead);
 
 	exit_mm();
 
 	if (group_dead)
 		acct_process();
-	trace_sched_process_exit(tsk);
 
 	exit_sem(tsk);
 	exit_shm(tsk);
--
cgit v1.2.3
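
As a concrete illustration of the use case described in the message above, here
is a minimal sketch of a BPF consumer of the relocated tracepoint. It assumes a
typical libbpf/CO-RE setup with a generated vmlinux.h; the file name, program
name, and printed fields are illustrative and not part of the patch. The only
facts taken from the commit are that the tracepoint now fires before exit_mm()
(so p->mm is still set) and that it carries the new group_dead argument.

// SPDX-License-Identifier: GPL-2.0
/* exit_snoop.bpf.c -- illustrative sketch, not part of the patch */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

/*
 * Attach to the BTF-typed raw tracepoint. After this patch the handler
 * runs before exit_mm(), so p->mm is still populated, and the second
 * argument carries group_dead.
 */
SEC("tp_btf/sched_process_exit")
int BPF_PROG(handle_exit, struct task_struct *p, bool group_dead)
{
        struct mm_struct *mm = p->mm;

        if (!mm)
                return 0;       /* kernel threads have no mm */

        /* A real profiler would snapshot VMA/mm state here for later
         * stack symbolization; this sketch only logs the event. */
        bpf_printk("exit: pid=%d group_dead=%d", p->pid, group_dead);
        return 0;
}

Loading this with the usual libbpf skeleton flow needs nothing beyond a
standard tp_btf attach.
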
From fe6f600c43e0931f4fb5e1debb2311b7a6ebabd8 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Wed, 19 Mar 2025 20:54:36 +0100
Subject: exit: combine work under lock in synchronize_group_exit() and coredump_task_exit()

This reduces single-threaded overhead as it avoids one lock+irq trip on
exit.

It also improves scalability of spawning and killing threads within one
process (just shy of 5% when doing it on 24 cores on my test jig).

Both routines are moved below kcov and kmsan exit, which should be
harmless.

Link: https://lkml.kernel.org/r/20250319195436.1864415-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik
Reviewed-by: Oleg Nesterov
Signed-off-by: Andrew Morton
---
 kernel/exit.c | 68 ++++++++++++++++++++++++++++-------------------------------
 1 file changed, 32 insertions(+), 36 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index f1db86dcbeb1..921f28249b9b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -415,44 +415,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
 
-static void coredump_task_exit(struct task_struct *tsk)
+static void coredump_task_exit(struct task_struct *tsk,
+			       struct core_state *core_state)
 {
-	struct core_state *core_state;
+	struct core_thread self;
 
+	self.task = tsk;
+	if (self.task->flags & PF_SIGNALED)
+		self.next = xchg(&core_state->dumper.next, &self);
+	else
+		self.task = NULL;
 	/*
-	 * Serialize with any possible pending coredump.
-	 * We must hold siglock around checking core_state
-	 * and setting PF_POSTCOREDUMP. The core-inducing thread
-	 * will increment ->nr_threads for each thread in the
-	 * group without PF_POSTCOREDUMP set.
+	 * Implies mb(), the result of xchg() must be visible
+	 * to core_state->dumper.
 	 */
-	spin_lock_irq(&tsk->sighand->siglock);
-	tsk->flags |= PF_POSTCOREDUMP;
-	core_state = tsk->signal->core_state;
-	spin_unlock_irq(&tsk->sighand->siglock);
-	if (core_state) {
-		struct core_thread self;
-
-		self.task = current;
-		if (self.task->flags & PF_SIGNALED)
-			self.next = xchg(&core_state->dumper.next, &self);
-		else
-			self.task = NULL;
-		/*
-		 * Implies mb(), the result of xchg() must be visible
-		 * to core_state->dumper.
-		 */
-		if (atomic_dec_and_test(&core_state->nr_threads))
-			complete(&core_state->startup);
+	if (atomic_dec_and_test(&core_state->nr_threads))
+		complete(&core_state->startup);
 
-		for (;;) {
-			set_current_state(TASK_IDLE|TASK_FREEZABLE);
-			if (!self.task) /* see coredump_finish() */
-				break;
-			schedule();
-		}
-		__set_current_state(TASK_RUNNING);
+	for (;;) {
+		set_current_state(TASK_IDLE|TASK_FREEZABLE);
+		if (!self.task) /* see coredump_finish() */
+			break;
+		schedule();
 	}
+	__set_current_state(TASK_RUNNING);
 }
 
 #ifdef CONFIG_MEMCG
@@ -876,6 +862,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 {
 	struct sighand_struct *sighand = tsk->sighand;
 	struct signal_struct *signal = tsk->signal;
+	struct core_state *core_state;
 
 	spin_lock_irq(&sighand->siglock);
 	signal->quick_threads--;
@@ -885,7 +872,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 		signal->group_exit_code = code;
 		signal->group_stop_count = 0;
 	}
+	/*
+	 * Serialize with any possible pending coredump.
+	 * We must hold siglock around checking core_state
+	 * and setting PF_POSTCOREDUMP. The core-inducing thread
+	 * will increment ->nr_threads for each thread in the
+	 * group without PF_POSTCOREDUMP set.
+	 */
+	tsk->flags |= PF_POSTCOREDUMP;
+	core_state = signal->core_state;
 	spin_unlock_irq(&sighand->siglock);
+
+	if (unlikely(core_state))
+		coredump_task_exit(tsk, core_state);
 }
 
 void __noreturn do_exit(long code)
@@ -894,15 +893,12 @@ void __noreturn do_exit(long code)
 	int group_dead;
 
 	WARN_ON(irqs_disabled());
-
-	synchronize_group_exit(tsk, code);
-
 	WARN_ON(tsk->plug);
 
 	kcov_task_exit(tsk);
 	kmsan_task_exit(tsk);
 
-	coredump_task_exit(tsk);
+	synchronize_group_exit(tsk, code);
 	ptrace_event(PTRACE_EVENT_EXIT, code);
 	user_events_exit(tsk);
 
--
cgit v1.2.3
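
As a side note on the structure of the change above: the work that previously
required a second siglock critical section in coredump_task_exit() now rides
along in the critical section synchronize_group_exit() already takes, and only
the rare coredump wait runs afterwards without the lock. The standalone
userspace sketch below mirrors that shape; a pthread mutex stands in for
siglock (so the saved irq disable/enable pair on the real exit path is not
captured), and all names and values in it are illustrative.

/* one_lock_vs_two.c -- userspace analogy only, not kernel code */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;
static bool coredump_pending;   /* stands in for signal->core_state */
static int task_flags;          /* stands in for tsk->flags */
#define PF_POSTCOREDUMP 0x1     /* illustrative value only */

/* Before the patch: two separate critical sections on every exit. */
static void exit_path_before(void)
{
        pthread_mutex_lock(&siglock);           /* synchronize_group_exit() */
        /* ... group-exit bookkeeping ... */
        pthread_mutex_unlock(&siglock);

        pthread_mutex_lock(&siglock);           /* coredump_task_exit() */
        task_flags |= PF_POSTCOREDUMP;
        bool pending = coredump_pending;
        pthread_mutex_unlock(&siglock);

        if (pending)
                puts("wait for the coredump to finish");
}

/* After the patch: the coredump check is folded into the critical
 * section already held; the (rare) wait happens with the lock dropped. */
static void exit_path_after(void)
{
        pthread_mutex_lock(&siglock);
        /* ... group-exit bookkeeping ... */
        task_flags |= PF_POSTCOREDUMP;
        bool pending = coredump_pending;
        pthread_mutex_unlock(&siglock);

        if (pending)
                puts("wait for the coredump to finish");
}

int main(void)
{
        exit_path_before();
        exit_path_after();
        return 0;
}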