| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 20:42:01 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 20:42:01 -0800 |
| commit | 6c26fbe8c9d3e932dce6afe2505b19b4b261cae9 (patch) | |
| tree | 81cc40ecd2cde95b1b37937cf270cc0fa3832c43 /kernel | |
| parent | 63e6995005be8ceb8a1d56a18df1a1a40c28356d (diff) | |
| parent | 9929dffce5ed7e2988e0274f4db98035508b16d9 (diff) | |
Merge tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:
"Callchain support:
- Add support for deferred user-space stack unwinding for perf,
enabled on x86. (Peter Zijlstra, Steven Rostedt)
- unwind_user/x86: Enable frame pointer unwinding on x86 (Josh
Poimboeuf)
x86 PMU support and infrastructure:
- x86/insn: Simplify for_each_insn_prefix() (Peter Zijlstra)
- x86/insn,uprobes,alternative: Unify insn_is_nop() (Peter Zijlstra)
Intel PMU driver:
- Large series to prepare for and implement architectural PEBS
support for Intel platforms such as Clearwater Forest (CWF) and
Panther Lake (PTL). (Dapeng Mi, Kan Liang)
- Check dynamic constraints (Kan Liang)
- Optimize PEBS extended config (Peter Zijlstra)
- cstates:
- Remove PC3 support from LunarLake (Zhang Rui)
- Add Pantherlake support (Zhang Rui)
- Clearwater Forest support (Zide Chen)
AMD PMU driver:
- x86/amd: Check event before enable to avoid GPF (George Kennedy)
Fixes and cleanups:
- task_work: Fix NMI race condition (Peter Zijlstra)
- perf/x86: Fix NULL event access and potential PEBS record loss
(Dapeng Mi)
- Misc other fixes and cleanups (Dapeng Mi, Ingo Molnar, Peter
Zijlstra)"
* tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use
perf/x86/intel: Optimize PEBS extended config
perf/x86/intel: Check PEBS dyn_constraints
perf/x86/intel: Add a check for dynamic constraints
perf/x86/intel: Add counter group support for arch-PEBS
perf/x86/intel: Setup PEBS data configuration and enable legacy groups
perf/x86/intel: Update dyn_constraint base on PEBS event precise level
perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR
perf/x86/intel: Process arch-PEBS records or record fragments
perf/x86/intel/ds: Factor out PEBS group processing code to functions
perf/x86/intel/ds: Factor out PEBS record processing code to functions
perf/x86/intel: Initialize architectural PEBS
perf/x86/intel: Correct large PEBS flag check
perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call
perf/x86: Fix NULL event access and potential PEBS record loss
perf/x86: Remove redundant is_x86_event() prototype
entry,unwind/deferred: Fix unwind_reset_info() placement
unwind_user/x86: Fix arch=um build
perf: Support deferred user unwind
unwind_user/x86: Teach FP unwind about start of function
...
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/stackmap.c | 4 |
| -rw-r--r-- | kernel/events/callchain.c | 14 |
| -rw-r--r-- | kernel/events/core.c | 78 |
| -rw-r--r-- | kernel/exit.c | 7 |
| -rw-r--r-- | kernel/task_work.c | 8 |
| -rw-r--r-- | kernel/unwind/deferred.c | 44 |
| -rw-r--r-- | kernel/unwind/user.c | 59 |
7 files changed, 176 insertions, 38 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4d53cdd1374c..8f1dacaf01fe 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
         max_depth = sysctl_perf_event_max_stack;
 
         trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                   false, false);
+                                   false, false, 0);
 
         if (unlikely(!trace))
                 /* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                 trace = get_callchain_entry_for_task(task, max_depth);
         else
                 trace = get_perf_callchain(regs, kernel, user, max_depth,
-                                           crosstask, false);
+                                           crosstask, false, 0);
 
         if (unlikely(!trace) || trace->nr < skip) {
                 if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31fa..b9c7e00725d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
-                   u32 max_stack, bool crosstask, bool add_mark)
+                   u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
 {
         struct perf_callchain_entry *entry;
         struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
                         regs = task_pt_regs(current);
                 }
 
+                if (defer_cookie) {
+                        /*
+                         * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+                         * which can be stitched to this one, and add
+                         * the cookie after it (it will be cut off when the
+                         * user stack is copied to the callchain).
+                         */
+                        perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+                        perf_callchain_store_context(&ctx, defer_cookie);
+                        goto exit_put;
+                }
+
                 if (add_mark)
                         perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2c35acc2722b..ece716879cbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -56,6 +56,7 @@
 #include <linux/buildid.h>
 #include <linux/task_work.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
 
 #include "internal.h"
 
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr)
 
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
+static struct unwind_work perf_unwind_work;
+
 struct perf_callchain_entry *
 perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
                        !(current->flags & (PF_KTHREAD | PF_USER_WORKER));
         /* Disallow cross-task user callchains. */
         bool crosstask = event->ctx->task && event->ctx->task != current;
+        bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+                          event->attr.defer_callchain;
         const u32 max_stack = event->attr.sample_max_stack;
         struct perf_callchain_entry *callchain;
+        u64 defer_cookie;
 
         if (!current->mm)
                 user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
         if (!kernel && !user)
                 return &__empty_callchain;
 
-        callchain = get_perf_callchain(regs, kernel, user,
-                                       max_stack, crosstask, true);
+        if (!(user && defer_user && !crosstask &&
+              unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+                defer_cookie = 0;
+
+        callchain = get_perf_callchain(regs, kernel, user, max_stack,
+                                       crosstask, true, defer_cookie);
+
         return callchain ?: &__empty_callchain;
 }
 
@@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
         perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
 }
 
+struct perf_callchain_deferred_event {
+        struct unwind_stacktrace        *trace;
+        struct {
+                struct perf_event_header        header;
+                u64                             cookie;
+                u64                             nr;
+                u64                             ips[];
+        } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+        struct perf_callchain_deferred_event *deferred_event = data;
+        struct perf_output_handle handle;
+        struct perf_sample_data sample;
+        int ret, size = deferred_event->event.header.size;
+
+        if (!event->attr.defer_output)
+                return;
+
+        /* XXX do we really need sample_id_all for this ??? */
+        perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+        ret = perf_output_begin(&handle, &sample, event,
+                                deferred_event->event.header.size);
+        if (ret)
+                goto out;
+
+        perf_output_put(&handle, deferred_event->event);
+        for (int i = 0; i < deferred_event->trace->nr; i++) {
+                u64 entry = deferred_event->trace->entries[i];
+                perf_output_put(&handle, entry);
+        }
+        perf_event__output_id_sample(event, &handle, &sample);
+
+        perf_output_end(&handle);
+out:
+        deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+                                          struct unwind_stacktrace *trace, u64 cookie)
+{
+        struct perf_callchain_deferred_event deferred_event = {
+                .trace = trace,
+                .event = {
+                        .header = {
+                                .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+                                .misc = PERF_RECORD_MISC_USER,
+                                .size = sizeof(deferred_event.event) +
+                                        (trace->nr * sizeof(u64)),
+                        },
+                        .cookie = cookie,
+                        .nr = trace->nr,
+                },
+        };
+
+        perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
 struct perf_text_poke_event {
         const void *old_bytes;
         const void *new_bytes;
@@ -14809,6 +14880,9 @@ void __init perf_event_init(void)
 
         idr_init(&pmu_idr);
 
+        unwind_deferred_init(&perf_unwind_work,
+                             perf_unwind_deferred_callback);
+
         perf_event_init_all_cpus();
         init_srcu_struct(&pmus_srcu);
         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
diff --git a/kernel/exit.c b/kernel/exit.c
index 988e16efd66b..fdfd05d1826c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -940,7 +940,6 @@ void __noreturn do_exit(long code)
 
         tsk->exit_code = code;
         taskstats_exit(tsk, group_dead);
-        unwind_deferred_task_exit(tsk);
         trace_sched_process_exit(tsk, group_dead);
 
         /*
@@ -951,6 +950,12 @@ void __noreturn do_exit(long code)
          * gets woken up by child-exit notifications.
          */
         perf_event_exit_task(tsk);
+        /*
+         * PF_EXITING (above) ensures unwind_deferred_request() will no
+         * longer add new unwinds. While exit_mm() (below) will destroy the
+         * abaility to do unwinds. So flush any pending unwinds here.
+         */
+        unwind_deferred_task_exit(tsk);
 
         exit_mm();
 
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d1efec571a4a..0f7519f8e7c9 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
 #ifdef CONFIG_IRQ_WORK
 static void task_work_set_notify_irq(struct irq_work *entry)
 {
-        test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+        /*
+         * no-op IPI
+         *
+         * TWA_NMI_CURRENT will already have set the TIF flag, all
+         * this interrupt does it tickle the return-to-user path.
+         */
 }
 static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
         IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
@@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
                 break;
 #ifdef CONFIG_IRQ_WORK
         case TWA_NMI_CURRENT:
+                set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
                 irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
                 break;
 #endif
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index dc6040aae3ee..a88fb481c4a3 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu);
 
 static inline bool unwind_pending(struct unwind_task_info *info)
 {
-        return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+        return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
 }
 
 /*
@@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info)
 {
         u32 cnt = 1;
 
+        lockdep_assert_irqs_disabled();
+
         if (info->id.cpu)
                 return info->id.id;
 
@@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 
         cache = info->cache;
         trace->entries = cache->entries;
-
-        if (cache->nr_entries) {
-                /*
-                 * The user stack has already been previously unwound in this
-                 * entry context. Skip the unwind and use the cache.
-                 */
-                trace->nr = cache->nr_entries;
+        trace->nr = cache->nr_entries;
+        /*
+         * The user stack has already been previously unwound in this
+         * entry context. Skip the unwind and use the cache.
+         */
+        if (trace->nr)
                 return 0;
-        }
-
-        trace->nr = 0;
         unwind_user(trace, UNWIND_MAX_ENTRIES);
 
         cache->nr_entries = trace->nr;
 
         /* Clear nr_entries on way back to user space */
-        set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+        atomic_long_or(UNWIND_USED, &info->unwind_mask);
 
         return 0;
 }
 
@@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task)
         /* Clear pending bit but make sure to have the current bits */
         bits = atomic_long_fetch_andnot(UNWIND_PENDING,
-                                        (atomic_long_t *)&info->unwind_mask);
+                                        &info->unwind_mask);
 
         /*
          * From here on out, the callback must always be called, even if it's
          * just an empty trace.
@@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task)
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 {
         struct unwind_task_info *info = &current->unwind_info;
+        int twa_mode = TWA_RESUME;
         unsigned long old, bits;
         unsigned long bit;
         int ret;
@@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
          * Trigger a warning to make it obvious that an architecture
          * is using this in NMI when it should not be.
          */
-        if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
-                return -EINVAL;
+        if (in_nmi()) {
+                if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
+                        return -EINVAL;
+                twa_mode = TWA_NMI_CURRENT;
+        }
 
         /* Do not allow cancelled works to request again */
         bit = READ_ONCE(work->bit);
@@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 
         *cookie = get_cookie(info);
 
-        old = READ_ONCE(info->unwind_mask);
+        old = atomic_long_read(&info->unwind_mask);
 
         /* Is this already queued or executed */
         if (old & bit)
@@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
          * to have a callback.
          */
         bits = UNWIND_PENDING | bit;
-        old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+        old = atomic_long_fetch_or(bits, &info->unwind_mask);
         if (old & bits) {
                 /*
                  * If the work's bit was set, whatever set it had better
@@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
         }
 
         /* The work has been claimed, now schedule it. */
-        ret = task_work_add(current, &info->work, TWA_RESUME);
+        ret = task_work_add(current, &info->work, twa_mode);
 
         if (WARN_ON_ONCE(ret))
-                WRITE_ONCE(info->unwind_mask, 0);
+                atomic_long_set(&info->unwind_mask, 0);
 
         return ret;
 }
 
@@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work)
         guard(rcu)();
         /* Clear this bit from all threads */
         for_each_process_thread(g, t) {
-                clear_bit(bit, &t->unwind_info.unwind_mask);
+                atomic_long_andnot(BIT(bit),
+                                   &t->unwind_info.unwind_mask);
                 if (t->unwind_info.cache)
                         clear_bit(bit, &t->unwind_info.cache->unwind_completed);
         }
@@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task)
 
         memset(info, 0, sizeof(*info));
         init_task_work(&info->work, unwind_deferred_task_work);
-        info->unwind_mask = 0;
+        atomic_long_set(&info->unwind_mask, 0);
 }
 
 void unwind_task_free(struct task_struct *task)
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 97a8415e3216..39e270789444 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,18 +8,28 @@
 #include <linux/unwind_user.h>
 #include <linux/uaccess.h>
 
-static const struct unwind_user_frame fp_frame = {
-        ARCH_INIT_USER_FP_FRAME
-};
-
 #define for_each_user_frame(state) \
         for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
 
-static int unwind_user_next_fp(struct unwind_user_state *state)
+static inline int
+get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws)
+{
+        unsigned long __user *addr = (void __user *)base + off;
+#ifdef CONFIG_COMPAT
+        if (ws == sizeof(int)) {
+                unsigned int data;
+                int ret = get_user(data, (unsigned int __user *)addr);
+                *word = data;
+                return ret;
+        }
+#endif
+        return get_user(*word, addr);
+}
+
+static int unwind_user_next_common(struct unwind_user_state *state,
+                                   const struct unwind_user_frame *frame)
 {
-        const struct unwind_user_frame *frame = &fp_frame;
         unsigned long cfa, fp, ra;
-        unsigned int shift;
 
         if (frame->use_fp) {
                 if (state->fp < state->sp)
@@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
                 return -EINVAL;
 
         /* Make sure that the address is word aligned */
-        shift = sizeof(long) == 4 ? 2 : 3;
-        if (cfa & ((1 << shift) - 1))
+        if (cfa & (state->ws - 1))
                 return -EINVAL;
 
         /* Find the Return Address (RA) */
-        if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+        if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
                 return -EINVAL;
 
-        if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+        if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
                 return -EINVAL;
 
         state->ip = ra;
         state->sp = cfa;
         if (frame->fp_off)
                 state->fp = fp;
+        state->topmost = false;
 
         return 0;
 }
 
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+        struct pt_regs *regs = task_pt_regs(current);
+
+        if (state->topmost && unwind_user_at_function_start(regs)) {
+                const struct unwind_user_frame fp_entry_frame = {
+                        ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws)
+                };
+                return unwind_user_next_common(state, &fp_entry_frame);
+        }
+
+        const struct unwind_user_frame fp_frame = {
+                ARCH_INIT_USER_FP_FRAME(state->ws)
+        };
+        return unwind_user_next_common(state, &fp_frame);
+#else
+        return -EINVAL;
+#endif
+}
+
 static int unwind_user_next(struct unwind_user_state *state)
 {
         unsigned long iter_mask = state->available_types;
@@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state)
         state->ip = instruction_pointer(regs);
         state->sp = user_stack_pointer(regs);
         state->fp = frame_pointer(regs);
+        state->ws = unwind_user_word_size(regs);
+        if (!state->ws) {
+                state->done = true;
+                return -EINVAL;
+        }
+        state->topmost = true;
 
         return 0;
 }
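Taken together, the perf side of the deferred unwind works in two halves: the NMI-time sample ends with a PERF_CONTEXT_USER_DEFERRED marker followed by a cookie (see get_perf_callchain() above), and the user frames arrive later in a PERF_RECORD_CALLCHAIN_DEFERRED record whose body matches the struct emitted by perf_callchain_deferred_output(): header, cookie, nr, ips[]. Below is a minimal user-space sketch of how a ring-buffer consumer might stitch the two back together by cookie; the reader-side struct layouts, the marker's numeric value, and the helper names are assumptions for illustration, not part of this merge.

```c
/*
 * Hypothetical consumer-side stitching of deferred callchains.
 * Assumes deferred_record mirrors the kernel's
 * perf_callchain_deferred_event.event layout (minus the header) and that
 * the sample callchain ends with <PERF_CONTEXT_USER_DEFERRED, cookie>.
 */
#include <stdint.h>
#include <stdio.h>

#ifndef PERF_CONTEXT_USER_DEFERRED
#define PERF_CONTEXT_USER_DEFERRED ((uint64_t)-640)	/* placeholder; take the real value from uapi perf_event.h */
#endif

/* Body of a PERF_RECORD_CALLCHAIN_DEFERRED record, after the event header. */
struct deferred_record {
	uint64_t cookie;
	uint64_t nr;
	uint64_t ips[];
};

/* A callchain as read from a PERF_SAMPLE_CALLCHAIN sample. */
struct sample_callchain {
	uint64_t nr;
	uint64_t ips[];
};

/*
 * Return the deferral cookie if this sample's callchain was truncated at the
 * user boundary (it then ends with the marker followed by the cookie), else 0.
 */
static uint64_t sample_deferred_cookie(const struct sample_callchain *cc)
{
	if (cc->nr >= 2 && cc->ips[cc->nr - 2] == PERF_CONTEXT_USER_DEFERRED)
		return cc->ips[cc->nr - 1];
	return 0;
}

/* Append the deferred user frames to a sample carrying the same cookie. */
static void stitch(const struct sample_callchain *cc,
		   const struct deferred_record *dr)
{
	if (sample_deferred_cookie(cc) != dr->cookie)
		return;	/* belongs to a different deferral request */

	for (uint64_t i = 0; i < dr->nr; i++)
		printf("  user frame %llu: %#llx\n",
		       (unsigned long long)i, (unsigned long long)dr->ips[i]);
}
```

The cookie is what makes the scheme work across contexts: it is allocated per task/entry in get_cookie(), handed back to perf by unwind_deferred_request() at NMI time, and repeated in the task_work-time record, so several samples taken before the task returns to user space can all be matched to the single unwind that eventually runs.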