authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 20:42:01 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 20:42:01 -0800
commit6c26fbe8c9d3e932dce6afe2505b19b4b261cae9 (patch)
tree81cc40ecd2cde95b1b37937cf270cc0fa3832c43 /kernel
parent63e6995005be8ceb8a1d56a18df1a1a40c28356d (diff)
parent9929dffce5ed7e2988e0274f4db98035508b16d9 (diff)
Merge tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:

 "Callchain support:

   - Add support for deferred user-space stack unwinding for perf,
     enabled on x86. (Peter Zijlstra, Steven Rostedt)

   - unwind_user/x86: Enable frame pointer unwinding on x86 (Josh Poimboeuf)

  x86 PMU support and infrastructure:

   - x86/insn: Simplify for_each_insn_prefix() (Peter Zijlstra)

   - x86/insn,uprobes,alternative: Unify insn_is_nop() (Peter Zijlstra)

  Intel PMU driver:

   - Large series to prepare for and implement architectural PEBS
     support for Intel platforms such as Clearwater Forest (CWF) and
     Panther Lake (PTL). (Dapeng Mi, Kan Liang)

   - Check dynamic constraints (Kan Liang)

   - Optimize PEBS extended config (Peter Zijlstra)

   - cstates:
      - Remove PC3 support from LunarLake (Zhang Rui)
      - Add Pantherlake support (Zhang Rui)
      - Clearwater Forest support (Zide Chen)

  AMD PMU driver:

   - x86/amd: Check event before enable to avoid GPF (George Kennedy)

  Fixes and cleanups:

   - task_work: Fix NMI race condition (Peter Zijlstra)

   - perf/x86: Fix NULL event access and potential PEBS record loss (Dapeng Mi)

   - Misc other fixes and cleanups (Dapeng Mi, Ingo Molnar, Peter Zijlstra)"

* tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
  perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use
  perf/x86/intel: Optimize PEBS extended config
  perf/x86/intel: Check PEBS dyn_constraints
  perf/x86/intel: Add a check for dynamic constraints
  perf/x86/intel: Add counter group support for arch-PEBS
  perf/x86/intel: Setup PEBS data configuration and enable legacy groups
  perf/x86/intel: Update dyn_constraint base on PEBS event precise level
  perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR
  perf/x86/intel: Process arch-PEBS records or record fragments
  perf/x86/intel/ds: Factor out PEBS group processing code to functions
  perf/x86/intel/ds: Factor out PEBS record processing code to functions
  perf/x86/intel: Initialize architectural PEBS
  perf/x86/intel: Correct large PEBS flag check
  perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call
  perf/x86: Fix NULL event access and potential PEBS record loss
  perf/x86: Remove redundant is_x86_event() prototype
  entry,unwind/deferred: Fix unwind_reset_info() placement
  unwind_user/x86: Fix arch=um build
  perf: Support deferred user unwind
  unwind_user/x86: Teach FP unwind about start of function
  ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/stackmap.c4
-rw-r--r--kernel/events/callchain.c14
-rw-r--r--kernel/events/core.c78
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/task_work.c8
-rw-r--r--kernel/unwind/deferred.c44
-rw-r--r--kernel/unwind/user.c59
7 files changed, 176 insertions, 38 deletions
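The deferred user callchain support described in the pull message is requested through two new perf_event_attr bits, defer_callchain and defer_output, which the kernel-side hunks below test as event->attr.defer_callchain and event->attr.defer_output. A minimal user-space sketch of opening such an event follows; the exact uAPI encoding of the new bits is not part of this diff, so they are shown only as comments and everything else sticks to the long-standing perf_event_open() interface.

/*
 * Hypothetical sketch: open a CPU-cycles sampling event that asks for
 * deferred user callchains. attr.defer_callchain / attr.defer_output
 * mirror the kernel-side attribute names in this merge; their exact
 * uAPI bit positions are an assumption and are left commented out so
 * the sketch builds against existing headers.
 */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_deferred_callchain_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_CALLCHAIN;
	attr.sample_max_stack = 64;
	/* attr.defer_callchain = 1; */	/* assumed new uAPI bit */
	/* attr.defer_output = 1; */	/* assumed new uAPI bit */

	/* Profile the calling thread on any CPU. */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}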
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4d53cdd1374c..8f1dacaf01fe 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
max_depth = sysctl_perf_event_max_stack;
trace = get_perf_callchain(regs, kernel, user, max_depth,
- false, false);
+ false, false, 0);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, kernel, user, max_depth,
- crosstask, false);
+ crosstask, false, 0);
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31fa..b9c7e00725d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark)
+ u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
regs = task_pt_regs(current);
}
+ if (defer_cookie) {
+ /*
+ * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+ * which can be stitched to this one, and add
+ * the cookie after it (it will be cut off when the
+ * user stack is copied to the callchain).
+ */
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+ perf_callchain_store_context(&ctx, defer_cookie);
+ goto exit_put;
+ }
+
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
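With that change, a sample taken in interrupt/NMI context carries only the kernel frames plus a two-word placeholder for the user part. The following illustration shows the resulting PERF_SAMPLE_CALLCHAIN payload as a consumer would see it; the kernel addresses and the cookie are made-up example values, only the two context markers and their ordering come from the stores above.

/*
 * Illustration only: ips[0..nr-1] of a sample whose user portion was
 * deferred. Addresses and the cookie are placeholder values;
 * PERF_CONTEXT_USER_DEFERRED is new in this merge.
 */
#include <linux/perf_event.h>

__u64 example_ips[] = {
	PERF_CONTEXT_KERNEL,		/* add_mark for the kernel side        */
	0xffffffff81123456,		/* kernel frames, unwound immediately  */
	0xffffffff81011223,
	PERF_CONTEXT_USER_DEFERRED,	/* user frames will arrive later ...   */
	0x0000000300000007,		/* ... in a PERF_RECORD_CALLCHAIN_DEFERRED
					 * record carrying this defer_cookie   */
};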
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2c35acc2722b..ece716879cbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -56,6 +56,7 @@
#include <linux/buildid.h>
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
#include "internal.h"
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr)
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+static struct unwind_work perf_unwind_work;
+
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+ event->attr.defer_callchain;
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ u64 defer_cookie;
if (!current->mm)
user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, kernel, user,
- max_stack, crosstask, true);
+ if (!(user && defer_user && !crosstask &&
+ unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+ defer_cookie = 0;
+
+ callchain = get_perf_callchain(regs, kernel, user, max_stack,
+ crosstask, true, defer_cookie);
+
return callchain ?: &__empty_callchain;
}
@@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_callchain_deferred_event {
+ struct unwind_stacktrace *trace;
+ struct {
+ struct perf_event_header header;
+ u64 cookie;
+ u64 nr;
+ u64 ips[];
+ } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+ struct perf_callchain_deferred_event *deferred_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret, size = deferred_event->event.header.size;
+
+ if (!event->attr.defer_output)
+ return;
+
+ /* XXX do we really need sample_id_all for this ??? */
+ perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ deferred_event->event.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, deferred_event->event);
+ for (int i = 0; i < deferred_event->trace->nr; i++) {
+ u64 entry = deferred_event->trace->entries[i];
+ perf_output_put(&handle, entry);
+ }
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+ struct unwind_stacktrace *trace, u64 cookie)
+{
+ struct perf_callchain_deferred_event deferred_event = {
+ .trace = trace,
+ .event = {
+ .header = {
+ .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+ .misc = PERF_RECORD_MISC_USER,
+ .size = sizeof(deferred_event.event) +
+ (trace->nr * sizeof(u64)),
+ },
+ .cookie = cookie,
+ .nr = trace->nr,
+ },
+ };
+
+ perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
struct perf_text_poke_event {
const void *old_bytes;
const void *new_bytes;
@@ -14809,6 +14880,9 @@ void __init perf_event_init(void)
idr_init(&pmu_idr);
+ unwind_deferred_init(&perf_unwind_work,
+ perf_unwind_deferred_callback);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
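The PERF_RECORD_CALLCHAIN_DEFERRED record emitted by perf_callchain_deferred_output() above has a fixed body: header, cookie, nr, then nr user frame addresses, followed by the usual optional sample_id trailer. Below is a hedged tool-side sketch of stitching such a record onto earlier deferred samples by cookie; the record layout mirrors perf_callchain_deferred_event above, while the pending_* helpers and struct names are invented for illustration.

#include <linux/perf_event.h>

/* Matches the kernel-side perf_callchain_deferred_event body. */
struct callchain_deferred_record {
	struct perf_event_header header;	/* PERF_RECORD_CALLCHAIN_DEFERRED */
	__u64 cookie;				/* matches the deferred sample(s) */
	__u64 nr;
	__u64 ips[];				/* user frames */
};

/* Hypothetical tool state and helpers, declarations only. */
struct pending_sample;
struct pending_sample *pending_lookup(__u64 cookie);
struct pending_sample *pending_next(struct pending_sample *ps);
void append_user_frames(struct pending_sample *ps, __u64 *ips, __u64 nr);

static void handle_deferred_record(struct callchain_deferred_record *rec)
{
	/*
	 * Walk every buffered sample whose callchain ended with
	 * PERF_CONTEXT_USER_DEFERRED + this cookie and splice in the user
	 * frames delivered here.
	 */
	struct pending_sample *ps;

	for (ps = pending_lookup(rec->cookie); ps; ps = pending_next(ps))
		append_user_frames(ps, rec->ips, rec->nr);
}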
diff --git a/kernel/exit.c b/kernel/exit.c
index 988e16efd66b..fdfd05d1826c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -940,7 +940,6 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
- unwind_deferred_task_exit(tsk);
trace_sched_process_exit(tsk, group_dead);
/*
@@ -951,6 +950,12 @@ void __noreturn do_exit(long code)
* gets woken up by child-exit notifications.
*/
perf_event_exit_task(tsk);
+ /*
+ * PF_EXITING (above) ensures unwind_deferred_request() will no
+ * longer add new unwinds. While exit_mm() (below) will destroy the
+	 * ability to do unwinds. So flush any pending unwinds here.
+ */
+ unwind_deferred_task_exit(tsk);
exit_mm();
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d1efec571a4a..0f7519f8e7c9 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
#ifdef CONFIG_IRQ_WORK
static void task_work_set_notify_irq(struct irq_work *entry)
{
- test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ /*
+ * no-op IPI
+ *
+ * TWA_NMI_CURRENT will already have set the TIF flag, all
+ * this interrupt does it tickle the return-to-user path.
+ */
}
static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
@@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
break;
#ifdef CONFIG_IRQ_WORK
case TWA_NMI_CURRENT:
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
break;
#endif
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index dc6040aae3ee..a88fb481c4a3 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu);
static inline bool unwind_pending(struct unwind_task_info *info)
{
- return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+ return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
}
/*
@@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info)
{
u32 cnt = 1;
+ lockdep_assert_irqs_disabled();
+
if (info->id.cpu)
return info->id.id;
@@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
cache = info->cache;
trace->entries = cache->entries;
-
- if (cache->nr_entries) {
- /*
- * The user stack has already been previously unwound in this
- * entry context. Skip the unwind and use the cache.
- */
- trace->nr = cache->nr_entries;
+ trace->nr = cache->nr_entries;
+ /*
+ * The user stack has already been previously unwound in this
+ * entry context. Skip the unwind and use the cache.
+ */
+ if (trace->nr)
return 0;
- }
- trace->nr = 0;
unwind_user(trace, UNWIND_MAX_ENTRIES);
cache->nr_entries = trace->nr;
/* Clear nr_entries on way back to user space */
- set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+ atomic_long_or(UNWIND_USED, &info->unwind_mask);
return 0;
}
@@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task)
/* Clear pending bit but make sure to have the current bits */
bits = atomic_long_fetch_andnot(UNWIND_PENDING,
- (atomic_long_t *)&info->unwind_mask);
+ &info->unwind_mask);
/*
* From here on out, the callback must always be called, even if it's
* just an empty trace.
@@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task)
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
struct unwind_task_info *info = &current->unwind_info;
+ int twa_mode = TWA_RESUME;
unsigned long old, bits;
unsigned long bit;
int ret;
@@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
* Trigger a warning to make it obvious that an architecture
* is using this in NMI when it should not be.
*/
- if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
- return -EINVAL;
+ if (in_nmi()) {
+ if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
+ return -EINVAL;
+ twa_mode = TWA_NMI_CURRENT;
+ }
/* Do not allow cancelled works to request again */
bit = READ_ONCE(work->bit);
@@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
*cookie = get_cookie(info);
- old = READ_ONCE(info->unwind_mask);
+ old = atomic_long_read(&info->unwind_mask);
/* Is this already queued or executed */
if (old & bit)
@@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
* to have a callback.
*/
bits = UNWIND_PENDING | bit;
- old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+ old = atomic_long_fetch_or(bits, &info->unwind_mask);
if (old & bits) {
/*
* If the work's bit was set, whatever set it had better
@@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
}
/* The work has been claimed, now schedule it. */
- ret = task_work_add(current, &info->work, TWA_RESUME);
+ ret = task_work_add(current, &info->work, twa_mode);
if (WARN_ON_ONCE(ret))
- WRITE_ONCE(info->unwind_mask, 0);
+ atomic_long_set(&info->unwind_mask, 0);
return ret;
}
@@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work)
guard(rcu)();
/* Clear this bit from all threads */
for_each_process_thread(g, t) {
- clear_bit(bit, &t->unwind_info.unwind_mask);
+ atomic_long_andnot(BIT(bit),
+ &t->unwind_info.unwind_mask);
if (t->unwind_info.cache)
clear_bit(bit, &t->unwind_info.cache->unwind_completed);
}
@@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task)
memset(info, 0, sizeof(*info));
init_task_work(&info->work, unwind_deferred_task_work);
- info->unwind_mask = 0;
+ atomic_long_set(&info->unwind_mask, 0);
}
void unwind_task_free(struct task_struct *task)
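Taken together with the core.c hunks above, the in-kernel usage pattern for the deferred unwinder is: initialize an unwind_work with a callback once, call unwind_deferred_request() from the (possibly NMI) event context, and receive the user stacktrace in the callback via task_work just before the task returns to user space. A condensed sketch for a hypothetical tracer_* subsystem follows; only the unwind_* calls and the callback signature are the API shown in this diff, the tracer_* helpers are assumptions.

#include <linux/unwind_deferred.h>

static struct unwind_work tracer_unwind_work;	/* hypothetical subsystem */

/* Hypothetical bookkeeping helpers, declarations only. */
void tracer_emit_user_stack(u64 cookie, unsigned long *entries, unsigned int nr);
void tracer_note_pending(u64 cookie);

/* Runs from task_work on the return-to-user path. */
static void tracer_unwind_cb(struct unwind_work *work,
			     struct unwind_stacktrace *trace, u64 cookie)
{
	/*
	 * trace->entries[0..trace->nr-1] hold the user frames (possibly
	 * empty); cookie matches the value handed out at request time.
	 */
	tracer_emit_user_stack(cookie, trace->entries, trace->nr);
}

static void __init tracer_init(void)
{
	unwind_deferred_init(&tracer_unwind_work, tracer_unwind_cb);
}

/* Called from the event/NMI path, like perf_callchain() above. */
static void tracer_event(void)
{
	u64 cookie;

	/* A negative return means no callback will come (e.g. the task
	 * is exiting); duplicate requests in the same entry context get
	 * the same cookie. */
	if (unwind_deferred_request(&tracer_unwind_work, &cookie) < 0)
		return;

	tracer_note_pending(cookie);
}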
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 97a8415e3216..39e270789444 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,18 +8,28 @@
#include <linux/unwind_user.h>
#include <linux/uaccess.h>
-static const struct unwind_user_frame fp_frame = {
- ARCH_INIT_USER_FP_FRAME
-};
-
#define for_each_user_frame(state) \
for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
-static int unwind_user_next_fp(struct unwind_user_state *state)
+static inline int
+get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws)
+{
+ unsigned long __user *addr = (void __user *)base + off;
+#ifdef CONFIG_COMPAT
+ if (ws == sizeof(int)) {
+ unsigned int data;
+ int ret = get_user(data, (unsigned int __user *)addr);
+ *word = data;
+ return ret;
+ }
+#endif
+ return get_user(*word, addr);
+}
+
+static int unwind_user_next_common(struct unwind_user_state *state,
+ const struct unwind_user_frame *frame)
{
- const struct unwind_user_frame *frame = &fp_frame;
unsigned long cfa, fp, ra;
- unsigned int shift;
if (frame->use_fp) {
if (state->fp < state->sp)
@@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
return -EINVAL;
/* Make sure that the address is word aligned */
- shift = sizeof(long) == 4 ? 2 : 3;
- if (cfa & ((1 << shift) - 1))
+ if (cfa & (state->ws - 1))
return -EINVAL;
/* Find the Return Address (RA) */
- if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+ if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
return -EINVAL;
- if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
return -EINVAL;
state->ip = ra;
state->sp = cfa;
if (frame->fp_off)
state->fp = fp;
+ state->topmost = false;
return 0;
}
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+ struct pt_regs *regs = task_pt_regs(current);
+
+ if (state->topmost && unwind_user_at_function_start(regs)) {
+ const struct unwind_user_frame fp_entry_frame = {
+ ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws)
+ };
+ return unwind_user_next_common(state, &fp_entry_frame);
+ }
+
+ const struct unwind_user_frame fp_frame = {
+ ARCH_INIT_USER_FP_FRAME(state->ws)
+ };
+ return unwind_user_next_common(state, &fp_frame);
+#else
+ return -EINVAL;
+#endif
+}
+
static int unwind_user_next(struct unwind_user_state *state)
{
unsigned long iter_mask = state->available_types;
@@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state)
state->ip = instruction_pointer(regs);
state->sp = user_stack_pointer(regs);
state->fp = frame_pointer(regs);
+ state->ws = unwind_user_word_size(regs);
+ if (!state->ws) {
+ state->done = true;
+ return -EINVAL;
+ }
+ state->topmost = true;
return 0;
}
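For reference, the unwind_user_frame fields consumed by unwind_user_next_common() (use_fp, ra_off, fp_off and a CFA offset, named cfa_off in the wider series) describe one step of the classic frame-pointer walk. Below is a hedged illustration of what ARCH_INIT_USER_FP_FRAME(ws) amounts to for a 64-bit frame; the concrete macro lives in the arch headers rather than in this diff, so the numbers are the textbook x86-64 layout for ws == 8, not a quote, and with the new word-size argument they would scale down for compat tasks.

/*
 * Assumed 64-bit layout after "push %rbp; mov %rsp,%rbp":
 *
 *   [rbp + 8]  return address          ->  ra_off  = -8  (from CFA)
 *   [rbp + 0]  caller's frame pointer  ->  fp_off  = -16 (from CFA)
 *   CFA = rbp + 16                     ->  cfa_off = 16, use_fp = true
 *
 * Field names follow the unwind_user series; cfa_off is assumed here,
 * as only use_fp/ra_off/fp_off appear in this hunk.
 */
#include <linux/unwind_user.h>

static const struct unwind_user_frame fp_frame_example = {
	.cfa_off = 16,		/* CFA = fp + 16      */
	.ra_off  = -8,		/* RA  = *(CFA - 8)   */
	.fp_off  = -16,		/* FP  = *(CFA - 16)  */
	.use_fp  = true,
};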