authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 20:42:01 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 20:42:01 -0800
commit6c26fbe8c9d3e932dce6afe2505b19b4b261cae9 (patch)
tree81cc40ecd2cde95b1b37937cf270cc0fa3832c43 /kernel
parent63e6995005be8ceb8a1d56a18df1a1a40c28356d (diff)
parent9929dffce5ed7e2988e0274f4db98035508b16d9 (diff)
Merge tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:

 "Callchain support:

   - Add support for deferred user-space stack unwinding for perf,
     enabled on x86. (Peter Zijlstra, Steven Rostedt)

   - unwind_user/x86: Enable frame pointer unwinding on x86 (Josh Poimboeuf)

  x86 PMU support and infrastructure:

   - x86/insn: Simplify for_each_insn_prefix() (Peter Zijlstra)

   - x86/insn,uprobes,alternative: Unify insn_is_nop() (Peter Zijlstra)

  Intel PMU driver:

   - Large series to prepare for and implement architectural PEBS
     support for Intel platforms such as Clearwater Forest (CWF) and
     Panther Lake (PTL). (Dapeng Mi, Kan Liang)

   - Check dynamic constraints (Kan Liang)

   - Optimize PEBS extended config (Peter Zijlstra)

   - cstates:
      - Remove PC3 support from LunarLake (Zhang Rui)
      - Add Pantherlake support (Zhang Rui)
      - Clearwater Forest support (Zide Chen)

  AMD PMU driver:

   - x86/amd: Check event before enable to avoid GPF (George Kennedy)

  Fixes and cleanups:

   - task_work: Fix NMI race condition (Peter Zijlstra)

   - perf/x86: Fix NULL event access and potential PEBS record loss (Dapeng Mi)

   - Misc other fixes and cleanups (Dapeng Mi, Ingo Molnar, Peter Zijlstra)"

* tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
  perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use
  perf/x86/intel: Optimize PEBS extended config
  perf/x86/intel: Check PEBS dyn_constraints
  perf/x86/intel: Add a check for dynamic constraints
  perf/x86/intel: Add counter group support for arch-PEBS
  perf/x86/intel: Setup PEBS data configuration and enable legacy groups
  perf/x86/intel: Update dyn_constraint base on PEBS event precise level
  perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR
  perf/x86/intel: Process arch-PEBS records or record fragments
  perf/x86/intel/ds: Factor out PEBS group processing code to functions
  perf/x86/intel/ds: Factor out PEBS record processing code to functions
  perf/x86/intel: Initialize architectural PEBS
  perf/x86/intel: Correct large PEBS flag check
  perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call
  perf/x86: Fix NULL event access and potential PEBS record loss
  perf/x86: Remove redundant is_x86_event() prototype
  entry,unwind/deferred: Fix unwind_reset_info() placement
  unwind_user/x86: Fix arch=um build
  perf: Support deferred user unwind
  unwind_user/x86: Teach FP unwind about start of function
  ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/stackmap.c4
-rw-r--r--kernel/events/callchain.c14
-rw-r--r--kernel/events/core.c78
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/task_work.c8
-rw-r--r--kernel/unwind/deferred.c44
-rw-r--r--kernel/unwind/user.c59
7 files changed, 176 insertions, 38 deletions
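The deferred user callchain support described in the pull message is requested through two new perf_event_attr bits, defer_callchain and defer_output, which the kernel-side hunks below test as event->attr.defer_callchain and event->attr.defer_output. A minimal user-space sketch of opening such an event follows; the exact uAPI encoding of the new bits is not part of this diff, so they are shown only as comments and everything else sticks to the long-standing perf_event_open() interface.

/*
 * Hypothetical sketch: open a CPU-cycles sampling event that asks for
 * deferred user callchains. attr.defer_callchain / attr.defer_output
 * mirror the kernel-side attribute names in this merge; their exact
 * uAPI bit positions are an assumption and are left commented out so
 * the sketch builds against existing headers.
 */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_deferred_callchain_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_CALLCHAIN;
	attr.sample_max_stack = 64;
	/* attr.defer_callchain = 1; */	/* assumed new uAPI bit */
	/* attr.defer_output = 1; */	/* assumed new uAPI bit */

	/* Profile the calling thread on any CPU. */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}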
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4d53cdd1374c..8f1dacaf01fe 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
max_depth = sysctl_perf_event_max_stack;
trace = get_perf_callchain(regs, kernel, user, max_depth,
- false, false);
+ false, false, 0);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, kernel, user, max_depth,
- crosstask, false);
+ crosstask, false, 0);
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 808c0d7a31fa..b9c7e00725d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
- u32 max_stack, bool crosstask, bool add_mark)
+ u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
@@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
regs = task_pt_regs(current);
}
+ if (defer_cookie) {
+ /*
+ * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED
+ * which can be stitched to this one, and add
+ * the cookie after it (it will be cut off when the
+ * user stack is copied to the callchain).
+ */
+ perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED);
+ perf_callchain_store_context(&ctx, defer_cookie);
+ goto exit_put;
+ }
+
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
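With that change, a sample taken in interrupt/NMI context carries only the kernel frames plus a two-word placeholder for the user part. The following illustration shows the resulting PERF_SAMPLE_CALLCHAIN payload as a consumer would see it; the kernel addresses and the cookie are made-up example values, only the two context markers and their ordering come from the stores above.

/*
 * Illustration only: ips[0..nr-1] of a sample whose user portion was
 * deferred. Addresses and the cookie are placeholder values;
 * PERF_CONTEXT_USER_DEFERRED is new in this merge.
 */
#include <linux/perf_event.h>

__u64 example_ips[] = {
	PERF_CONTEXT_KERNEL,		/* add_mark for the kernel side        */
	0xffffffff81123456,		/* kernel frames, unwound immediately  */
	0xffffffff81011223,
	PERF_CONTEXT_USER_DEFERRED,	/* user frames will arrive later ...   */
	0x0000000300000007,		/* ... in a PERF_RECORD_CALLCHAIN_DEFERRED
					 * record carrying this defer_cookie   */
};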
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2c35acc2722b..ece716879cbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -56,6 +56,7 @@
#include <linux/buildid.h>
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>
+#include <linux/unwind_deferred.h>
#include "internal.h"
@@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr)
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
+static struct unwind_work perf_unwind_work;
+
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
@@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user &&
+ event->attr.defer_callchain;
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ u64 defer_cookie;
if (!current->mm)
user = false;
@@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
- callchain = get_perf_callchain(regs, kernel, user,
- max_stack, crosstask, true);
+ if (!(user && defer_user && !crosstask &&
+ unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0))
+ defer_cookie = 0;
+
+ callchain = get_perf_callchain(regs, kernel, user, max_stack,
+ crosstask, true, defer_cookie);
+
return callchain ?: &__empty_callchain;
}
@@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_callchain_deferred_event {
+ struct unwind_stacktrace *trace;
+ struct {
+ struct perf_event_header header;
+ u64 cookie;
+ u64 nr;
+ u64 ips[];
+ } event;
+};
+
+static void perf_callchain_deferred_output(struct perf_event *event, void *data)
+{
+ struct perf_callchain_deferred_event *deferred_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret, size = deferred_event->event.header.size;
+
+ if (!event->attr.defer_output)
+ return;
+
+ /* XXX do we really need sample_id_all for this ??? */
+ perf_event_header__init_id(&deferred_event->event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, &sample, event,
+ deferred_event->event.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, deferred_event->event);
+ for (int i = 0; i < deferred_event->trace->nr; i++) {
+ u64 entry = deferred_event->trace->entries[i];
+ perf_output_put(&handle, entry);
+ }
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ deferred_event->event.header.size = size;
+}
+
+static void perf_unwind_deferred_callback(struct unwind_work *work,
+ struct unwind_stacktrace *trace, u64 cookie)
+{
+ struct perf_callchain_deferred_event deferred_event = {
+ .trace = trace,
+ .event = {
+ .header = {
+ .type = PERF_RECORD_CALLCHAIN_DEFERRED,
+ .misc = PERF_RECORD_MISC_USER,
+ .size = sizeof(deferred_event.event) +
+ (trace->nr * sizeof(u64)),
+ },
+ .cookie = cookie,
+ .nr = trace->nr,
+ },
+ };
+
+ perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL);
+}
+
struct perf_text_poke_event {
const void *old_bytes;
const void *new_bytes;
@@ -14809,6 +14880,9 @@ void __init perf_event_init(void)
idr_init(&pmu_idr);
+ unwind_deferred_init(&perf_unwind_work,
+ perf_unwind_deferred_callback);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
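The PERF_RECORD_CALLCHAIN_DEFERRED record emitted by perf_callchain_deferred_output() above has a fixed body: header, cookie, nr, then nr user frame addresses, followed by the usual optional sample_id trailer. Below is a hedged tool-side sketch of stitching such a record onto earlier deferred samples by cookie; the record layout mirrors perf_callchain_deferred_event above, while the pending_* helpers and struct names are invented for illustration.

#include <linux/perf_event.h>

/* Matches the kernel-side perf_callchain_deferred_event body. */
struct callchain_deferred_record {
	struct perf_event_header header;	/* PERF_RECORD_CALLCHAIN_DEFERRED */
	__u64 cookie;				/* matches the deferred sample(s) */
	__u64 nr;
	__u64 ips[];				/* user frames */
};

/* Hypothetical tool state and helpers, declarations only. */
struct pending_sample;
struct pending_sample *pending_lookup(__u64 cookie);
struct pending_sample *pending_next(struct pending_sample *ps);
void append_user_frames(struct pending_sample *ps, __u64 *ips, __u64 nr);

static void handle_deferred_record(struct callchain_deferred_record *rec)
{
	/*
	 * Walk every buffered sample whose callchain ended with
	 * PERF_CONTEXT_USER_DEFERRED + this cookie and splice in the user
	 * frames delivered here.
	 */
	struct pending_sample *ps;

	for (ps = pending_lookup(rec->cookie); ps; ps = pending_next(ps))
		append_user_frames(ps, rec->ips, rec->nr);
}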
diff --git a/kernel/exit.c b/kernel/exit.c
index 988e16efd66b..fdfd05d1826c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -940,7 +940,6 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
- unwind_deferred_task_exit(tsk);
trace_sched_process_exit(tsk, group_dead);
/*
@@ -951,6 +950,12 @@ void __noreturn do_exit(long code)
* gets woken up by child-exit notifications.
*/
perf_event_exit_task(tsk);
+ /*
+ * PF_EXITING (above) ensures unwind_deferred_request() will no
+ * longer add new unwinds. While exit_mm() (below) will destroy the
+	 * ability to do unwinds. So flush any pending unwinds here.
+ */
+ unwind_deferred_task_exit(tsk);
exit_mm();
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d1efec571a4a..0f7519f8e7c9 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
#ifdef CONFIG_IRQ_WORK
static void task_work_set_notify_irq(struct irq_work *entry)
{
- test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ /*
+ * no-op IPI
+ *
+ * TWA_NMI_CURRENT will already have set the TIF flag, all
+ * this interrupt does it tickle the return-to-user path.
+ */
}
static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
@@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
break;
#ifdef CONFIG_IRQ_WORK
case TWA_NMI_CURRENT:
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
break;
#endif
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index dc6040aae3ee..a88fb481c4a3 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu);
static inline bool unwind_pending(struct unwind_task_info *info)
{
- return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+ return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING;
}
/*
@@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info)
{
u32 cnt = 1;
+ lockdep_assert_irqs_disabled();
+
if (info->id.cpu)
return info->id.id;
@@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
cache = info->cache;
trace->entries = cache->entries;
-
- if (cache->nr_entries) {
- /*
- * The user stack has already been previously unwound in this
- * entry context. Skip the unwind and use the cache.
- */
- trace->nr = cache->nr_entries;
+ trace->nr = cache->nr_entries;
+ /*
+ * The user stack has already been previously unwound in this
+ * entry context. Skip the unwind and use the cache.
+ */
+ if (trace->nr)
return 0;
- }
- trace->nr = 0;
unwind_user(trace, UNWIND_MAX_ENTRIES);
cache->nr_entries = trace->nr;
/* Clear nr_entries on way back to user space */
- set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+ atomic_long_or(UNWIND_USED, &info->unwind_mask);
return 0;
}
@@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task)
/* Clear pending bit but make sure to have the current bits */
bits = atomic_long_fetch_andnot(UNWIND_PENDING,
- (atomic_long_t *)&info->unwind_mask);
+ &info->unwind_mask);
/*
* From here on out, the callback must always be called, even if it's
* just an empty trace.
@@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task)
int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
{
struct unwind_task_info *info = &current->unwind_info;
+ int twa_mode = TWA_RESUME;
unsigned long old, bits;
unsigned long bit;
int ret;
@@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
* Trigger a warning to make it obvious that an architecture
* is using this in NMI when it should not be.
*/
- if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
- return -EINVAL;
+ if (in_nmi()) {
+ if (WARN_ON_ONCE(!CAN_USE_IN_NMI))
+ return -EINVAL;
+ twa_mode = TWA_NMI_CURRENT;
+ }
/* Do not allow cancelled works to request again */
bit = READ_ONCE(work->bit);
@@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
*cookie = get_cookie(info);
- old = READ_ONCE(info->unwind_mask);
+ old = atomic_long_read(&info->unwind_mask);
/* Is this already queued or executed */
if (old & bit)
@@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
* to have a callback.
*/
bits = UNWIND_PENDING | bit;
- old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+ old = atomic_long_fetch_or(bits, &info->unwind_mask);
if (old & bits) {
/*
* If the work's bit was set, whatever set it had better
@@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
}
/* The work has been claimed, now schedule it. */
- ret = task_work_add(current, &info->work, TWA_RESUME);
+ ret = task_work_add(current, &info->work, twa_mode);
if (WARN_ON_ONCE(ret))
- WRITE_ONCE(info->unwind_mask, 0);
+ atomic_long_set(&info->unwind_mask, 0);
return ret;
}
@@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work)
guard(rcu)();
/* Clear this bit from all threads */
for_each_process_thread(g, t) {
- clear_bit(bit, &t->unwind_info.unwind_mask);
+ atomic_long_andnot(BIT(bit),
+ &t->unwind_info.unwind_mask);
if (t->unwind_info.cache)
clear_bit(bit, &t->unwind_info.cache->unwind_completed);
}
@@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task)
memset(info, 0, sizeof(*info));
init_task_work(&info->work, unwind_deferred_task_work);
- info->unwind_mask = 0;
+ atomic_long_set(&info->unwind_mask, 0);
}
void unwind_task_free(struct task_struct *task)
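Taken together with the core.c hunks above, the in-kernel usage pattern for the deferred unwinder is: initialize an unwind_work with a callback once, call unwind_deferred_request() from the (possibly NMI) event context, and receive the user stacktrace in the callback via task_work just before the task returns to user space. A condensed sketch for a hypothetical tracer_* subsystem follows; only the unwind_* calls and the callback signature are the API shown in this diff, the tracer_* helpers are assumptions.

#include <linux/unwind_deferred.h>

static struct unwind_work tracer_unwind_work;	/* hypothetical subsystem */

/* Hypothetical bookkeeping helpers, declarations only. */
void tracer_emit_user_stack(u64 cookie, unsigned long *entries, unsigned int nr);
void tracer_note_pending(u64 cookie);

/* Runs from task_work on the return-to-user path. */
static void tracer_unwind_cb(struct unwind_work *work,
			     struct unwind_stacktrace *trace, u64 cookie)
{
	/*
	 * trace->entries[0..trace->nr-1] hold the user frames (possibly
	 * empty); cookie matches the value handed out at request time.
	 */
	tracer_emit_user_stack(cookie, trace->entries, trace->nr);
}

static void __init tracer_init(void)
{
	unwind_deferred_init(&tracer_unwind_work, tracer_unwind_cb);
}

/* Called from the event/NMI path, like perf_callchain() above. */
static void tracer_event(void)
{
	u64 cookie;

	/* A negative return means no callback will come (e.g. the task
	 * is exiting); duplicate requests in the same entry context get
	 * the same cookie. */
	if (unwind_deferred_request(&tracer_unwind_work, &cookie) < 0)
		return;

	tracer_note_pending(cookie);
}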
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 97a8415e3216..39e270789444 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,18 +8,28 @@
#include <linux/unwind_user.h>
#include <linux/uaccess.h>
-static const struct unwind_user_frame fp_frame = {
- ARCH_INIT_USER_FP_FRAME
-};
-
#define for_each_user_frame(state) \
for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
-static int unwind_user_next_fp(struct unwind_user_state *state)
+static inline int
+get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws)
+{
+ unsigned long __user *addr = (void __user *)base + off;
+#ifdef CONFIG_COMPAT
+ if (ws == sizeof(int)) {
+ unsigned int data;
+ int ret = get_user(data, (unsigned int __user *)addr);
+ *word = data;
+ return ret;
+ }
+#endif
+ return get_user(*word, addr);
+}
+
+static int unwind_user_next_common(struct unwind_user_state *state,
+ const struct unwind_user_frame *frame)
{
- const struct unwind_user_frame *frame = &fp_frame;
unsigned long cfa, fp, ra;
- unsigned int shift;
if (frame->use_fp) {
if (state->fp < state->sp)
@@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state)
return -EINVAL;
/* Make sure that the address is word aligned */
- shift = sizeof(long) == 4 ? 2 : 3;
- if (cfa & ((1 << shift) - 1))
+ if (cfa & (state->ws - 1))
return -EINVAL;
/* Find the Return Address (RA) */
- if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+ if (get_user_word(&ra, cfa, frame->ra_off, state->ws))
return -EINVAL;
- if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws))
return -EINVAL;
state->ip = ra;
state->sp = cfa;
if (frame->fp_off)
state->fp = fp;
+ state->topmost = false;
return 0;
}
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+ struct pt_regs *regs = task_pt_regs(current);
+
+ if (state->topmost && unwind_user_at_function_start(regs)) {
+ const struct unwind_user_frame fp_entry_frame = {
+ ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws)
+ };
+ return unwind_user_next_common(state, &fp_entry_frame);
+ }
+
+ const struct unwind_user_frame fp_frame = {
+ ARCH_INIT_USER_FP_FRAME(state->ws)
+ };
+ return unwind_user_next_common(state, &fp_frame);
+#else
+ return -EINVAL;
+#endif
+}
+
static int unwind_user_next(struct unwind_user_state *state)
{
unsigned long iter_mask = state->available_types;
@@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state)
state->ip = instruction_pointer(regs);
state->sp = user_stack_pointer(regs);
state->fp = frame_pointer(regs);
+ state->ws = unwind_user_word_size(regs);
+ if (!state->ws) {
+ state->done = true;
+ return -EINVAL;
+ }
+ state->topmost = true;
return 0;
}
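For reference, the unwind_user_frame fields consumed by unwind_user_next_common() (use_fp, ra_off, fp_off and a CFA offset, named cfa_off in the wider series) describe one step of the classic frame-pointer walk. Below is a hedged illustration of what ARCH_INIT_USER_FP_FRAME(ws) amounts to for a 64-bit frame; the concrete macro lives in the arch headers rather than in this diff, so the numbers are the textbook x86-64 layout for ws == 8, not a quote, and with the new word-size argument they would scale down for compat tasks.

/*
 * Assumed 64-bit layout after "push %rbp; mov %rsp,%rbp":
 *
 *   [rbp + 8]  return address          ->  ra_off  = -8  (from CFA)
 *   [rbp + 0]  caller's frame pointer  ->  fp_off  = -16 (from CFA)
 *   CFA = rbp + 16                     ->  cfa_off = 16, use_fp = true
 *
 * Field names follow the unwind_user series; cfa_off is assumed here,
 * as only use_fp/ra_off/fp_off appear in this hunk.
 */
#include <linux/unwind_user.h>

static const struct unwind_user_frame fp_frame_example = {
	.cfa_off = 16,		/* CFA = fp + 16      */
	.ra_off  = -8,		/* RA  = *(CFA - 8)   */
	.fp_off  = -16,		/* FP  = *(CFA - 16)  */
	.use_fp  = true,
};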