| author    | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 20:42:01 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 20:42:01 -0800 |
| commit    | 6c26fbe8c9d3e932dce6afe2505b19b4b261cae9 (patch) | |
| tree      | 81cc40ecd2cde95b1b37937cf270cc0fa3832c43 /arch | |
| parent    | 63e6995005be8ceb8a1d56a18df1a1a40c28356d (diff) | |
| parent    | 9929dffce5ed7e2988e0274f4db98035508b16d9 (diff) | |
Merge tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:
"Callchain support:
- Add support for deferred user-space stack unwinding for perf,
enabled on x86. (Peter Zijlstra, Steven Rostedt)
- unwind_user/x86: Enable frame pointer unwinding on x86 (Josh
Poimboeuf)
x86 PMU support and infrastructure:
- x86/insn: Simplify for_each_insn_prefix() (Peter Zijlstra)
- x86/insn,uprobes,alternative: Unify insn_is_nop() (Peter Zijlstra)
Intel PMU driver:
- Large series to prepare for and implement architectural PEBS
support for Intel platforms such as Clearwater Forest (CWF) and
Panther Lake (PTL). (Dapeng Mi, Kan Liang)
- Check dynamic constraints (Kan Liang)
- Optimize PEBS extended config (Peter Zijlstra)
- cstates:
- Remove PC3 support from LunarLake (Zhang Rui)
- Add Pantherlake support (Zhang Rui)
- Clearwater Forest support (Zide Chen)
AMD PMU driver:
- x86/amd: Check event before enable to avoid GPF (George Kennedy)
Fixes and cleanups:
- task_work: Fix NMI race condition (Peter Zijlstra)
- perf/x86: Fix NULL event access and potential PEBS record loss
(Dapeng Mi)
- Misc other fixes and cleanups (Dapeng Mi, Ingo Molnar, Peter
Zijlstra)"
* tag 'perf-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (38 commits)
perf/x86/intel: Fix and clean up intel_pmu_drain_arch_pebs() type use
perf/x86/intel: Optimize PEBS extended config
perf/x86/intel: Check PEBS dyn_constraints
perf/x86/intel: Add a check for dynamic constraints
perf/x86/intel: Add counter group support for arch-PEBS
perf/x86/intel: Setup PEBS data configuration and enable legacy groups
perf/x86/intel: Update dyn_constraint base on PEBS event precise level
perf/x86/intel: Allocate arch-PEBS buffer and initialize PEBS_BASE MSR
perf/x86/intel: Process arch-PEBS records or record fragments
perf/x86/intel/ds: Factor out PEBS group processing code to functions
perf/x86/intel/ds: Factor out PEBS record processing code to functions
perf/x86/intel: Initialize architectural PEBS
perf/x86/intel: Correct large PEBS flag check
perf/x86/intel: Replace x86_pmu.drain_pebs calling with static call
perf/x86: Fix NULL event access and potential PEBS record loss
perf/x86: Remove redundant is_x86_event() prototype
entry,unwind/deferred: Fix unwind_reset_info() placement
unwind_user/x86: Fix arch=um build
perf: Support deferred user unwind
unwind_user/x86: Teach FP unwind about start of function
...
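The `for_each_insn_prefix()` simplification in this series removes the caller-supplied index variable; the macro now declares its own loop counter internally. The fragment below is modelled on the `insn_has_rep_prefix()` hunk in `arch/x86/boot/compressed/sev-handle-vc.c` shown in the diff; the function name here is local to the example.

```c
#include <asm/insn.h>

/*
 * Caller pattern after the cleanup. Before this series the loop needed
 * a caller-declared index:
 *
 *	int i;
 *	for_each_insn_prefix(insn, i, p) { ... }
 *
 * With the new macro only the prefix byte variable remains.
 */
static bool has_rep_prefix(struct insn *insn)
{
	insn_byte_t p;

	insn_get_prefixes(insn);

	for_each_insn_prefix(insn, p) {
		/* 0xf2/0xf3 are the REPNE/REP prefixes */
		if (p == 0xf2 || p == 0xf3)
			return true;
	}

	return false;
}
```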
Diffstat (limited to 'arch')
| -rw-r--r-- | arch/x86/Kconfig | 1 |
| -rw-r--r-- | arch/x86/boot/compressed/sev-handle-vc.c | 3 |
| -rw-r--r-- | arch/x86/events/amd/core.c | 7 |
| -rw-r--r-- | arch/x86/events/core.c | 66 |
| -rw-r--r-- | arch/x86/events/intel/core.c | 444 |
| -rw-r--r-- | arch/x86/events/intel/cstate.c | 18 |
| -rw-r--r-- | arch/x86/events/intel/ds.c | 601 |
| -rw-r--r-- | arch/x86/events/perf_event.h | 41 |
| -rw-r--r-- | arch/x86/include/asm/insn-eval.h | 2 |
| -rw-r--r-- | arch/x86/include/asm/insn.h | 5 |
| -rw-r--r-- | arch/x86/include/asm/intel_ds.h | 10 |
| -rw-r--r-- | arch/x86/include/asm/msr-index.h | 20 |
| -rw-r--r-- | arch/x86/include/asm/perf_event.h | 116 |
| -rw-r--r-- | arch/x86/include/asm/unwind_user.h | 41 |
| -rw-r--r-- | arch/x86/include/asm/uprobes.h | 9 |
| -rw-r--r-- | arch/x86/kernel/alternative.c | 20 |
| -rw-r--r-- | arch/x86/kernel/kprobes/core.c | 3 |
| -rw-r--r-- | arch/x86/kernel/uprobes.c | 70 |
| -rw-r--r-- | arch/x86/lib/insn-eval.c | 151 |
19 files changed, 1363 insertions, 265 deletions
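Much of the diff below wires up the new arch-PEBS MSRs. Two details worth keeping in mind while reading it: the per-CPU output buffer is published through `MSR_IA32_PEBS_BASE` with the buffer-size exponent encoded in the low bits, and `MSR_IA32_PEBS_INDEX` carries the enable bit plus a threshold selecting single-record vs. large-PEBS draining. The fragment below condenses the base-MSR handling from `init_arch_pebs_on_cpu()`/`fini_arch_pebs_on_cpu()` in the `arch/x86/events/intel/ds.c` hunk; the function names are local to this sketch and the MSR number is taken from the `msr-index.h` hunk.

```c
#include <linux/kernel.h>
#include <asm/io.h>
#include <asm/msr.h>
#include <asm/page.h>

#define MSR_IA32_PEBS_BASE	0x000003f4	/* matches the msr-index.h hunk */
#define PEBS_BUFFER_SHIFT	4
#define PEBS_BUFFER_SIZE	(PAGE_SIZE << PEBS_BUFFER_SHIFT)	/* 4KB * 2^4 = 64 KiB */

/*
 * The buffer must be a physically contiguous, page-aligned allocation;
 * its size is communicated to the hardware as 4KB * 2^PEBS_BUFFER_SHIFT
 * via the low bits of PEBS_BASE.
 */
static void arch_pebs_set_base(int cpu, void *pebs_vaddr)
{
	u64 base = virt_to_phys(pebs_vaddr) | PEBS_BUFFER_SHIFT;

	wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)base, (u32)(base >> 32));
}

static void arch_pebs_clear_base(int cpu)
{
	/* Teardown: point the hardware at nothing. */
	wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0);
}
```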
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac9692093215..c4c21e8d0772 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -298,6 +298,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL select HAVE_UNSTABLE_SCHED_CLOCK + select HAVE_UNWIND_USER_FP if X86_64 select HAVE_USER_RETURN_NOTIFIER select HAVE_GENERIC_VDSO select VDSO_GETRANDOM if X86_64 diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c index 7530ad8b768b..030001b46554 100644 --- a/arch/x86/boot/compressed/sev-handle-vc.c +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -29,11 +29,10 @@ bool insn_has_rep_prefix(struct insn *insn) { insn_byte_t p; - int i; insn_get_prefixes(insn); - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0xf2 || p == 0xf3) return true; } diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b20661b8621d..8868f5f5379b 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -763,7 +763,12 @@ static void amd_pmu_enable_all(int added) if (!test_bit(idx, cpuc->active_mask)) continue; - amd_pmu_enable_event(cpuc->events[idx]); + /* + * FIXME: cpuc->events[idx] can become NULL in a subtle race + * condition with NMI->throttle->x86_pmu_stop(). + */ + if (cpuc->events[idx]) + amd_pmu_enable_event(cpuc->events[idx]); } } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index fa6c47b50989..c4091482e6c8 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -554,14 +554,22 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_max_precise(void) +int x86_pmu_max_precise(struct pmu *pmu) { int precise = 0; - /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { - precise++; + /* arch PEBS */ + if (x86_pmu.arch_pebs) { + precise = 2; + if (hybrid(pmu, arch_pebs_cap).pdists) + precise++; + + return precise; + } + /* legacy PEBS - support for constant skid */ + precise++; /* Support for IP fixup */ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; @@ -569,13 +577,14 @@ int x86_pmu_max_precise(void) if (x86_pmu.pebs_prec_dist) precise++; } + return precise; } int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { - int precise = x86_pmu_max_precise(); + int precise = x86_pmu_max_precise(event->pmu); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -1344,6 +1353,7 @@ static void x86_pmu_enable(struct pmu *pmu) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[hwc->idx] = NULL; } /* @@ -1365,6 +1375,7 @@ static void x86_pmu_enable(struct pmu *pmu) * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ + cpuc->events[hwc->idx] = event; x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1531,7 +1542,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) event->hw.state = 0; - cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); @@ -1610,7 +1620,6 @@ void x86_pmu_stop(struct perf_event *event, int flags) if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); - cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -1648,6 +1657,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * Not a TXN, therefore cleanup properly. 
*/ x86_pmu_stop(event, PERF_EF_UPDATE); + cpuc->events[event->hw.idx] = NULL; for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) @@ -2629,7 +2639,9 @@ static ssize_t max_precise_show(struct device *cdev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); + struct pmu *pmu = dev_get_drvdata(cdev); + + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise(pmu)); } static DEVICE_ATTR_RO(max_precise); @@ -2845,46 +2857,6 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc); } -#ifdef CONFIG_UPROBES -/* - * Heuristic-based check if uprobe is installed at the function entry. - * - * Under assumption of user code being compiled with frame pointers, - * `push %rbp/%ebp` is a good indicator that we indeed are. - * - * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. - * If we get this wrong, captured stack trace might have one extra bogus - * entry, but the rest of stack trace will still be meaningful. - */ -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - struct arch_uprobe *auprobe; - - if (!current->utask) - return false; - - auprobe = current->utask->auprobe; - if (!auprobe) - return false; - - /* push %rbp/%ebp */ - if (auprobe->insn[0] == 0x55) - return true; - - /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) - return true; - - return false; -} - -#else -static bool is_uprobe_at_func_entry(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_UPROBES */ - #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fe65be0b9d9c..853fe073bab3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2563,6 +2563,44 @@ static void intel_pmu_disable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val &= ~mask; } +static inline void __intel_pmu_update_event_ext(int idx, u64 ext) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u32 msr; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr = MSR_IA32_PMC_V6_GP0_CFG_C + + x86_pmu.addr_offset(idx, false); + } else { + msr = MSR_IA32_PMC_V6_FX0_CFG_C + + x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); + } + + cpuc->cfg_c_val[idx] = ext; + wrmsrq(msr, ext); +} + +static void intel_pmu_disable_event_ext(struct perf_event *event) +{ + /* + * Only clear CFG_C MSR for PEBS counter group events, + * it avoids the HW counter's value to be added into + * other PEBS records incorrectly after PEBS counter + * group events are disabled. + * + * For other events, it's unnecessary to clear CFG_C MSRs + * since CFG_C doesn't take effect if counter is in + * disabled state. That helps to reduce the WRMSR overhead + * in context switches. + */ + if (!is_pebs_counter_event_group(event)) + return; + + __intel_pmu_update_event_ext(event->hw.idx, 0); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_disable_event_ext, intel_pmu_disable_event_ext); + static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -2571,9 +2609,12 @@ static void intel_pmu_disable_event(struct perf_event *event) switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1: intel_clear_masks(event, idx); + static_call_cond(intel_pmu_disable_event_ext)(event); x86_pmu_disable_event(event); break; case INTEL_PMC_IDX_FIXED ... 
INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_disable_event_ext)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_disable_fixed(event); break; @@ -2940,6 +2981,79 @@ static void intel_pmu_enable_acr(struct perf_event *event) DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); +static void intel_pmu_enable_event_ext(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + union arch_pebs_index old, new; + struct arch_pebs_cap cap; + u64 ext = 0; + + cap = hybrid(cpuc->pmu, arch_pebs_cap); + + if (event->attr.precise_ip) { + u64 pebs_data_cfg = intel_get_arch_pebs_data_config(event); + + ext |= ARCH_PEBS_EN; + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) + ext |= (-hwc->sample_period) & ARCH_PEBS_RELOAD; + + if (pebs_data_cfg && cap.caps) { + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) + ext |= ARCH_PEBS_AUX & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_GP) + ext |= ARCH_PEBS_GPR & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_XMMS) + ext |= ARCH_PEBS_VECR_XMM & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_LBRS) + ext |= ARCH_PEBS_LBR & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT)) + ext |= ARCH_PEBS_CNTR_GP & cap.caps; + + if (pebs_data_cfg & + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT)) + ext |= ARCH_PEBS_CNTR_FIXED & cap.caps; + + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + ext |= ARCH_PEBS_CNTR_METRICS & cap.caps; + } + + if (cpuc->n_pebs == cpuc->n_large_pebs) + new.thresh = ARCH_PEBS_THRESH_MULTI; + else + new.thresh = ARCH_PEBS_THRESH_SINGLE; + + rdmsrq(MSR_IA32_PEBS_INDEX, old.whole); + if (new.thresh != old.thresh || !old.en) { + if (old.thresh == ARCH_PEBS_THRESH_MULTI && old.wr > 0) { + /* + * Large PEBS was enabled. + * Drain PEBS buffer before applying the single PEBS. + */ + intel_pmu_drain_pebs_buffer(); + } else { + new.wr = 0; + new.full = 0; + new.en = 1; + wrmsrq(MSR_IA32_PEBS_INDEX, new.whole); + } + } + } + + if (is_pebs_counter_event_group(event)) + ext |= ARCH_PEBS_CNTR_ALLOW; + + if (cpuc->cfg_c_val[hwc->idx] != ext) + __intel_pmu_update_event_ext(hwc->idx, ext); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_event_ext, intel_pmu_enable_event_ext); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2955,10 +3069,12 @@ static void intel_pmu_enable_event(struct perf_event *event) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); static_call_cond(intel_pmu_enable_acr_event)(event); + static_call_cond(intel_pmu_enable_event_ext)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: static_call_cond(intel_pmu_enable_acr_event)(event); + static_call_cond(intel_pmu_enable_event_ext)(event); fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); @@ -3216,6 +3332,19 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* + * Arch PEBS sets bit 54 in the global status register + */ + if (__test_and_clear_bit(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT, + (unsigned long *)&status)) { + handled++; + static_call(x86_pmu_drain_pebs)(regs, &data); + + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; + } + + /* * Intel PT */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { @@ -3269,7 +3398,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * The PEBS buffer has to be drained before handling the A-PMI */ if (is_pebs_counter_event_group(event)) - x86_pmu.drain_pebs(regs, &data); + static_call(x86_pmu_drain_pebs)(regs, &data); last_period = event->hw.last_period; @@ -4029,7 +4158,9 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; if (event->attr.sample_regs_user & ~PEBS_GP_REGS) - flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_intr & ~PEBS_GP_REGS) + flags &= ~PERF_SAMPLE_REGS_INTR; return flags; } @@ -4204,6 +4335,20 @@ static bool intel_pmu_is_acr_group(struct perf_event *event) return false; } +static inline bool intel_pmu_has_pebs_counter_group(struct pmu *pmu) +{ + u64 caps; + + if (x86_pmu.intel_cap.pebs_format >= 6 && x86_pmu.intel_cap.pebs_baseline) + return true; + + caps = hybrid(pmu, arch_pebs_cap).caps; + if (x86_pmu.arch_pebs && (caps & ARCH_PEBS_CNTR_MASK)) + return true; + + return false; +} + static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, u64 *cause_mask, int *num) { @@ -4237,6 +4382,8 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (event->attr.precise_ip) { + struct arch_pebs_cap pebs_cap = hybrid(event->pmu, arch_pebs_cap); + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL; @@ -4250,6 +4397,15 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (x86_pmu.arch_pebs) { + u64 cntr_mask = hybrid(event->pmu, intel_ctrl) & + ~GLOBAL_CTRL_EN_PERF_METRICS; + u64 pebs_mask = event->attr.precise_ip >= 3 ? 
+ pebs_cap.pdists : pebs_cap.counters; + if (cntr_mask != pebs_mask) + event->hw.dyn_constraint &= pebs_mask; + } } if (needs_branch_stack(event)) { @@ -4341,8 +4497,7 @@ static int intel_pmu_hw_config(struct perf_event *event) } if ((event->attr.sample_type & PERF_SAMPLE_READ) && - (x86_pmu.intel_cap.pebs_format >= 6) && - x86_pmu.intel_cap.pebs_baseline && + intel_pmu_has_pebs_counter_group(event->pmu) && is_sampling_event(event) && event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; @@ -5212,7 +5367,13 @@ err: static int intel_pmu_cpu_prepare(int cpu) { - return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + int ret; + + ret = intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu); + if (ret) + return ret; + + return alloc_arch_pebs_buf_on_cpu(cpu); } static void flip_smm_bit(void *data) @@ -5257,6 +5418,163 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con u64 fixed_cntr_mask, u64 intel_ctrl); +enum dyn_constr_type { + DYN_CONSTR_NONE, + DYN_CONSTR_BR_CNTR, + DYN_CONSTR_ACR_CNTR, + DYN_CONSTR_ACR_CAUSE, + DYN_CONSTR_PEBS, + DYN_CONSTR_PDIST, + + DYN_CONSTR_MAX, +}; + +static const char * const dyn_constr_type_name[] = { + [DYN_CONSTR_NONE] = "a normal event", + [DYN_CONSTR_BR_CNTR] = "a branch counter logging event", + [DYN_CONSTR_ACR_CNTR] = "an auto-counter reload event", + [DYN_CONSTR_ACR_CAUSE] = "an auto-counter reload cause event", + [DYN_CONSTR_PEBS] = "a PEBS event", + [DYN_CONSTR_PDIST] = "a PEBS PDIST event", +}; + +static void __intel_pmu_check_dyn_constr(struct event_constraint *constr, + enum dyn_constr_type type, u64 mask) +{ + struct event_constraint *c1, *c2; + int new_weight, check_weight; + u64 new_mask, check_mask; + + for_each_event_constraint(c1, constr) { + new_mask = c1->idxmsk64 & mask; + new_weight = hweight64(new_mask); + + /* ignore topdown perf metrics event */ + if (c1->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) + continue; + + if (!new_weight && fls64(c1->idxmsk64) < INTEL_PMC_IDX_FIXED) { + pr_info("The event 0x%llx is not supported as %s.\n", + c1->code, dyn_constr_type_name[type]); + } + + if (new_weight <= 1) + continue; + + for_each_event_constraint(c2, c1 + 1) { + bool check_fail = false; + + check_mask = c2->idxmsk64 & mask; + check_weight = hweight64(check_mask); + + if (c2->idxmsk64 & INTEL_PMC_MSK_TOPDOWN || + !check_weight) + continue; + + /* The same constraints or no overlap */ + if (new_mask == check_mask || + (new_mask ^ check_mask) == (new_mask | check_mask)) + continue; + + /* + * A scheduler issue may be triggered in the following cases. + * - Two overlap constraints have the same weight. + * E.g., A constraints: 0x3, B constraints: 0x6 + * event counter failure case + * B PMC[2:1] 1 + * A PMC[1:0] 0 + * A PMC[1:0] FAIL + * - Two overlap constraints have different weight. + * The constraint has a low weight, but has high last bit. 
+ * E.g., A constraints: 0x7, B constraints: 0xC + * event counter failure case + * B PMC[3:2] 2 + * A PMC[2:0] 0 + * A PMC[2:0] 1 + * A PMC[2:0] FAIL + */ + if (new_weight == check_weight) { + check_fail = true; + } else if (new_weight < check_weight) { + if ((new_mask | check_mask) != check_mask && + fls64(new_mask) > fls64(check_mask)) + check_fail = true; + } else { + if ((new_mask | check_mask) != new_mask && + fls64(new_mask) < fls64(check_mask)) + check_fail = true; + } + + if (check_fail) { + pr_info("The two events 0x%llx and 0x%llx may not be " + "fully scheduled under some circumstances as " + "%s.\n", + c1->code, c2->code, dyn_constr_type_name[type]); + } + } + } +} + +static void intel_pmu_check_dyn_constr(struct pmu *pmu, + struct event_constraint *constr, + u64 cntr_mask) +{ + enum dyn_constr_type i; + u64 mask; + + for (i = DYN_CONSTR_NONE; i < DYN_CONSTR_MAX; i++) { + mask = 0; + switch (i) { + case DYN_CONSTR_NONE: + mask = cntr_mask; + break; + case DYN_CONSTR_BR_CNTR: + if (x86_pmu.flags & PMU_FL_BR_CNTR) + mask = x86_pmu.lbr_counters; + break; + case DYN_CONSTR_ACR_CNTR: + mask = hybrid(pmu, acr_cntr_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + case DYN_CONSTR_ACR_CAUSE: + if (hybrid(pmu, acr_cntr_mask64) == hybrid(pmu, acr_cause_mask64)) + continue; + mask = hybrid(pmu, acr_cause_mask64) & GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + break; + case DYN_CONSTR_PEBS: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).counters; + break; + case DYN_CONSTR_PDIST: + if (x86_pmu.arch_pebs) + mask = hybrid(pmu, arch_pebs_cap).pdists; + break; + default: + pr_warn("Unsupported dynamic constraint type %d\n", i); + } + + if (mask) + __intel_pmu_check_dyn_constr(constr, i, mask); + } +} + +static void intel_pmu_check_event_constraints_all(struct pmu *pmu) +{ + struct event_constraint *event_constraints = hybrid(pmu, event_constraints); + struct event_constraint *pebs_constraints = hybrid(pmu, pebs_constraints); + u64 cntr_mask = hybrid(pmu, cntr_mask64); + u64 fixed_cntr_mask = hybrid(pmu, fixed_cntr_mask64); + u64 intel_ctrl = hybrid(pmu, intel_ctrl); + + intel_pmu_check_event_constraints(event_constraints, cntr_mask, + fixed_cntr_mask, intel_ctrl); + + if (event_constraints) + intel_pmu_check_dyn_constr(pmu, event_constraints, cntr_mask); + + if (pebs_constraints) + intel_pmu_check_dyn_constr(pmu, pebs_constraints, cntr_mask); +} + static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); static inline bool intel_pmu_broken_perf_cap(void) @@ -5269,34 +5587,89 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } +static inline void __intel_update_pmu_caps(struct pmu *pmu) +{ + struct pmu *dest_pmu = pmu ? 
pmu : x86_get_pmu(smp_processor_id()); + + if (hybrid(pmu, arch_pebs_cap).caps & ARCH_PEBS_VECR_XMM) + dest_pmu->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; +} + +static inline void __intel_update_large_pebs_flags(struct pmu *pmu) +{ + u64 caps = hybrid(pmu, arch_pebs_cap).caps; + + x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; + if (caps & ARCH_PEBS_LBR) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_BRANCH_STACK; + if (caps & ARCH_PEBS_CNTR_MASK) + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + + if (!(caps & ARCH_PEBS_AUX)) + x86_pmu.large_pebs_flags &= ~PERF_SAMPLE_DATA_SRC; + if (!(caps & ARCH_PEBS_GPR)) { + x86_pmu.large_pebs_flags &= + ~(PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER); + } +} + +#define counter_mask(_gp, _fixed) ((_gp) | ((u64)(_fixed) << INTEL_PMC_IDX_FIXED)) + static void update_pmu_cap(struct pmu *pmu) { - unsigned int cntr, fixed_cntr, ecx, edx; - union cpuid35_eax eax; - union cpuid35_ebx ebx; + unsigned int eax, ebx, ecx, edx; + union cpuid35_eax eax_0; + union cpuid35_ebx ebx_0; + u64 cntrs_mask = 0; + u64 pebs_mask = 0; + u64 pdists_mask = 0; - cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax_0.full, &ebx_0.full, &ecx, &edx); - if (ebx.split.umask2) + if (ebx_0.split.umask2) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx.split.eq) + if (ebx_0.split.eq) hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; - if (eax.split.cntr_subleaf) { + if (eax_0.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); - hybrid(pmu, cntr_mask64) = cntr; - hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; + &eax, &ebx, &ecx, &edx); + hybrid(pmu, cntr_mask64) = eax; + hybrid(pmu, fixed_cntr_mask64) = ebx; + cntrs_mask = counter_mask(eax, ebx); } - if (eax.split.acr_subleaf) { + if (eax_0.split.acr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, - &cntr, &fixed_cntr, &ecx, &edx); + &eax, &ebx, &ecx, &edx); /* The mask of the counters which can be reloaded */ - hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); - + hybrid(pmu, acr_cntr_mask64) = counter_mask(eax, ebx); /* The mask of the counters which can cause a reload of reloadable counters */ - hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cause_mask64) = counter_mask(ecx, edx); + } + + /* Bits[5:4] should be set simultaneously if arch-PEBS is supported */ + if (eax_0.split.pebs_caps_subleaf && eax_0.split.pebs_cnts_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_CAP_LEAF, + &eax, &ebx, &ecx, &edx); + hybrid(pmu, arch_pebs_cap).caps = (u64)ebx << 32; + + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_PEBS_COUNTER_LEAF, + &eax, &ebx, &ecx, &edx); + pebs_mask = counter_mask(eax, ecx); + pdists_mask = counter_mask(ebx, edx); + hybrid(pmu, arch_pebs_cap).counters = pebs_mask; + hybrid(pmu, arch_pebs_cap).pdists = pdists_mask; + + if (WARN_ON((pebs_mask | pdists_mask) & ~cntrs_mask)) { + x86_pmu.arch_pebs = 0; + } else { + __intel_update_pmu_caps(pmu); + __intel_update_large_pebs_flags(pmu); + } + } else { + WARN_ON(x86_pmu.arch_pebs == 1); + x86_pmu.arch_pebs = 0; } if (!intel_pmu_broken_perf_cap()) { @@ -5319,10 +5692,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; - intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->cntr_mask64, - pmu->fixed_cntr_mask64, - pmu->intel_ctrl); + 
intel_pmu_check_event_constraints_all(&pmu->pmu); intel_pmu_check_extra_regs(pmu->extra_regs); } @@ -5418,6 +5788,7 @@ static void intel_pmu_cpu_starting(int cpu) return; init_debug_store_on_cpu(cpu); + init_arch_pebs_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up, and that may * even boot with LBRs enabled. @@ -5456,6 +5827,8 @@ static void intel_pmu_cpu_starting(int cpu) } } + __intel_update_pmu_caps(cpuc->pmu); + if (!cpuc->shared_regs) return; @@ -5515,6 +5888,7 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); + fini_arch_pebs_on_cpu(cpu); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -5535,6 +5909,7 @@ static void intel_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + release_arch_pebs_buf_on_cpu(cpu); intel_cpuc_finish(cpuc); if (is_hybrid() && cpuc->pmu) @@ -6250,7 +6625,7 @@ tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) static umode_t pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - return x86_pmu.ds_pebs ? attr->mode : 0; + return intel_pmu_has_pebs() ? attr->mode : 0; } static umode_t @@ -6940,8 +7315,11 @@ __init int intel_pmu_init(void) * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR. */ - if (version >= 6) + if (version >= 6) { x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; + x86_pmu.late_setup = intel_pmu_late_setup; + } + /* * Install the hw-cache-events table: */ @@ -7727,6 +8105,14 @@ __init int intel_pmu_init(void) if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) update_pmu_cap(NULL); + if (x86_pmu.arch_pebs) { + static_call_update(intel_pmu_disable_event_ext, + intel_pmu_disable_event_ext); + static_call_update(intel_pmu_enable_event_ext, + intel_pmu_enable_event_ext); + pr_cont("Architectural PEBS, "); + } + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); @@ -7735,10 +8121,8 @@ __init int intel_pmu_init(void) if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; - intel_pmu_check_event_constraints(x86_pmu.event_constraints, - x86_pmu.cntr_mask64, - x86_pmu.fixed_cntr_mask64, - x86_pmu.intel_ctrl); + intel_pmu_check_event_constraints_all(NULL); + /* * Access LBR MSR may cause #GP under certain circumstances. * Check all LBR MSR here. diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index ec753e39b007..fa67fda6e45b 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL + * MTL,SRF,GRR,ARL,LNL,PTL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,31 +53,32 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL + * GRR,ARL,LNL,PTL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL + * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, + * PTL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. 
* perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF + * RPL,SPR,MTL,ARL,LNL,SRF,PTL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL, - * ADL,RPL,MTL,ARL,LNL + * ADL,RPL,MTL,ARL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL + * ARL,LNL,PTL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -96,7 +97,7 @@ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT,RKL,ADL,RPL,MTL,ARL,LNL + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 @@ -522,7 +523,6 @@ static const struct cstate_model lnl_cstates __initconst = { BIT(PERF_CSTATE_CORE_C7_RES), .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | - BIT(PERF_CSTATE_PKG_C3_RES) | BIT(PERF_CSTATE_PKG_C6_RES) | BIT(PERF_CSTATE_PKG_C10_RES), }; @@ -628,6 +628,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &srf_cstates), X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates), X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates), @@ -652,6 +653,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_H, &adl_cstates), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 01bc59e9286c..feb1c3cf63e4 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -626,13 +626,18 @@ static int alloc_pebs_buffer(int cpu) int max, node = cpu_to_node(cpu); void *buffer, *insn_buff, *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return 0; buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; + if (x86_pmu.arch_pebs) { + hwev->pebs_vaddr = buffer; + return 0; + } + /* * HSW+ already provides us the eventing ip; no need to allocate this * buffer then. 
@@ -645,7 +650,7 @@ static int alloc_pebs_buffer(int cpu) } per_cpu(insn_buffer, cpu) = insn_buff; } - hwev->ds_pebs_vaddr = buffer; + hwev->pebs_vaddr = buffer; /* Update the cpu entry area mapping */ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; ds->pebs_buffer_base = (unsigned long) cea; @@ -661,17 +666,20 @@ static void release_pebs_buffer(int cpu) struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); void *cea; - if (!x86_pmu.ds_pebs) + if (!intel_pmu_has_pebs()) return; - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; + if (x86_pmu.ds_pebs) { + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; - /* Clear the fixmap */ - cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; - ds_clear_cea(cea, x86_pmu.pebs_buffer_size); - dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); - hwev->ds_pebs_vaddr = NULL; + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); + } + + dsfree_pages(hwev->pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) @@ -824,6 +832,56 @@ void reserve_ds_buffers(void) } } +inline int alloc_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return 0; + + return alloc_pebs_buffer(cpu); +} + +inline void release_arch_pebs_buf_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + release_pebs_buffer(cpu); +} + +void init_arch_pebs_on_cpu(int cpu) +{ + struct cpu_hw_events *cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + u64 arch_pebs_base; + + if (!x86_pmu.arch_pebs) + return; + + if (!cpuc->pebs_vaddr) { + WARN(1, "Fail to allocate PEBS buffer on CPU %d\n", cpu); + x86_pmu.pebs_active = 0; + return; + } + + /* + * 4KB-aligned pointer of the output buffer + * (__alloc_pages_node() return page aligned address) + * Buffer Size = 4KB * 2^SIZE + * contiguous physical buffer (__alloc_pages_node() with order) + */ + arch_pebs_base = virt_to_phys(cpuc->pebs_vaddr) | PEBS_BUFFER_SHIFT; + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, (u32)arch_pebs_base, + (u32)(arch_pebs_base >> 32)); + x86_pmu.pebs_active = 1; +} + +inline void fini_arch_pebs_on_cpu(int cpu) +{ + if (!x86_pmu.arch_pebs) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_PEBS_BASE, 0, 0); +} + /* * BTS */ @@ -1471,6 +1529,25 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, } } +u64 intel_get_arch_pebs_data_config(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 pebs_data_cfg = 0; + u64 cntr_mask; + + if (WARN_ON(event->hw.idx < 0 || event->hw.idx >= X86_PMC_IDX_MAX)) + return 0; + + pebs_data_cfg |= pebs_update_adaptive_cfg(event); + + cntr_mask = (PEBS_DATACFG_CNTR_MASK << PEBS_DATACFG_CNTR_SHIFT) | + (PEBS_DATACFG_FIX_MASK << PEBS_DATACFG_FIX_SHIFT) | + PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS; + pebs_data_cfg |= cpuc->pebs_data_cfg & cntr_mask; + + return pebs_data_cfg; +} + void intel_pmu_pebs_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1532,6 +1609,15 @@ static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc) intel_pmu_drain_pebs_buffer(); } +static void __intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + cpuc->pebs_enabled |= 1ULL << hwc->idx; +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct 
cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1540,9 +1626,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct debug_store *ds = cpuc->ds; unsigned int idx = hwc->idx; - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - - cpuc->pebs_enabled |= 1ULL << hwc->idx; + __intel_pmu_pebs_enable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); @@ -1604,14 +1688,22 @@ void intel_pmu_pebs_del(struct perf_event *event) pebs_update_state(needed_cb, cpuc, event, false); } -void intel_pmu_pebs_disable(struct perf_event *event) +static void __intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; intel_pmu_drain_large_pebs(cpuc); - cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + __intel_pmu_pebs_disable(event); if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) @@ -1623,8 +1715,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) if (cpuc->enabled) wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); - - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; } void intel_pmu_pebs_enable_all(void) @@ -2060,6 +2150,90 @@ static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, #define PEBS_LATENCY_MASK 0xffff +static inline void __setup_perf_sample_data(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data) +{ + perf_sample_data_init(data, 0, event->hw.last_period); + + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happened between the record and + * PMI. + */ + perf_sample_save_callchain(data, event, iregs); +} + +static inline void __setup_pebs_basic_group(struct perf_event *event, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 sample_type, u64 ip, + u64 tsc, u16 retire) +{ + /* The ip in basic is EventingIP */ + set_linear_ip(regs, ip); + regs->flags = PERF_EFLAGS_EXACT; + setup_pebs_time(event, data, tsc); + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + data->weight.var3_w = retire; +} + +static inline void __setup_pebs_gpr_group(struct perf_event *event, + struct pt_regs *regs, + struct pebs_gprs *gprs, + u64 sample_type) +{ + if (event->attr.precise_ip < 2) { + set_linear_ip(regs, gprs->ip); + regs->flags &= ~PERF_EFLAGS_EXACT; + } + + if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) + adaptive_pebs_save_regs(regs, gprs); +} + +static inline void __setup_pebs_meminfo_group(struct perf_event *event, + struct perf_sample_data *data, + u64 sample_type, u64 latency, + u16 instr_latency, u64 address, + u64 aux, u64 tsx_tuning, u64 ax) +{ + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 tsx_latency = intel_get_tsx_weight(tsx_tuning); + + data->weight.var2_w = instr_latency; + + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. 
+ */ + if (sample_type & PERF_SAMPLE_WEIGHT) + data->weight.full = latency ?: tsx_latency; + else + data->weight.var1_dw = (u32)latency ?: tsx_latency; + + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + data->data_src.val = get_data_src(event, aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { + data->addr = address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (sample_type & PERF_SAMPLE_TRANSACTION) { + data->txn = intel_get_tsx_transaction(tsx_tuning, ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } +} + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -2069,12 +2243,14 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; + u64 format_group; + u16 retire; if (basic == NULL) return; @@ -2082,31 +2258,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs = container_of(regs, struct x86_perf_regs, regs); perf_regs->xmm_regs = NULL; - sample_type = event->attr.sample_type; format_group = basic->format_group; - perf_sample_data_init(data, 0, event->hw.last_period); - setup_pebs_time(event, data, basic->tsc); - - /* - * We must however always use iregs for the unwinder to stay sane; the - * record BP,SP,IP can point into thin air when the record is from a - * previous PMI context or an (I)RET happened between the record and - * PMI. - */ - perf_sample_save_callchain(data, event, iregs); + __setup_perf_sample_data(event, iregs, data); *regs = *iregs; - /* The ip in basic is EventingIP */ - set_linear_ip(regs, basic->ip); - regs->flags = PERF_EFLAGS_EXACT; - if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { - if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = basic->retire_latency; - else - data->weight.var3_w = 0; - } + /* basic group */ + retire = x86_pmu.flags & PMU_FL_RETIRE_LATENCY ? + basic->retire_latency : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); /* * The record for MEMINFO is in front of GP @@ -2122,54 +2284,20 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, gprs = next_record; next_record = gprs + 1; - if (event->attr.precise_ip < 2) { - set_linear_ip(regs, gprs->ip); - regs->flags &= ~PERF_EFLAGS_EXACT; - } - - if (sample_type & (PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)) - adaptive_pebs_save_regs(regs, gprs); + __setup_pebs_gpr_group(event, regs, gprs, sample_type); } if (format_group & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? - meminfo->cache_latency : meminfo->mem_latency; - - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) - data->weight.var2_w = meminfo->instr_latency; - - /* - * Although meminfo::latency is defined as a u64, - * only the lower 32 bits include the valid data - * in practice on Ice Lake and earlier platforms. 
- */ - if (sample_type & PERF_SAMPLE_WEIGHT) { - data->weight.full = latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } else { - data->weight.var1_dw = (u32)latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); - } - - data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; - } - - if (sample_type & PERF_SAMPLE_DATA_SRC) { - data->data_src.val = get_data_src(event, meminfo->aux); - data->sample_flags |= PERF_SAMPLE_DATA_SRC; - } - - if (sample_type & PERF_SAMPLE_ADDR_TYPE) { - data->addr = meminfo->address; - data->sample_flags |= PERF_SAMPLE_ADDR; - } - - if (sample_type & PERF_SAMPLE_TRANSACTION) { - data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, - gprs ? gprs->ax : 0); - data->sample_flags |= PERF_SAMPLE_TRANSACTION; - } + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; + u64 instr_latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->instr_latency : 0; + u64 ax = gprs ? gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, latency, + instr_latency, meminfo->address, + meminfo->aux, meminfo->tsx_tuning, + ax); } if (format_group & PEBS_DATACFG_XMMS) { @@ -2220,6 +2348,135 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, format_group); } +static inline bool arch_pebs_record_continued(struct arch_pebs_header *header) +{ + /* Continue bit or null PEBS record indicates fragment follows. */ + return header->cont || !(header->format & GENMASK_ULL(63, 16)); +} + +static void setup_arch_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, + void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 sample_type = event->attr.sample_type; + struct arch_pebs_header *header = NULL; + struct arch_pebs_aux *meminfo = NULL; + struct arch_pebs_gprs *gprs = NULL; + struct x86_perf_regs *perf_regs; + void *next_record; + void *at = __pebs; + + if (at == NULL) + return; + + perf_regs = container_of(regs, struct x86_perf_regs, regs); + perf_regs->xmm_regs = NULL; + + __setup_perf_sample_data(event, iregs, data); + + *regs = *iregs; + +again: + header = at; + next_record = at + sizeof(struct arch_pebs_header); + if (header->basic) { + struct arch_pebs_basic *basic = next_record; + u16 retire = 0; + + next_record = basic + 1; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + retire = basic->valid ? basic->retire : 0; + __setup_pebs_basic_group(event, regs, data, sample_type, + basic->ip, basic->tsc, retire); + } + + /* + * The record for MEMINFO is in front of GP + * But PERF_SAMPLE_TRANSACTION needs gprs->ax. + * Save the pointer here but process later. + */ + if (header->aux) { + meminfo = next_record; + next_record = meminfo + 1; + } + + if (header->gpr) { + gprs = next_record; + next_record = gprs + 1; + + __setup_pebs_gpr_group(event, regs, + (struct pebs_gprs *)gprs, + sample_type); + } + + if (header->aux) { + u64 ax = gprs ? 
gprs->ax : 0; + + __setup_pebs_meminfo_group(event, data, sample_type, + meminfo->cache_latency, + meminfo->instr_latency, + meminfo->address, meminfo->aux, + meminfo->tsx_tuning, ax); + } + + if (header->xmm) { + struct pebs_xmm *xmm; + + next_record += sizeof(struct arch_pebs_xer_header); + + xmm = next_record; + perf_regs->xmm_regs = xmm->xmm; + next_record = xmm + 1; + } + + if (header->lbr) { + struct arch_pebs_lbr_header *lbr_header = next_record; + struct lbr_entry *lbr; + int num_lbr; + + next_record = lbr_header + 1; + lbr = next_record; + + num_lbr = header->lbr == ARCH_PEBS_LBR_NUM_VAR ? + lbr_header->depth : + header->lbr * ARCH_PEBS_BASE_LBR_ENTRIES; + next_record += num_lbr * sizeof(struct lbr_entry); + + if (has_branch_stack(event)) { + intel_pmu_store_pebs_lbrs(lbr); + intel_pmu_lbr_save_brstack(data, cpuc, event); + } + } + + if (header->cntr) { + struct arch_pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct arch_pebs_cntr_header); + + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, + (struct pebs_cntr_header *)cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + + /* Parse followed fragments if there are. */ + if (arch_pebs_record_continued(header)) { + at = at + header->size; + goto again; + } +} + static inline void * get_next_pebs_record_by_bit(void *base, void *top, int bit) { @@ -2602,6 +2859,57 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } } +static __always_inline void +__intel_pmu_handle_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, u64 pebs_status, + short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; + + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, + last[bit], setup_sample); + } + + last[bit] = at; + } +} + +static __always_inline void +__intel_pmu_handle_last_pebs_record(struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + u64 mask, short *counts, void **last, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + int bit; + + for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { + if (!counts[bit]) + continue; + + event = cpuc->events[bit]; + + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_sample); + } + +} + static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; @@ -2611,9 +2919,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; struct pebs_basic *basic; - struct perf_event *event; void *base, *at, *top; - int bit; u64 mask; if (!x86_pmu.pebs_active) @@ -2626,6 +2932,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d mask = hybrid(cpuc->pmu, pebs_events_mask) | (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); + mask 
&= cpuc->pebs_enabled; if (unlikely(base >= top)) { intel_pmu_pebs_event_update_no_drain(cpuc, mask); @@ -2643,38 +2950,114 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d if (basic->format_size != cpuc->pebs_record_size) continue; - pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { - event = cpuc->events[bit]; + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_pebs_adaptive_sample_data); + } - if (WARN_ON_ONCE(!event) || - WARN_ON_ONCE(!event->attr.precise_ip)) - continue; + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, counts, last, + setup_pebs_adaptive_sample_data); +} - if (counts[bit]++) { - __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], - setup_pebs_adaptive_sample_data); - } - last[bit] = at; - } +static void intel_pmu_drain_arch_pebs(struct pt_regs *iregs, + struct perf_sample_data *data) +{ + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union arch_pebs_index index; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *base, *at, *top; + u64 mask; + + rdmsrq(MSR_IA32_PEBS_INDEX, index.whole); + + if (unlikely(!index.wr)) { + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + return; } - for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (!counts[bit]) + base = cpuc->pebs_vaddr; + top = cpuc->pebs_vaddr + (index.wr << ARCH_PEBS_INDEX_WR_SHIFT); + + index.wr = 0; + index.full = 0; + index.en = 1; + if (cpuc->n_pebs == cpuc->n_large_pebs) + index.thresh = ARCH_PEBS_THRESH_MULTI; + else + index.thresh = ARCH_PEBS_THRESH_SINGLE; + wrmsrq(MSR_IA32_PEBS_INDEX, index.whole); + + mask = hybrid(cpuc->pmu, arch_pebs_cap).counters & cpuc->pebs_enabled; + + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. */ + for (at = base; at < top;) { + struct arch_pebs_header *header; + struct arch_pebs_basic *basic; + u64 pebs_status; + + header = at; + + if (WARN_ON_ONCE(!header->size)) + break; + + /* 1st fragment or single record must have basic group */ + if (!header->basic) { + at += header->size; continue; + } - event = cpuc->events[bit]; + basic = at + sizeof(struct arch_pebs_header); + pebs_status = mask & basic->applicable_counters; + __intel_pmu_handle_pebs_record(iregs, regs, data, at, + pebs_status, counts, last, + setup_arch_pebs_sample_data); + + /* Skip non-last fragments */ + while (arch_pebs_record_continued(header)) { + if (!header->size) + break; + at += header->size; + header = at; + } - __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], - counts[bit], setup_pebs_adaptive_sample_data); + /* Skip last fragment or the single record */ + at += header->size; } + + __intel_pmu_handle_last_pebs_record(iregs, regs, data, mask, + counts, last, + setup_arch_pebs_sample_data); +} + +static void __init intel_arch_pebs_init(void) +{ + /* + * Current hybrid platforms always both support arch-PEBS or not + * on all kinds of cores. So directly set x86_pmu.arch_pebs flag + * if boot cpu supports arch-PEBS. 
+ */ + x86_pmu.arch_pebs = 1; + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; + x86_pmu.drain_pebs = intel_pmu_drain_arch_pebs; + x86_pmu.pebs_capable = ~0ULL; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + + x86_pmu.pebs_enable = __intel_pmu_pebs_enable; + x86_pmu.pebs_disable = __intel_pmu_pebs_disable; } /* * PEBS probe and setup */ -void __init intel_pebs_init(void) +static void __init intel_ds_pebs_init(void) { /* * No support for 32bit formats @@ -2736,10 +3119,8 @@ void __init intel_pebs_init(void) break; case 6: - if (x86_pmu.intel_cap.pebs_baseline) { + if (x86_pmu.intel_cap.pebs_baseline) x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; - x86_pmu.late_setup = intel_pmu_late_setup; - } fallthrough; case 5: x86_pmu.pebs_ept = 1; @@ -2789,6 +3170,14 @@ void __init intel_pebs_init(void) } } +void __init intel_pebs_init(void) +{ + if (x86_pmu.intel_cap.pebs_format == 0xf) + intel_arch_pebs_init(); + else + intel_ds_pebs_init(); +} + void perf_restore_debug_store(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2b969386dcdd..3161ec0a3416 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -283,8 +283,9 @@ struct cpu_hw_events { * Intel DebugStore bits */ struct debug_store *ds; - void *ds_pebs_vaddr; void *ds_bts_vaddr; + /* DS based PEBS or arch-PEBS buffer address */ + void *pebs_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; @@ -303,6 +304,8 @@ struct cpu_hw_events { /* Intel ACR configuration */ u64 acr_cfg_b[X86_PMC_IDX_MAX]; u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* Cached CFG_C values */ + u64 cfg_c_val[X86_PMC_IDX_MAX]; /* * Intel LBR bits @@ -708,6 +711,12 @@ enum hybrid_pmu_type { hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; +struct arch_pebs_cap { + u64 caps; + u64 counters; + u64 pdists; +}; + struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -752,6 +761,8 @@ struct x86_hybrid_pmu { mid_ack :1, enabled_ack :1; + struct arch_pebs_cap arch_pebs_cap; + u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; }; @@ -906,7 +917,7 @@ struct x86_pmu { union perf_capabilities intel_cap; /* - * Intel DebugStore bits + * Intel DebugStore and PEBS bits */ unsigned int bts :1, bts_active :1, @@ -917,7 +928,8 @@ struct x86_pmu { pebs_no_tlb :1, pebs_no_isolation :1, pebs_block :1, - pebs_ept :1; + pebs_ept :1, + arch_pebs :1; int pebs_record_size; int pebs_buffer_size; u64 pebs_events_mask; @@ -930,6 +942,11 @@ struct x86_pmu { u64 pebs_capable; /* + * Intel Architectural PEBS + */ + struct arch_pebs_cap arch_pebs_cap; + + /* * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, @@ -1124,7 +1141,6 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } -int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; @@ -1217,7 +1233,7 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); -int x86_pmu_max_precise(void); +int x86_pmu_max_precise(struct pmu *pmu); void hw_perf_lbr_event_destroy(struct perf_event *event); @@ -1604,6 +1620,14 @@ extern void intel_cpuc_finish(struct cpu_hw_events *cpuc); int intel_pmu_init(void); +int alloc_arch_pebs_buf_on_cpu(int cpu); + +void release_arch_pebs_buf_on_cpu(int cpu); + +void init_arch_pebs_on_cpu(int cpu); + +void fini_arch_pebs_on_cpu(int cpu); + void init_debug_store_on_cpu(int cpu); void fini_debug_store_on_cpu(int cpu); @@ -1760,6 +1784,8 @@ void intel_pmu_pebs_data_source_cmt(void); 
void intel_pmu_pebs_data_source_lnl(void); +u64 intel_get_arch_pebs_data_config(struct perf_event *event); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); @@ -1792,6 +1818,11 @@ static inline int intel_pmu_max_num_pebs(struct pmu *pmu) return fls((u32)hybrid(pmu, pebs_events_mask)); } +static inline bool intel_pmu_has_pebs(void) +{ + return x86_pmu.ds_pebs || x86_pmu.arch_pebs; +} + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 54368a43abf6..4733e9064ee5 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -44,4 +44,6 @@ enum insn_mmio_type { enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); +bool insn_is_nop(struct insn *insn); + #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 091f88c8254d..846d21c1a7f8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -312,7 +312,6 @@ static inline int insn_offset_immediate(struct insn *insn) /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. - * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix @@ -321,8 +320,8 @@ static inline int insn_offset_immediate(struct insn *insn) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ -#define for_each_insn_prefix(insn, idx, prefix) \ - for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) +#define for_each_insn_prefix(insn, prefix) \ + for (int idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 5dbeac48a5b9..695f87efbeb8 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -4,7 +4,15 @@ #include <linux/percpu-defs.h> #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SHIFT 4 +#define PEBS_BUFFER_SIZE (PAGE_SIZE << PEBS_BUFFER_SHIFT) + +/* + * The largest PEBS record could consume a page, ensure + * a record at least can be written after triggering PMI. 
+ */ +#define ARCH_PEBS_THRESH_MULTI ((PEBS_BUFFER_SIZE - PAGE_SIZE) >> PEBS_BUFFER_SHIFT) +#define ARCH_PEBS_THRESH_SINGLE 1 /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9e1720d73244..65cc528fbad8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -327,6 +327,26 @@ PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \ PERF_CAP_PEBS_TIMING_INFO) +/* Arch PEBS */ +#define MSR_IA32_PEBS_BASE 0x000003f4 +#define MSR_IA32_PEBS_INDEX 0x000003f5 +#define ARCH_PEBS_OFFSET_MASK 0x7fffff +#define ARCH_PEBS_INDEX_WR_SHIFT 4 + +#define ARCH_PEBS_RELOAD 0xffffffff +#define ARCH_PEBS_CNTR_ALLOW BIT_ULL(35) +#define ARCH_PEBS_CNTR_GP BIT_ULL(36) +#define ARCH_PEBS_CNTR_FIXED BIT_ULL(37) +#define ARCH_PEBS_CNTR_METRICS BIT_ULL(38) +#define ARCH_PEBS_LBR_SHIFT 40 +#define ARCH_PEBS_LBR (0x3ull << ARCH_PEBS_LBR_SHIFT) +#define ARCH_PEBS_VECR_XMM BIT_ULL(49) +#define ARCH_PEBS_GPR BIT_ULL(61) +#define ARCH_PEBS_AUX BIT_ULL(62) +#define ARCH_PEBS_EN BIT_ULL(63) +#define ARCH_PEBS_CNTR_MASK (ARCH_PEBS_CNTR_GP | ARCH_PEBS_CNTR_FIXED | \ + ARCH_PEBS_CNTR_METRICS) + #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) #define RTIT_CTL_CYCLEACC BIT(1) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 49a4d442f3fc..7276ba70c88a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -141,16 +141,16 @@ #define ARCH_PERFMON_EVENTS_COUNT 7 #define PEBS_DATACFG_MEMINFO BIT_ULL(0) -#define PEBS_DATACFG_GP BIT_ULL(1) +#define PEBS_DATACFG_GP BIT_ULL(1) #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) -#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_METRICS BIT_ULL(5) +#define PEBS_DATACFG_LBR_SHIFT 24 #define PEBS_DATACFG_CNTR_SHIFT 32 #define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) #define PEBS_DATACFG_FIX_SHIFT 48 #define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) -#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -200,6 +200,8 @@ union cpuid10_edx { #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 #define ARCH_PERFMON_ACR_LEAF 0x2 +#define ARCH_PERFMON_PEBS_CAP_LEAF 0x4 +#define ARCH_PERFMON_PEBS_COUNTER_LEAF 0x5 union cpuid35_eax { struct { @@ -210,7 +212,10 @@ union cpuid35_eax { unsigned int acr_subleaf:1; /* Events Sub-Leaf */ unsigned int events_subleaf:1; - unsigned int reserved:28; + /* arch-PEBS Sub-Leaves */ + unsigned int pebs_caps_subleaf:1; + unsigned int pebs_cnts_subleaf:1; + unsigned int reserved:26; } split; unsigned int full; }; @@ -432,6 +437,8 @@ static inline bool is_topdown_idx(int idx) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) #define GLOBAL_STATUS_TRACE_TOPAPMI_BIT 55 #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT) +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT 54 +#define GLOBAL_STATUS_ARCH_PEBS_THRESHOLD BIT_ULL(GLOBAL_STATUS_ARCH_PEBS_THRESHOLD_BIT) #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48 #define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48) @@ -503,6 +510,107 @@ struct pebs_cntr_header { #define INTEL_CNTR_METRICS 0x3 /* + * Arch PEBS + */ +union arch_pebs_index { + struct { + u64 rsvd:4, + wr:23, + rsvd2:4, + full:1, + en:1, + rsvd3:3, + thresh:23, + rsvd4:5; + }; + u64 whole; +}; + +struct arch_pebs_header { + 
union { + u64 format; + struct { + u64 size:16, /* Record size */ + rsvd:14, + mode:1, /* 64BIT_MODE */ + cont:1, + rsvd2:3, + cntr:5, + lbr:2, + rsvd3:7, + xmm:1, + ymmh:1, + rsvd4:2, + opmask:1, + zmmh:1, + h16zmm:1, + rsvd5:5, + gpr:1, + aux:1, + basic:1; + }; + }; + u64 rsvd6; +}; + +struct arch_pebs_basic { + u64 ip; + u64 applicable_counters; + u64 tsc; + u64 retire :16, /* Retire Latency */ + valid :1, + rsvd :47; + u64 rsvd2; + u64 rsvd3; +}; + +struct arch_pebs_aux { + u64 address; + u64 rsvd; + u64 rsvd2; + u64 rsvd3; + u64 rsvd4; + u64 aux; + u64 instr_latency :16, + pad2 :16, + cache_latency :16, + pad3 :16; + u64 tsx_tuning; +}; + +struct arch_pebs_gprs { + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; + u64 r8, r9, r10, r11, r12, r13, r14, r15, ssp; + u64 rsvd; +}; + +struct arch_pebs_xer_header { + u64 xstate; + u64 rsvd; +}; + +#define ARCH_PEBS_LBR_NAN 0x0 +#define ARCH_PEBS_LBR_NUM_8 0x1 +#define ARCH_PEBS_LBR_NUM_16 0x2 +#define ARCH_PEBS_LBR_NUM_VAR 0x3 +#define ARCH_PEBS_BASE_LBR_ENTRIES 8 +struct arch_pebs_lbr_header { + u64 rsvd; + u64 ctl; + u64 depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; +}; + +struct arch_pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +/* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ #define EXT_PERFMON_DEBUG_FEATURES 0x80000022 diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h new file mode 100644 index 000000000000..12064284bc4e --- /dev/null +++ b/arch/x86/include/asm/unwind_user.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_UNWIND_USER_H +#define _ASM_X86_UNWIND_USER_H + +#ifdef CONFIG_HAVE_UNWIND_USER_FP + +#include <asm/ptrace.h> +#include <asm/uprobes.h> + +#define ARCH_INIT_USER_FP_FRAME(ws) \ + .cfa_off = 2*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = -2*(ws), \ + .use_fp = true, + +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) \ + .cfa_off = 1*(ws), \ + .ra_off = -1*(ws), \ + .fp_off = 0, \ + .use_fp = false, + +static inline int unwind_user_word_size(struct pt_regs *regs) +{ + /* We can't unwind VM86 stacks */ + if (regs->flags & X86_VM_MASK) + return 0; +#ifdef CONFIG_X86_64 + if (!user_64bit_mode(regs)) + return sizeof(int); +#endif + return sizeof(long); +} + +static inline bool unwind_user_at_function_start(struct pt_regs *regs) +{ + return is_uprobe_at_func_entry(regs); +} + +#endif /* CONFIG_HAVE_UNWIND_USER_FP */ + +#endif /* _ASM_X86_UNWIND_USER_H */ diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1ee2e5115955..362210c79998 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -62,4 +62,13 @@ struct arch_uprobe_task { unsigned int saved_tf; }; +#ifdef CONFIG_UPROBES +extern bool is_uprobe_at_func_entry(struct pt_regs *regs); +#else +static bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + return false; +} +#endif /* CONFIG_UPROBES */ + #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d19a3fd7cf04..9f354ebf76e4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -9,6 +9,7 @@ #include <asm/text-patching.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/ibt.h> #include <asm/set_memory.h> #include <asm/nmi.h> @@ -346,25 +347,6 @@ static void add_nop(u8 *buf, unsigned int len) } /* - * Matches NOP and NOPL, not any of the other possible NOPs. 
- */ -static bool insn_is_nop(struct insn *insn) -{ - /* Anything NOP, but no REP NOP */ - if (insn->opcode.bytes[0] == 0x90 && - (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) - return true; - - /* NOPL */ - if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) - return true; - - /* TODO: more nops */ - - return false; -} - -/* * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 3863d7709386..c1fac3a9fecc 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -141,7 +141,6 @@ bool can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; insn_byte_t prefix; - int i; if (search_exception_tables((unsigned long)addr)) return false; /* Page fault may occur on this address. */ @@ -154,7 +153,7 @@ bool can_boost(struct insn *insn, void *addr) if (insn->opcode.nbytes != 1) return false; - for_each_insn_prefix(insn, i, prefix) { + for_each_insn_prefix(insn, prefix) { insn_attr_t attr; attr = inat_get_opcode_attribute(prefix); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 845aeaf36b8d..7be8e361ca55 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -17,6 +17,7 @@ #include <linux/kdebug.h> #include <asm/processor.h> #include <asm/insn.h> +#include <asm/insn-eval.h> #include <asm/mmu_context.h> #include <asm/nops.h> @@ -258,9 +259,8 @@ static volatile u32 good_2byte_insns[256 / 32] = { static bool is_prefix_bad(struct insn *insn) { insn_byte_t p; - int i; - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); @@ -1158,35 +1158,12 @@ unlock: mmap_write_unlock(mm); } -static bool insn_is_nop(struct insn *insn) -{ - return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90; -} - -static bool insn_is_nopl(struct insn *insn) -{ - if (insn->opcode.nbytes != 2) - return false; - - if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f) - return false; - - if (!insn->modrm.nbytes) - return false; - - if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0) - return false; - - /* 0f 1f /0 - NOPL */ - return true; -} - static bool can_optimize(struct insn *insn, unsigned long vaddr) { if (!insn->x86_64 || insn->length != 5) return false; - if (!insn_is_nop(insn) && !insn_is_nopl(insn)) + if (!insn_is_nop(insn)) return false; /* We can't do cross page atomic writes yet. */ @@ -1426,19 +1403,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); insn_byte_t p; - int i; - /* x86_nops[insn->length]; same as jmp with .offs = 0 */ - if (insn->length <= ASM_NOP_MAX && - !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) + if (insn_is_nop(insn)) goto setup; switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ break; - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ - goto setup; case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); @@ -1463,7 +1435,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. 
*/ - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0x66) return -ENOTSUPP; } @@ -1819,3 +1791,35 @@ bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, else return regs->sp <= ret->stack; } + +/* + * Heuristic-based check if uprobe is installed at the function entry. + * + * Under assumption of user code being compiled with frame pointers, + * `push %rbp/%ebp` is a good indicator that we indeed are. + * + * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. + * If we get this wrong, captured stack trace might have one extra bogus + * entry, but the rest of stack trace will still be meaningful. + */ +bool is_uprobe_at_func_entry(struct pt_regs *regs) +{ + struct arch_uprobe *auprobe; + + if (!current->utask) + return false; + + auprobe = current->utask->auprobe; + if (!auprobe) + return false; + + /* push %rbp/%ebp */ + if (auprobe->insn[0] == 0x55) + return true; + + /* endbr64 (64-bit only) */ + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) + return true; + + return false; +} diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4e385cbfd444..e03eeec55cfe 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -63,11 +63,10 @@ static bool is_string_insn(struct insn *insn) bool insn_has_rep_prefix(struct insn *insn) { insn_byte_t p; - int i; insn_get_prefixes(insn); - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { if (p == 0xf2 || p == 0xf3) return true; } @@ -92,13 +91,13 @@ bool insn_has_rep_prefix(struct insn *insn) static int get_seg_reg_override_idx(struct insn *insn) { int idx = INAT_SEG_REG_DEFAULT; - int num_overrides = 0, i; + int num_overrides = 0; insn_byte_t p; insn_get_prefixes(insn); /* Look for any segment override prefixes. */ - for_each_insn_prefix(insn, i, p) { + for_each_insn_prefix(insn, p) { insn_attr_t attr; attr = inat_get_opcode_attribute(p); @@ -1676,3 +1675,147 @@ enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes) return type; } + +/* + * Recognise typical NOP patterns for both 32bit and 64bit. + * + * Notably: + * - NOP, but not: REP NOP aka PAUSE + * - NOPL + * - MOV %reg, %reg + * - LEA 0(%reg),%reg + * - JMP +0 + * + * Must not have false-positives; instructions identified as a NOP might be + * emulated as a NOP (uprobe) or Run Length Encoded in a larger NOP + * (alternatives). + * + * False-negatives are fine; need not be exhaustive. 
+ */ +bool insn_is_nop(struct insn *insn) +{ + u8 b3 = 0, x3 = 0, r3 = 0; + u8 b4 = 0, x4 = 0, r4 = 0, m = 0; + u8 modrm, modrm_mod, modrm_reg, modrm_rm; + u8 sib = 0, sib_scale, sib_index, sib_base; + u8 nrex, rex; + u8 p, rep = 0; + + if ((nrex = insn->rex_prefix.nbytes)) { + rex = insn->rex_prefix.bytes[nrex-1]; + + r3 = !!X86_REX_R(rex); + x3 = !!X86_REX_X(rex); + b3 = !!X86_REX_B(rex); + if (nrex > 1) { + r4 = !!X86_REX2_R(rex); + x4 = !!X86_REX2_X(rex); + b4 = !!X86_REX2_B(rex); + m = !!X86_REX2_M(rex); + } + + } else if (insn->vex_prefix.nbytes) { + /* + * Ignore VEX encoded NOPs + */ + return false; + } + + if (insn->modrm.nbytes) { + modrm = insn->modrm.bytes[0]; + modrm_mod = X86_MODRM_MOD(modrm); + modrm_reg = X86_MODRM_REG(modrm) + 8*r3 + 16*r4; + modrm_rm = X86_MODRM_RM(modrm) + 8*b3 + 16*b4; + modrm = 1; + } + + if (insn->sib.nbytes) { + sib = insn->sib.bytes[0]; + sib_scale = X86_SIB_SCALE(sib); + sib_index = X86_SIB_INDEX(sib) + 8*x3 + 16*x4; + sib_base = X86_SIB_BASE(sib) + 8*b3 + 16*b4; + sib = 1; + + modrm_rm = sib_base; + } + + for_each_insn_prefix(insn, p) { + if (p == 0xf3) /* REPE */ + rep = 1; + } + + /* + * Opcode map munging: + * + * REX2: 0 - single byte opcode + * 1 - 0f second byte opcode + */ + switch (m) { + case 0: break; + case 1: insn->opcode.value <<= 8; + insn->opcode.value |= 0x0f; + break; + default: + return false; + } + + switch (insn->opcode.bytes[0]) { + case 0x0f: /* 2nd byte */ + break; + + case 0x89: /* MOV */ + if (modrm_mod != 3) /* register-direct */ + return false; + + /* native size */ + if (insn->opnd_bytes != 4 * (1 + insn->x86_64)) + return false; + + return modrm_reg == modrm_rm; /* MOV %reg, %reg */ + + case 0x8d: /* LEA */ + if (modrm_mod == 0 || modrm_mod == 3) /* register-indirect with disp */ + return false; + + /* native size */ + if (insn->opnd_bytes != 4 * (1 + insn->x86_64)) + return false; + + if (insn->displacement.value != 0) + return false; + + if (sib && (sib_scale != 0 || sib_index != 4)) /* (%reg, %eiz, 1) */ + return false; + + for_each_insn_prefix(insn, p) { + if (p != 0x3e) /* DS */ + return false; + } + + return modrm_reg == modrm_rm; /* LEA 0(%reg), %reg */ + + case 0x90: /* NOP */ + if (b3 || b4) /* XCHG %r{8,16,24},%rax */ + return false; + + if (rep) /* REP NOP := PAUSE */ + return false; + + return true; + + case 0xe9: /* JMP.d32 */ + case 0xeb: /* JMP.d8 */ + return insn->immediate.value == 0; /* JMP +0 */ + + default: + return false; + } + + switch (insn->opcode.bytes[1]) { + case 0x1f: + return modrm_reg == 0; /* 0f 1f /0 -- NOPL */ + + default: + return false; + } +} |
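
A note on the new arch-PEBS buffer bookkeeping above: with 4 KiB pages, PEBS_BUFFER_SIZE is 64 KiB, and ARCH_PEBS_THRESH_MULTI places the PMI threshold one page below the end of the buffer, so even the largest possible record can still be written after the threshold interrupt fires (ARCH_PEBS_THRESH_SINGLE keeps single-record mode at one unit). The sketch below decodes a raw MSR_IA32_PEBS_INDEX value through the new union arch_pebs_index; the assumption that the wr field counts 16-byte units (hence ARCH_PEBS_INDEX_WR_SHIFT == 4) is an inference from the definitions in this diff, not something the patch states.

```c
#include <linux/types.h>
#include <asm/msr-index.h>
#include <asm/perf_event.h>	/* union arch_pebs_index */

/*
 * Illustrative helper, not part of the patch: translate a raw
 * MSR_IA32_PEBS_INDEX value into a byte offset into the arch-PEBS
 * buffer.  Assumes @wr is kept in 16-byte units, which is only an
 * inference from ARCH_PEBS_INDEX_WR_SHIFT.
 */
static u64 arch_pebs_wr_offset(u64 raw_index)
{
	union arch_pebs_index idx = { .whole = raw_index };

	if (!idx.en)		/* arch-PEBS not enabled */
		return 0;

	return (u64)idx.wr << ARCH_PEBS_INDEX_WR_SHIFT;
}
```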
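
The regrouped PEBS_DATACFG_* bits above (the CNTR/METRICS flags plus the CNTR/FIX shift-and-mask pairs) describe how counter-snapshot groups are encoded into the PEBS data configuration word. The following is only one plausible reading of those definitions, not a copy of the driver code: pack a general-purpose counter bitmap and a fixed counter bitmap into a pebs_data_cfg value.

```c
#include <linux/types.h>
#include <asm/perf_event.h>

/*
 * Sketch based solely on the PEBS_DATACFG_* definitions in this diff;
 * the real packing lives in arch/x86/events/intel/ds.c and may differ
 * in detail.
 */
static u64 pebs_cntr_data_cfg(u64 gp_counter_mask, u64 fixed_counter_mask)
{
	u64 cfg = PEBS_DATACFG_CNTR;

	cfg |= (gp_counter_mask    & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT;
	cfg |= (fixed_counter_mask & PEBS_DATACFG_FIX_MASK)  << PEBS_DATACFG_FIX_SHIFT;

	return cfg;
}
```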
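
The new <asm/unwind_user.h> describes two user frame layouts for the generic unwinder: ARCH_INIT_USER_FP_FRAME() for the usual `push %rbp; mov %rsp,%rbp` prologue, and ARCH_INIT_USER_FP_ENTRY_FRAME() for the function-entry case flagged via is_uprobe_at_func_entry(), where only the return address is on the stack and the frame pointer has not been saved yet. Below is a minimal sketch of one unwind step under the regular layout; the state struct and read_user_word() helper are stand-ins for the generic unwind_user core, not its real types.

```c
#include <linux/types.h>

/* Stand-in for whatever copy-from-user primitive the unwinder really uses. */
bool read_user_word(unsigned long addr, unsigned long *val);

struct fp_unwind_state {
	unsigned long ip;	/* recovered return address */
	unsigned long fp;	/* frame pointer */
	unsigned long sp;	/* CFA after the step */
};

/* One 64-bit frame-pointer step: cfa = fp + 2*ws, ra at cfa - ws, old fp at cfa - 2*ws. */
static bool fp_unwind_step(struct fp_unwind_state *s)
{
	const unsigned long ws = sizeof(long);		/* unwind_user_word_size() */
	unsigned long cfa = s->fp + 2 * ws;		/* .cfa_off =  2*ws */
	unsigned long ra, prev_fp;

	if (!read_user_word(cfa - 1 * ws, &ra) ||	/* .ra_off  = -1*ws */
	    !read_user_word(cfa - 2 * ws, &prev_fp))	/* .fp_off  = -2*ws */
		return false;

	s->ip = ra;
	s->fp = prev_fp;
	s->sp = cfa;
	return true;
}
```

In the entry-frame case the arithmetic is the same except the CFA is taken from the stack pointer (cfa = sp + ws) and the return address sits right at the top of the stack, which is exactly what the second macro variant encodes.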
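
The for_each_insn_prefix() simplification above moves the loop index inside the macro, so every caller drops its local `int i;`. A minimal caller in the new style, mirroring the updated sites in uprobes.c and insn-eval.c (the wrapper name is made up for this sketch):

```c
#include <asm/insn.h>

/* Sketch: does the decoded instruction carry an operand-size override prefix? */
static bool has_operand_size_override(struct insn *insn)
{
	insn_byte_t p;

	insn_get_prefixes(insn);

	for_each_insn_prefix(insn, p) {
		if (p == 0x66)
			return true;
	}

	return false;
}
```

Because each expansion now declares its own block-local index, callers no longer need to thread a separately named counter through every prefix scan.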
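
insn_is_nop() is now a shared helper exported via <asm/insn-eval.h> and used by both the alternatives NOP handling and the uprobe optimization paths. A small sketch of querying it on decoded kernel text; insn_decode() and INSN_MODE_KERN are the existing decoder API, the wrapper itself is illustrative:

```c
#include <asm/insn.h>
#include <asm/insn-eval.h>

/*
 * Sketch: decode up to MAX_INSN_SIZE bytes at @addr and report whether
 * the instruction is one of the NOP forms insn_is_nop() recognises
 * (NOP, NOPL, MOV %reg,%reg, LEA 0(%reg),%reg, JMP +0).
 */
static bool kernel_insn_is_nop(const void *addr)
{
	struct insn insn;

	if (insn_decode(&insn, addr, MAX_INSN_SIZE, INSN_MODE_KERN) < 0)
		return false;

	return insn_is_nop(&insn);
}
```

Per the comment block above, false positives are not allowed because a recognised "NOP" may be emulated away (uprobes) or folded into a larger NOP (alternatives); that is why `f3 90` (PAUSE) and VEX-encoded NOPs are rejected while `66 90` and `0f 1f /0` are accepted.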