summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu/perf_event_intel.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-01-11 14:39:17 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-11 14:39:17 -0800
commit5cb52b5e1654f3f1ed9c32e34456d98559c85aa0 (patch)
tree737c73d6aef99a17f57c2974f1e2a142a5f1a377 /arch/x86/kernel/cpu/perf_event_intel.c
parent24af98c4cf5f5e69266e270c7f3fb34b82ff6656 (diff)
parent3eb9ede23bdd96e9ba60e2b4d4d17a7c35d58448 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar: "Kernel side changes: - Intel Knights Landing support. (Harish Chegondi) - Intel Broadwell-EP uncore PMU support. (Kan Liang) - Core code improvements. (Peter Zijlstra.) - Event filter, LBR and PEBS fixes. (Stephane Eranian) - Enable cycles:pp on Intel Atom. (Stephane Eranian) - Add cycles:ppp support for Skylake. (Andi Kleen) - Various x86 NMI overhead optimizations. (Andi Kleen) - Intel PT enhancements. (Takao Indoh) - AMD cache events fix. (Vince Weaver) Tons of tooling changes: - Show random perf tool tips in the 'perf report' bottom line (Namhyung Kim) - perf report now defaults to --group if the perf.data file has grouped events, try it with: # perf record -e '{cycles,instructions}' -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.093 MB perf.data (1247 samples) ] # perf report # Samples: 1K of event 'anon group { cycles, instructions }' # Event count (approx.): 1955219195 # # Overhead Command Shared Object Symbol 2.86% 0.22% swapper [kernel.kallsyms] [k] intel_idle 1.05% 0.33% firefox libxul.so [.] js::SetObjectElement 1.05% 0.00% kworker/0:3 [kernel.kallsyms] [k] gen6_ring_get_seqno 0.88% 0.17% chrome chrome [.] 0x0000000000ee27ab 0.65% 0.86% firefox libxul.so [.] js::ValueToId<(js::AllowGC)1> 0.64% 0.23% JS Helper libxul.so [.] js::SplayTree<js::jit::LiveRange*, js::jit::LiveRange>::splay 0.62% 1.27% firefox libxul.so [.] js::GetIterator 0.61% 1.74% firefox libxul.so [.] js::NativeSetProperty 0.61% 0.31% firefox libxul.so [.] js::SetPropertyByDefining - Introduce the 'perf stat record/report' workflow: Generate perf.data files from 'perf stat', to tap into the scripting capabilities perf has instead of defining a 'perf stat' specific scripting support to calculate event ratios, etc. Simple example: $ perf stat record -e cycles usleep 1 Performance counter stats for 'usleep 1': 1,134,996 cycles 0.000670644 seconds time elapsed $ perf stat report Performance counter stats for '/home/acme/bin/perf stat record -e cycles usleep 1': 1,134,996 cycles 0.000670644 seconds time elapsed $ It generates PERF_RECORD_ userspace records to store the details: $ perf report -D | grep PERF_RECORD 0xf0 [0x28]: PERF_RECORD_THREAD_MAP nr: 1 thread: 27637 0x118 [0x12]: PERF_RECORD_CPU_MAP nr: 1 cpu: 65535 0x12a [0x40]: PERF_RECORD_STAT_CONFIG 0x16a [0x30]: PERF_RECORD_STAT -1 -1 0x19a [0x40]: PERF_RECORD_MMAP -1/0: [0xffffffff81000000(0x1f000000) @ 0xffffffff81000000]: x [kernel.kallsyms]_text 0x1da [0x18]: PERF_RECORD_STAT_ROUND [acme@ssdandy linux]$ An effort was made to make perf.data files generated like this to not generate cryptic messages when processed by older tools. The 'perf script' bits need rebasing, will go up later. - Make command line options always available, even when they depend on some feature being enabled, warning the user about use of such options (Wang Nan) - Support hw breakpoint events (mem:0xAddress) in the default output mode in 'perf script' (Wang Nan) - Fixes and improvements for supporting annotating ARM binaries, support ARM call and jump instructions, more work needed to have arch specific stuff separated into tools/perf/arch/*/annotate/ (Russell King) - Add initial 'perf config' command, for now just with a --list command to the contents of the configuration file in use and a basic man page describing its format, commands for doing edits and detailed documentation are being reviewed and proof-read. (Taeung Song) - Allows BPF scriptlets specify arguments to be fetched using DWARF info, using a prologue generated at compile/build time (He Kuang, Wang Nan) - Allow attaching BPF scriptlets to module symbols (Wang Nan) - Allow attaching BPF scriptlets to userspace code using uprobe (Wang Nan) - BPF programs now can specify 'perf probe' tunables via its section name, separating key=val values using semicolons (Wang Nan) Testing some of these new BPF features: Use case: get callchains when receiving SSL packets, filter then in the kernel, at arbitrary place. # cat ssl.bpf.c #define SEC(NAME) __attribute__((section(NAME), used)) struct pt_regs; SEC("func=__inet_lookup_established hnum") int func(struct pt_regs *ctx, int err, unsigned short port) { return err == 0 && port == 443; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; # # perf record -a -g -e ssl.bpf.c ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.787 MB perf.data (3 samples) ] # perf script | head -30 swapper 0 [000] 58783.268118: perf_bpf_probe:func: (ffffffff816a0f60) hnum=0x1bb 8a0f61 __inet_lookup_established (/lib/modules/4.3.0+/build/vmlinux) 896def ip_rcv_finish (/lib/modules/4.3.0+/build/vmlinux) 8976c2 ip_rcv (/lib/modules/4.3.0+/build/vmlinux) 855eba __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux) 8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux) 8572a8 process_backlog (/lib/modules/4.3.0+/build/vmlinux) 856b11 net_rx_action (/lib/modules/4.3.0+/build/vmlinux) 2a284b __do_softirq (/lib/modules/4.3.0+/build/vmlinux) 2a2ba3 irq_exit (/lib/modules/4.3.0+/build/vmlinux) 96b7a4 do_IRQ (/lib/modules/4.3.0+/build/vmlinux) 969807 ret_from_intr (/lib/modules/4.3.0+/build/vmlinux) 2dede5 cpu_startup_entry (/lib/modules/4.3.0+/build/vmlinux) 95d5bc rest_init (/lib/modules/4.3.0+/build/vmlinux) 1163ffa start_kernel ([kernel.vmlinux].init.text) 11634d7 x86_64_start_reservations ([kernel.vmlinux].init.text) 1163623 x86_64_start_kernel ([kernel.vmlinux].init.text) qemu-system-x86 9178 [003] 58785.792417: perf_bpf_probe:func: (ffffffff816a0f60) hnum=0x1bb 8a0f61 __inet_lookup_established (/lib/modules/4.3.0+/build/vmlinux) 896def ip_rcv_finish (/lib/modules/4.3.0+/build/vmlinux) 8976c2 ip_rcv (/lib/modules/4.3.0+/build/vmlinux) 855eba __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux) 8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux) 856660 netif_receive_skb_internal (/lib/modules/4.3.0+/build/vmlinux) 8566ec netif_receive_skb_sk (/lib/modules/4.3.0+/build/vmlinux) 430a br_handle_frame_finish ([bridge]) 48bc br_handle_frame ([bridge]) 855f44 __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux) 8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux) # - Use 'perf probe' various options to list functions, see what variables can be collected at any given point, experiment first collecting without a filter, then filter, use it together with 'perf trace', 'perf top', with or without callchains, if it explodes, please tell us! - Introduce a new callchain mode: "folded", that will list per line representations of all callchains for a give histogram entry, facilitating 'perf report' output processing by other tools, such as Brendan Gregg's flamegraph tools (Namhyung Kim) E.g: # perf report | grep -v ^# | head 18.37% 0.00% swapper [kernel.kallsyms] [k] cpu_startup_entry | ---cpu_startup_entry | |--12.07%--start_secondary | --6.30%--rest_init start_kernel x86_64_start_reservations x86_64_start_kernel # Becomes, in "folded" mode: # perf report -g folded | grep -v ^# | head -5 18.37% 0.00% swapper [kernel.kallsyms] [k] cpu_startup_entry 12.07% cpu_startup_entry;start_secondary 6.30% cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel 16.90% 0.00% swapper [kernel.kallsyms] [k] call_cpuidle 11.23% call_cpuidle;cpu_startup_entry;start_secondary 5.67% call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel 16.90% 0.00% swapper [kernel.kallsyms] [k] cpuidle_enter 11.23% cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary 5.67% cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel 15.12% 0.00% swapper [kernel.kallsyms] [k] cpuidle_enter_state # The user can also select one of "count", "period" or "percent" as the first column. ... and lots of infrastructure enhancements, plus fixes and other changes, features I failed to list - see the shortlog and the git log for details" * 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (271 commits) perf evlist: Add --trace-fields option to show trace fields perf record: Store data mmaps for dwarf unwind perf libdw: Check for mmaps also in MAP__VARIABLE tree perf unwind: Check for mmaps also in MAP__VARIABLE tree perf unwind: Use find_map function in access_dso_mem perf evlist: Remove perf_evlist__(enable|disable)_event functions perf evlist: Make perf_evlist__open() open evsels with their cpus and threads (like perf record does) perf report: Show random usage tip on the help line perf hists: Export a couple of hist functions perf diff: Use perf_hpp__register_sort_field interface perf tools: Add overhead/overhead_children keys defaults via string perf tools: Remove list entry from struct sort_entry perf tools: Include all tools/lib directory for tags/cscope/TAGS targets perf script: Align event name properly perf tools: Add missing headers in perf's MANIFEST perf tools: Do not show trace command if it's not compiled in perf report: Change default to use event group view perf top: Decay periods in callchains tools lib: Move bitmap.[ch] from tools/perf/ to tools/{lib,include}/ tools lib: Sync tools/lib/find_bit.c with the kernel ...
Diffstat (limited to 'arch/x86/kernel/cpu/perf_event_intel.c')
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c115
1 files changed, 110 insertions, 5 deletions
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index e2a430021e46..a667078a5180 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -185,6 +185,14 @@ struct event_constraint intel_skl_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7,
+ MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7,
+ MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -255,7 +263,7 @@ struct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
EVENT_CONSTRAINT_END
};
@@ -1457,6 +1465,42 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
+#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL BIT_ULL(21)
+#define KNL_MCDRAM_FAR BIT_ULL(22)
+#define KNL_DDR_LOCAL BIT_ULL(23)
+#define KNL_DDR_FAR BIT_ULL(24)
+#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+ KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ SLM_DMND_READ
+#define KNL_L2_WRITE SLM_DMND_WRITE
+#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS SLM_LLC_ACCESS
+#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+ KNL_DRAM_ANY | SNB_SNP_ANY | \
+ SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
+ },
+ },
+};
+
/*
* Use from PMIs where the LBRs are already disabled.
*/
@@ -2475,6 +2519,44 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
}
}
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+ * (0x01c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * The PREC_DIST event has special support to minimize sample
+ * shadowing effects. One drawback is that it can be
+ * only programmed on counter 1, but that seems like an
+ * acceptable trade off.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_snb(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_core2(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
{
unsigned long flags = x86_pmu.free_running_flags;
@@ -3332,6 +3414,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_gen_event_constraints;
x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
pr_cont("Atom events, ");
break;
@@ -3431,7 +3514,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_ivb_event_constraints;
x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
if (boot_cpu_data.x86_model == 62)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
@@ -3464,7 +3548,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3499,7 +3584,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_bdw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3511,6 +3597,24 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 87: /* Knights Landing Xeon Phi */
+ memcpy(hw_cache_event_ids,
+ slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs,
+ knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_knl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_knl_extra_regs;
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ pr_cont("Knights Landing events, ");
+ break;
+
case 78: /* 14nm Skylake Mobile */
case 94: /* 14nm Skylake Desktop */
x86_pmu.late_ack = true;
@@ -3521,7 +3625,8 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_skl_event_constraints;
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
x86_pmu.extra_regs = intel_skl_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+ x86_pmu.pebs_prec_dist = true;
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;