| author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-14 16:31:23 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-14 16:31:23 -0700 |
| commit | 1bbeaf83dd7b5e3628b98bec66ff8fe2646e14aa (patch) | |
| tree | a391eed8ae206613b48e02e56e6ad5c4432d8767 /tools/perf/util/mem-events.c | |
| parent | 63bd30f249dcf0a7ce16967935cecee8feec24bb (diff) | |
| parent | 0f66dfe7b91d2743cc71dfff37af503215b204ef (diff) | |
Merge tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools
Pull perf tools updates from Namhyung Kim:
"perf stat:
- Support new 'cluster' aggregation mode for shared resources
depending on the hardware configuration:
$ sudo perf stat -a --per-cluster -e cycles,instructions sleep 1
Performance counter stats for 'system wide':
S0-D0-CLS0 2 85,051,822 cycles
S0-D0-CLS0 2 73,909,908 instructions # 0.87 insn per cycle
S0-D0-CLS2 2 93,365,918 cycles
S0-D0-CLS2 2 83,006,158 instructions # 0.89 insn per cycle
S0-D0-CLS4 2 104,157,523 cycles
S0-D0-CLS4 2 53,234,396 instructions # 0.51 insn per cycle
S0-D0-CLS6 2 65,891,079 cycles
S0-D0-CLS6 2 41,478,273 instructions # 0.63 insn per cycle
1.002407989 seconds time elapsed
- Various fixes and cleanups for event metrics including NaN handling
perf script:
- Use libcapstone if available to disassemble the instructions. This
enables 'perf script -F disasm' and 'perf script --insn-trace=disasm'
(for Intel-PT):
$ perf script -F event,ip,disasm
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa9839d25 movq %rax, %r14
cycles:P: ffffffffa9cdcaf0 endbr64
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa99c4de5 movq 0x30(%rcx), %r8
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa9907983 movl 0x68(%rbx), %eax
cycles:P: ffffffffa988d428 wrmsr
- Expose sample ID / stream ID to python scripts
perf test:
- Add more perf test cases from Red Hat internal test suites. This
time it adds the base infra and a few perf probe tests. More to
come. :)
- Add 'perf test -p' for parallel execution and fix some issues found
by the parallel test
- Support the symbol test printing symbols in a given (active) module:
$ perf test -F -v Symbols --dso /lib/modules/$(uname -r)/kernel/fs/ext4/ext4.ko
--- start ---
Testing /lib/modules/6.5.13-1rodete2-amd64/kernel/fs/ext4/ext4.ko
Overlapping symbols:
7a990-7a9a0 l __pfx_ext4_exit_fs
7a990-7a9a0 g __pfx_cleanup_module
Overlapping symbols:
7a9a0-7aa1c l ext4_exit_fs
7a9a0-7aa1c g cleanup_module
...
JSON metric updates:
- A new round of Intel metric updates
- Support Power11 PVR (compatible with Power10)
- Fix cache latency events on Zen 4 to set SliceId properly
Internal:
- Fix reference counting for 'map' data structure, tireless work from
Ian!
- More memory optimization for struct thread and annotate histogram.
Now, 'perf report' (TUI) and 'perf annotate' should be much
lighter-weight in terms of memory footprint
- Support cross-arch perf register access. Clean up the build
configuration so that it can detect arch-register support at
runtime. This allows parsing register data in samples that were
recorded on a different arch
Others:
- Sync task state handling in 'perf sched' with the kernel using trace
event fields. The kernel task states have changed, so tools cannot
assume a fixed encoding
- Clean up 'perf mem' to generalize the arch-specific events
- Add support for local and global variables to data type profiling.
This should increase the success rate of type resolution with DWARF
- Add short option -H for --hierarchy in 'perf report' and 'perf top'"
* tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (154 commits)
perf annotate: Add comments in the data structures
perf annotate: Remove sym_hist.addr[] array
perf annotate: Calculate instruction overhead using hashmap
perf annotate: Add a hashmap for symbol histogram
perf threads: Reduce table size from 256 to 8
perf threads: Switch from rbtree to hashmap
perf threads: Move threads to its own files
perf machine: Move machine's threads into its own abstraction
perf machine: Move fprintf to for_each loop and a callback
perf trace: Ignore thread hashing in summary
perf report: Sort child tasks by tid
perf vendor events amd: Fix Zen 4 cache latency events
perf version: Display availability of OpenCSD support
perf vendor events intel: Add umasks/occ_sel to PCU events.
perf map: Fix map reference count issues
libperf evlist: Avoid out-of-bounds access
perf lock contention: Account contending locks too
perf metrics: Fix segv for metrics with no events
perf metrics: Fix metric matching
perf pmu: Fix a potential memory leak in perf_pmu__lookup()
...
Diffstat (limited to 'tools/perf/util/mem-events.c')
| -rw-r--r-- | tools/perf/util/mem-events.c | 217 |
1 file changed, 128 insertions(+), 89 deletions(-)
```diff
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index 3a2e3687878c..637cbd4a7bfb 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -17,51 +17,126 @@
 unsigned int perf_mem_events__loads_ldlat = 30;
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
-static struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
-	E("ldlat-loads",	"cpu/mem-loads,ldlat=%u/P",	"cpu/events/mem-loads"),
-	E("ldlat-stores",	"cpu/mem-stores/P",		"cpu/events/mem-stores"),
-	E(NULL,			NULL,				NULL),
+struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"mem-loads",	true,	0),
+	E("ldlat-stores",	"%s/mem-stores/P",		"mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,		false,	0),
 };
 #undef E
 
 static char mem_loads_name[100];
-static bool mem_loads_name__init;
+static char mem_stores_name[100];
 
-struct perf_mem_event * __weak perf_mem_events__ptr(int i)
+struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i)
 {
-	if (i >= PERF_MEM_EVENTS__MAX)
+	if (i >= PERF_MEM_EVENTS__MAX || !pmu)
 		return NULL;
 
-	return &perf_mem_events[i];
+	return &pmu->mem_events[i];
 }
 
-const char * __weak perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
+static struct perf_pmu *perf_pmus__scan_mem(struct perf_pmu *pmu)
 {
-	struct perf_mem_event *e = perf_mem_events__ptr(i);
+	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
+		if (pmu->mem_events)
+			return pmu;
+	}
+	return NULL;
+}
+
+struct perf_pmu *perf_mem_events_find_pmu(void)
+{
+	/*
+	 * The current perf mem doesn't support per-PMU configuration.
+	 * The exact same configuration is applied to all the
+	 * mem_events supported PMUs.
+	 * Return the first mem_events supported PMU.
+	 *
+	 * Notes: The only case which may support multiple mem_events
+	 * supported PMUs is Intel hybrid. The exact same mem_events
+	 * is shared among the PMUs. Only configure the first PMU
+	 * is good enough as well.
+	 */
+	return perf_pmus__scan_mem(NULL);
+}
+
+/**
+ * perf_pmu__mem_events_num_mem_pmus - Get the number of mem PMUs since the given pmu
+ * @pmu: Start pmu. If it's NULL, search the entire PMU list.
+ */
+int perf_pmu__mem_events_num_mem_pmus(struct perf_pmu *pmu)
+{
+	int num = 0;
+
+	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL)
+		num++;
+
+	return num;
+}
+static const char *perf_pmu__mem_events_name(int i, struct perf_pmu *pmu)
+{
+	struct perf_mem_event *e;
+
+	if (i >= PERF_MEM_EVENTS__MAX || !pmu)
+		return NULL;
+
+	e = &pmu->mem_events[i];
 	if (!e)
 		return NULL;
 
-	if (i == PERF_MEM_EVENTS__LOAD) {
-		if (!mem_loads_name__init) {
-			mem_loads_name__init = true;
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  e->name, perf_mem_events__loads_ldlat);
+	if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE) {
+		if (e->ldlat) {
+			if (!e->aux_event) {
+				/* ARM and Most of Intel */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			} else {
+				/* Intel with mem-loads-aux event */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			}
+		} else {
+			if (!e->aux_event) {
+				/* AMD and POWER */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name);
+			} else
+				return NULL;
 		}
+
 		return mem_loads_name;
 	}
 
-	return e->name;
+	if (i == PERF_MEM_EVENTS__STORE) {
+		scnprintf(mem_stores_name, sizeof(mem_stores_name),
+			  e->name, pmu->name);
+		return mem_stores_name;
+	}
+
+	return NULL;
 }
 
-__weak bool is_mem_loads_aux_event(struct evsel *leader __maybe_unused)
+bool is_mem_loads_aux_event(struct evsel *leader)
 {
-	return false;
+	struct perf_pmu *pmu = leader->pmu;
+	struct perf_mem_event *e;
+
+	if (!pmu || !pmu->mem_events)
+		return false;
+
+	e = &pmu->mem_events[PERF_MEM_EVENTS__LOAD];
+	if (!e->aux_event)
+		return false;
+
+	return leader->core.attr.config == e->aux_event;
 }
 
-int perf_mem_events__parse(const char *str)
+int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str)
 {
 	char *tok, *saveptr = NULL;
 	bool found = false;
@@ -79,7 +154,7 @@ int perf_mem_events__parse(const char *str)
 
 	while (tok) {
 		for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-			struct perf_mem_event *e = perf_mem_events__ptr(j);
+			struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 			if (!e->tag)
 				continue;
@@ -100,19 +175,21 @@ int perf_mem_events__parse(const char *str)
 	return -1;
 }
 
-static bool perf_mem_event__supported(const char *mnt, struct perf_pmu *pmu,
+static bool perf_pmu__mem_events_supported(const char *mnt, struct perf_pmu *pmu,
 				      struct perf_mem_event *e)
 {
-	char sysfs_name[100];
 	char path[PATH_MAX];
 	struct stat st;
 
-	scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name);
-	scnprintf(path, PATH_MAX, "%s/devices/%s", mnt, sysfs_name);
+	if (!e->event_name)
+		return true;
+
+	scnprintf(path, PATH_MAX, "%s/devices/%s/events/%s", mnt, pmu->name, e->event_name);
+
 	return !stat(path, &st);
 }
 
-int perf_mem_events__init(void)
+int perf_pmu__mem_events_init(struct perf_pmu *pmu)
 {
 	const char *mnt = sysfs__mount();
 	bool found = false;
@@ -122,8 +199,7 @@ int perf_mem_events__init(void)
 		return -ENOENT;
 
 	for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		struct perf_mem_event *e = perf_mem_events__ptr(j);
-		struct perf_pmu *pmu = NULL;
+		struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 		/*
 		 * If the event entry isn't valid, skip initialization
@@ -132,103 +208,66 @@
 		if (!e->tag)
 			continue;
 
-		/*
-		 * Scan all PMUs not just core ones, since perf mem/c2c on
-		 * platforms like AMD uses IBS OP PMU which is independent
-		 * of core PMU.
-		 */
-		while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-			e->supported |= perf_mem_event__supported(mnt, pmu, e);
-			if (e->supported) {
-				found = true;
-				break;
-			}
-		}
+		e->supported |= perf_pmu__mem_events_supported(mnt, pmu, e);
+		if (e->supported)
+			found = true;
 	}
 
 	return found ? 0 : -ENOENT;
 }
 
-void perf_mem_events__list(void)
+void perf_pmu__mem_events_list(struct perf_pmu *pmu)
 {
 	int j;
 
 	for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		struct perf_mem_event *e = perf_mem_events__ptr(j);
+		struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 		fprintf(stderr, "%-*s%-*s%s",
 			e->tag ? 13 : 0,
 			e->tag ? : "",
 			e->tag && verbose > 0 ? 25 : 0,
-			e->tag && verbose > 0 ? perf_mem_events__name(j, NULL) : "",
+			e->tag && verbose > 0 ? perf_pmu__mem_events_name(j, pmu) : "",
 			e->supported ? ": available\n" : "");
 	}
 }
 
-static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e,
-						    int idx)
+int perf_mem_events__record_args(const char **rec_argv, int *argv_nr)
 {
 	const char *mnt = sysfs__mount();
 	struct perf_pmu *pmu = NULL;
-
-	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-		if (!perf_mem_event__supported(mnt, pmu, e)) {
-			pr_err("failed: event '%s' not supported\n",
-			       perf_mem_events__name(idx, pmu->name));
-		}
-	}
-}
-
-int perf_mem_events__record_args(const char **rec_argv, int *argv_nr,
-				 char **rec_tmp, int *tmp_nr)
-{
-	const char *mnt = sysfs__mount();
-	int i = *argv_nr, k = 0;
 	struct perf_mem_event *e;
+	int i = *argv_nr;
+	const char *s;
+	char *copy;
 
-	for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		e = perf_mem_events__ptr(j);
-		if (!e->record)
-			continue;
+	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL) {
+		for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+			e = perf_pmu__mem_events_ptr(pmu, j);
+
+			if (!e->record)
+				continue;
 
-		if (perf_pmus__num_mem_pmus() == 1) {
 			if (!e->supported) {
 				pr_err("failed: event '%s' not supported\n",
-				       perf_mem_events__name(j, NULL));
+				       perf_pmu__mem_events_name(j, pmu));
 				return -1;
 			}
 
-			rec_argv[i++] = "-e";
-			rec_argv[i++] = perf_mem_events__name(j, NULL);
-		} else {
-			struct perf_pmu *pmu = NULL;
+			s = perf_pmu__mem_events_name(j, pmu);
+			if (!s || !perf_pmu__mem_events_supported(mnt, pmu, e))
+				continue;
 
-			if (!e->supported) {
-				perf_mem_events__print_unsupport_hybrid(e, j);
+			copy = strdup(s);
+			if (!copy)
 				return -1;
-			}
-
-			while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-				const char *s = perf_mem_events__name(j, pmu->name);
-
-				if (!perf_mem_event__supported(mnt, pmu, e))
-					continue;
 
-				rec_argv[i++] = "-e";
-				if (s) {
-					char *copy = strdup(s);
-					if (!copy)
-						return -1;
-
-					rec_argv[i++] = copy;
-					rec_tmp[k++] = copy;
-				}
-			}
+			rec_argv[i++] = "-e";
+			rec_argv[i++] = copy;
 		}
 	}
 
 	*argv_nr = i;
-	*tmp_nr = k;
 	return 0;
 }
```
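The heart of the reworked support check above is a plain existence test on a sysfs path assembled from the PMU name and the event name. The short sketch below mirrors that probe as a standalone program outside the perf tree; the hard-coded /sys mount point, the "cpu" PMU and the generic "mem-loads"/"mem-stores" event names are illustrative assumptions only (perf itself resolves the mount via sysfs__mount() and takes the names from each PMU's mem_events table).

```c
/*
 * Standalone sketch of the sysfs probe performed by the reworked
 * perf_pmu__mem_events_supported(): an event is treated as supported
 * when /sys/devices/<pmu>/events/<event> exists.
 *
 * Assumptions for illustration only: sysfs mounted at /sys, the "cpu"
 * PMU, and the generic "mem-loads"/"mem-stores" event names.
 */
#include <stdio.h>
#include <sys/stat.h>

static int mem_event_supported(const char *pmu, const char *event)
{
	char path[4096];
	struct stat st;

	/* Same path layout the patch builds with scnprintf(). */
	snprintf(path, sizeof(path), "/sys/devices/%s/events/%s", pmu, event);
	return stat(path, &st) == 0;
}

int main(void)
{
	const char *events[] = { "mem-loads", "mem-stores" };

	for (unsigned int i = 0; i < sizeof(events) / sizeof(events[0]); i++)
		printf("%-10s: %s\n", events[i],
		       mem_event_supported("cpu", events[i]) ? "available" : "not found");

	return 0;
}
```

Built with a plain `cc mem-event-check.c` (a hypothetical file name), this reports whether the generic load/store events are exposed by the "cpu" PMU on the running machine. On platforms whose memory events live on a separate PMU (the comment removed above cites IBS on AMD), the "cpu" probe fails, which is why the record path now iterates over every PMU that carries a mem_events table instead of scanning only core PMUs.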