| author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-14 16:31:23 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-14 16:31:23 -0700 |
| commit | 1bbeaf83dd7b5e3628b98bec66ff8fe2646e14aa (patch) | |
| tree | a391eed8ae206613b48e02e56e6ad5c4432d8767 /tools/perf/util/mem-events.c | |
| parent | 63bd30f249dcf0a7ce16967935cecee8feec24bb (diff) | |
| parent | 0f66dfe7b91d2743cc71dfff37af503215b204ef (diff) | |
Merge tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools
Pull perf tools updates from Namhyung Kim:
"perf stat:
- Support new 'cluster' aggregation mode for shared resources
depending on the hardware configuration:
$ sudo perf stat -a --per-cluster -e cycles,instructions sleep 1
Performance counter stats for 'system wide':
S0-D0-CLS0 2 85,051,822 cycles
S0-D0-CLS0 2 73,909,908 instructions # 0.87 insn per cycle
S0-D0-CLS2 2 93,365,918 cycles
S0-D0-CLS2 2 83,006,158 instructions # 0.89 insn per cycle
S0-D0-CLS4 2 104,157,523 cycles
S0-D0-CLS4 2 53,234,396 instructions # 0.51 insn per cycle
S0-D0-CLS6 2 65,891,079 cycles
S0-D0-CLS6 2 41,478,273 instructions # 0.63 insn per cycle
1.002407989 seconds time elapsed
- Various fixes and cleanups for event metrics including NaN handling
perf script:
- Use libcapstone if available to disassemble the instructions. This
enables 'perf script -F disasm' and 'perf script --insn-trace=disasm'
(for Intel-PT):
$ perf script -F event,ip,disasm
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa9839d25 movq %rax, %r14
cycles:P: ffffffffa9cdcaf0 endbr64
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa99c4de5 movq 0x30(%rcx), %r8
cycles:P: ffffffffa988d428 wrmsr
cycles:P: ffffffffaa401f86 iretq
cycles:P: ffffffffa9907983 movl 0x68(%rbx), %eax
cycles:P: ffffffffa988d428 wrmsr
- Expose sample ID / stream ID to python scripts
perf test:
- Add more perf test cases from Red Hat internal test suites. This
time it adds the base infra and a few perf probe tests. More to
come. :)
- Add 'perf test -p' for parallel execution and fix some issues found
by the parallel test
- Support the symbol test printing symbols in a given (active) module:
$ perf test -F -v Symbols --dso /lib/modules/$(uname -r)/kernel/fs/ext4/ext4.ko
--- start ---
Testing /lib/modules/6.5.13-1rodete2-amd64/kernel/fs/ext4/ext4.ko
Overlapping symbols:
7a990-7a9a0 l __pfx_ext4_exit_fs
7a990-7a9a0 g __pfx_cleanup_module
Overlapping symbols:
7a9a0-7aa1c l ext4_exit_fs
7a9a0-7aa1c g cleanup_module
...
JSON metric updates:
- A new round of Intel metric updates
- Support Power11 PVR (compatible with Power10)
- Fix cache latency events on Zen 4 to set SliceId properly
Internal:
- Fix reference counting for 'map' data structure, tireless work from
Ian!
- More memory optimization for struct thread and annotate histogram.
Now, 'perf report' (TUI) and 'perf annotate' should be much
lighter-weight in terms of memory footprint
- Support cross-arch perf register access. Clean up the build
configuration so that it can detect arch-register support at
runtime. This allows parsing register data in samples that were
recorded on a different arch
Others:
- Sync task state handling in 'perf sched' with the kernel using trace
event fields. The kernel task states have changed, so tools cannot
assume a fixed encoding
- Clean up 'perf mem' to generalize the arch-specific events
- Add support for local and global variables to data type profiling.
This should increase the success rate of type resolution with DWARF
- Add short option -H for --hierarchy in 'perf report' and 'perf top'"
* tag 'perf-tools-for-v6.9-2024-03-13' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (154 commits)
perf annotate: Add comments in the data structures
perf annotate: Remove sym_hist.addr[] array
perf annotate: Calculate instruction overhead using hashmap
perf annotate: Add a hashmap for symbol histogram
perf threads: Reduce table size from 256 to 8
perf threads: Switch from rbtree to hashmap
perf threads: Move threads to its own files
perf machine: Move machine's threads into its own abstraction
perf machine: Move fprintf to for_each loop and a callback
perf trace: Ignore thread hashing in summary
perf report: Sort child tasks by tid
perf vendor events amd: Fix Zen 4 cache latency events
perf version: Display availability of OpenCSD support
perf vendor events intel: Add umasks/occ_sel to PCU events.
perf map: Fix map reference count issues
libperf evlist: Avoid out-of-bounds access
perf lock contention: Account contending locks too
perf metrics: Fix segv for metrics with no events
perf metrics: Fix metric matching
perf pmu: Fix a potential memory leak in perf_pmu__lookup()
...
Diffstat (limited to 'tools/perf/util/mem-events.c')
| -rw-r--r-- | tools/perf/util/mem-events.c | 217 |
1 file changed, 128 insertions(+), 89 deletions(-)
```diff
diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c
index 3a2e3687878c..637cbd4a7bfb 100644
--- a/tools/perf/util/mem-events.c
+++ b/tools/perf/util/mem-events.c
@@ -17,51 +17,126 @@
 unsigned int perf_mem_events__loads_ldlat = 30;
 
-#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
+#define E(t, n, s, l, a) { .tag = t, .name = n, .event_name = s, .ldlat = l, .aux_event = a }
 
-static struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
-	E("ldlat-loads",	"cpu/mem-loads,ldlat=%u/P",	"cpu/events/mem-loads"),
-	E("ldlat-stores",	"cpu/mem-stores/P",		"cpu/events/mem-stores"),
-	E(NULL,			NULL,				NULL),
+struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
+	E("ldlat-loads",	"%s/mem-loads,ldlat=%u/P",	"mem-loads",	true,	0),
+	E("ldlat-stores",	"%s/mem-stores/P",		"mem-stores",	false,	0),
+	E(NULL,			NULL,				NULL,		false,	0),
 };
 #undef E
 
 static char mem_loads_name[100];
-static bool mem_loads_name__init;
+static char mem_stores_name[100];
 
-struct perf_mem_event * __weak perf_mem_events__ptr(int i)
+struct perf_mem_event *perf_pmu__mem_events_ptr(struct perf_pmu *pmu, int i)
 {
-	if (i >= PERF_MEM_EVENTS__MAX)
+	if (i >= PERF_MEM_EVENTS__MAX || !pmu)
 		return NULL;
 
-	return &perf_mem_events[i];
+	return &pmu->mem_events[i];
 }
 
-const char * __weak perf_mem_events__name(int i, const char *pmu_name __maybe_unused)
+static struct perf_pmu *perf_pmus__scan_mem(struct perf_pmu *pmu)
 {
-	struct perf_mem_event *e = perf_mem_events__ptr(i);
+	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
+		if (pmu->mem_events)
+			return pmu;
+	}
+	return NULL;
+}
+
+struct perf_pmu *perf_mem_events_find_pmu(void)
+{
+	/*
+	 * The current perf mem doesn't support per-PMU configuration.
+	 * The exact same configuration is applied to all the
+	 * mem_events supported PMUs.
+	 * Return the first mem_events supported PMU.
+	 *
+	 * Notes: The only case which may support multiple mem_events
+	 * supported PMUs is Intel hybrid. The exact same mem_events
+	 * is shared among the PMUs. Only configure the first PMU
+	 * is good enough as well.
+	 */
+	return perf_pmus__scan_mem(NULL);
+}
+
+/**
+ * perf_pmu__mem_events_num_mem_pmus - Get the number of mem PMUs since the given pmu
+ * @pmu: Start pmu. If it's NULL, search the entire PMU list.
+ */
+int perf_pmu__mem_events_num_mem_pmus(struct perf_pmu *pmu)
+{
+	int num = 0;
+
+	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL)
+		num++;
+
+	return num;
+}
+static const char *perf_pmu__mem_events_name(int i, struct perf_pmu *pmu)
+{
+	struct perf_mem_event *e;
+
+	if (i >= PERF_MEM_EVENTS__MAX || !pmu)
+		return NULL;
+
+	e = &pmu->mem_events[i];
 	if (!e)
 		return NULL;
 
-	if (i == PERF_MEM_EVENTS__LOAD) {
-		if (!mem_loads_name__init) {
-			mem_loads_name__init = true;
-			scnprintf(mem_loads_name, sizeof(mem_loads_name),
-				  e->name, perf_mem_events__loads_ldlat);
+	if (i == PERF_MEM_EVENTS__LOAD || i == PERF_MEM_EVENTS__LOAD_STORE) {
+		if (e->ldlat) {
+			if (!e->aux_event) {
+				/* ARM and Most of Intel */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			} else {
+				/* Intel with mem-loads-aux event */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name, pmu->name,
+					  perf_mem_events__loads_ldlat);
+			}
+		} else {
+			if (!e->aux_event) {
+				/* AMD and POWER */
+				scnprintf(mem_loads_name, sizeof(mem_loads_name),
+					  e->name, pmu->name);
+			} else
+				return NULL;
 		}
+
 		return mem_loads_name;
 	}
 
-	return e->name;
+	if (i == PERF_MEM_EVENTS__STORE) {
+		scnprintf(mem_stores_name, sizeof(mem_stores_name),
+			  e->name, pmu->name);
+		return mem_stores_name;
+	}
+
+	return NULL;
 }
 
-__weak bool is_mem_loads_aux_event(struct evsel *leader __maybe_unused)
+bool is_mem_loads_aux_event(struct evsel *leader)
 {
-	return false;
+	struct perf_pmu *pmu = leader->pmu;
+	struct perf_mem_event *e;
+
+	if (!pmu || !pmu->mem_events)
+		return false;
+
+	e = &pmu->mem_events[PERF_MEM_EVENTS__LOAD];
+	if (!e->aux_event)
+		return false;
+
+	return leader->core.attr.config == e->aux_event;
 }
 
-int perf_mem_events__parse(const char *str)
+int perf_pmu__mem_events_parse(struct perf_pmu *pmu, const char *str)
 {
 	char *tok, *saveptr = NULL;
 	bool found = false;
@@ -79,7 +154,7 @@ int perf_mem_events__parse(const char *str)
 
 	while (tok) {
 		for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-			struct perf_mem_event *e = perf_mem_events__ptr(j);
+			struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 			if (!e->tag)
 				continue;
@@ -100,19 +175,21 @@ int perf_mem_events__parse(const char *str)
 	return -1;
 }
 
-static bool perf_mem_event__supported(const char *mnt, struct perf_pmu *pmu,
+static bool perf_pmu__mem_events_supported(const char *mnt, struct perf_pmu *pmu,
 				      struct perf_mem_event *e)
 {
-	char sysfs_name[100];
 	char path[PATH_MAX];
 	struct stat st;
 
-	scnprintf(sysfs_name, sizeof(sysfs_name), e->sysfs_name, pmu->name);
-	scnprintf(path, PATH_MAX, "%s/devices/%s", mnt, sysfs_name);
+	if (!e->event_name)
+		return true;
+
+	scnprintf(path, PATH_MAX, "%s/devices/%s/events/%s", mnt, pmu->name, e->event_name);
+
 	return !stat(path, &st);
 }
 
-int perf_mem_events__init(void)
+int perf_pmu__mem_events_init(struct perf_pmu *pmu)
 {
 	const char *mnt = sysfs__mount();
 	bool found = false;
@@ -122,8 +199,7 @@ int perf_mem_events__init(void)
 		return -ENOENT;
 
 	for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		struct perf_mem_event *e = perf_mem_events__ptr(j);
-		struct perf_pmu *pmu = NULL;
+		struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 		/*
 		 * If the event entry isn't valid, skip initialization
@@ -132,103 +208,66 @@
 		if (!e->tag)
 			continue;
 
-		/*
-		 * Scan all PMUs not just core ones, since perf mem/c2c on
-		 * platforms like AMD uses IBS OP PMU which is independent
-		 * of core PMU.
-		 */
-		while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-			e->supported |= perf_mem_event__supported(mnt, pmu, e);
-			if (e->supported) {
-				found = true;
-				break;
-			}
-		}
+		e->supported |= perf_pmu__mem_events_supported(mnt, pmu, e);
+		if (e->supported)
+			found = true;
 	}
 
 	return found ? 0 : -ENOENT;
 }
 
-void perf_mem_events__list(void)
+void perf_pmu__mem_events_list(struct perf_pmu *pmu)
 {
 	int j;
 
 	for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		struct perf_mem_event *e = perf_mem_events__ptr(j);
+		struct perf_mem_event *e = perf_pmu__mem_events_ptr(pmu, j);
 
 		fprintf(stderr, "%-*s%-*s%s",
 			e->tag ? 13 : 0,
 			e->tag ? : "",
 			e->tag && verbose > 0 ? 25 : 0,
-			e->tag && verbose > 0 ? perf_mem_events__name(j, NULL) : "",
+			e->tag && verbose > 0 ? perf_pmu__mem_events_name(j, pmu) : "",
 			e->supported ? ": available\n" : "");
 	}
 }
 
-static void perf_mem_events__print_unsupport_hybrid(struct perf_mem_event *e,
-						    int idx)
+int perf_mem_events__record_args(const char **rec_argv, int *argv_nr)
 {
 	const char *mnt = sysfs__mount();
 	struct perf_pmu *pmu = NULL;
-
-	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-		if (!perf_mem_event__supported(mnt, pmu, e)) {
-			pr_err("failed: event '%s' not supported\n",
-			       perf_mem_events__name(idx, pmu->name));
-		}
-	}
-}
-
-int perf_mem_events__record_args(const char **rec_argv, int *argv_nr,
-				 char **rec_tmp, int *tmp_nr)
-{
-	const char *mnt = sysfs__mount();
-	int i = *argv_nr, k = 0;
 	struct perf_mem_event *e;
+	int i = *argv_nr;
+	const char *s;
+	char *copy;
 
-	for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
-		e = perf_mem_events__ptr(j);
-		if (!e->record)
-			continue;
+	while ((pmu = perf_pmus__scan_mem(pmu)) != NULL) {
+		for (int j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
+			e = perf_pmu__mem_events_ptr(pmu, j);
+
+			if (!e->record)
+				continue;
 
-		if (perf_pmus__num_mem_pmus() == 1) {
 			if (!e->supported) {
 				pr_err("failed: event '%s' not supported\n",
-				       perf_mem_events__name(j, NULL));
+				       perf_pmu__mem_events_name(j, pmu));
 				return -1;
 			}
 
-			rec_argv[i++] = "-e";
-			rec_argv[i++] = perf_mem_events__name(j, NULL);
-		} else {
-			struct perf_pmu *pmu = NULL;
+			s = perf_pmu__mem_events_name(j, pmu);
+			if (!s || !perf_pmu__mem_events_supported(mnt, pmu, e))
+				continue;
 
-			if (!e->supported) {
-				perf_mem_events__print_unsupport_hybrid(e, j);
+			copy = strdup(s);
+			if (!copy)
 				return -1;
-			}
-
-			while ((pmu = perf_pmus__scan(pmu)) != NULL) {
-				const char *s = perf_mem_events__name(j, pmu->name);
-
-				if (!perf_mem_event__supported(mnt, pmu, e))
-					continue;
 
-				rec_argv[i++] = "-e";
-				if (s) {
-					char *copy = strdup(s);
-					if (!copy)
-						return -1;
-
-					rec_argv[i++] = copy;
-					rec_tmp[k++] = copy;
-				}
-			}
+			rec_argv[i++] = "-e";
+			rec_argv[i++] = copy;
 		}
 	}
 
 	*argv_nr = i;
-	*tmp_nr = k;
 	return 0;
 }
```
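The heart of the reworked support check above is a plain existence test on a sysfs path assembled from the PMU name and the event name. The short sketch below mirrors that probe as a standalone program outside the perf tree; the hard-coded /sys mount point, the "cpu" PMU and the generic "mem-loads"/"mem-stores" event names are illustrative assumptions only (perf itself resolves the mount via sysfs__mount() and takes the names from each PMU's mem_events table).

```c
/*
 * Standalone sketch of the sysfs probe performed by the reworked
 * perf_pmu__mem_events_supported(): an event is treated as supported
 * when /sys/devices/<pmu>/events/<event> exists.
 *
 * Assumptions for illustration only: sysfs mounted at /sys, the "cpu"
 * PMU, and the generic "mem-loads"/"mem-stores" event names.
 */
#include <stdio.h>
#include <sys/stat.h>

static int mem_event_supported(const char *pmu, const char *event)
{
	char path[4096];
	struct stat st;

	/* Same path layout the patch builds with scnprintf(). */
	snprintf(path, sizeof(path), "/sys/devices/%s/events/%s", pmu, event);
	return stat(path, &st) == 0;
}

int main(void)
{
	const char *events[] = { "mem-loads", "mem-stores" };

	for (unsigned int i = 0; i < sizeof(events) / sizeof(events[0]); i++)
		printf("%-10s: %s\n", events[i],
		       mem_event_supported("cpu", events[i]) ? "available" : "not found");

	return 0;
}
```

Built with a plain `cc mem-event-check.c` (a hypothetical file name), this reports whether the generic load/store events are exposed by the "cpu" PMU on the running machine. On platforms whose memory events live on a separate PMU (the comment removed above cites IBS on AMD), the "cpu" probe fails, which is why the record path now iterates over every PMU that carries a mem_events table instead of scanning only core PMUs.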