linux-kernel - Re: [PATCH v2] perf trace: Implement syscall summary in BPF

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAP-5=fWW=9WboQ0_MJx1pYeUTNSC0FNmyeTzw40+Q-mw+TreeA@mail.gmail.com>
Date: Mon, 17 Mar 2025 11:37:22 -0700
From: Ian Rogers <irogers@...gle.com>
To: Namhyung Kim <namhyung@...nel.org>
Cc: Arnaldo Carvalho de Melo <acme@...nel.org>, Kan Liang <kan.liang@...ux.intel.com>, 
	Jiri Olsa <jolsa@...nel.org>, Adrian Hunter <adrian.hunter@...el.com>, 
	Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>, 
	LKML <linux-kernel@...r.kernel.org>, linux-perf-users@...r.kernel.org, 
	bpf@...r.kernel.org, Song Liu <song@...nel.org>, Howard Chu <howardchu95@...il.com>
Subject: Re: [PATCH v2] perf trace: Implement syscall summary in BPF

On Mon, Mar 17, 2025 at 11:08 AM Namhyung Kim <namhyung@...nel.org> wrote:
>
> When -s/--summary option is used, it doesn't need (augmented) arguments
> of syscalls.  Let's skip the augmentation and load another small BPF
> program to collect the statistics in the kernel instead of copying the
> data to the ring-buffer to calculate the stats in userspace.  This will
> be much more light-weight than the existing approach and remove any lost
> events.
>
> Let's add a new option --bpf-summary to control this behavior.  I cannot
> make it default because there's no way to get e_machine in the BPF which
> is needed for detecting different ABIs like 32-bit compat mode.
>
> No functional changes intended except for no more LOST events. :)
>
>   $ sudo perf trace -as --bpf-summary --summary-mode=total -- sleep 1
>
>    Summary of events:
>
>    total, 2824 events
>
>      syscall            calls  errors  total       min       avg       max       stddev
>                                        (msec)    (msec)    (msec)    (msec)        (%)
>      --------------- --------  ------ -------- --------- --------- ---------     ------
>      futex                372     18  4373.773     0.000    11.757   997.715    660.42%
>      poll                 241      0  2757.963     0.000    11.444   997.758    580.34%
>      epoll_wait           161      0  2460.854     0.000    15.285   325.189    260.73%
>      ppoll                 19      0  1298.652     0.000    68.350   667.172    281.46%
>      clock_nanosleep        1      0  1000.093     0.000  1000.093  1000.093      0.00%
>      epoll_pwait           16      0   192.787     0.000    12.049   173.994    348.73%
>      nanosleep              6      0    50.926     0.000     8.488    10.210     43.96%
>      ...
>
> Cc: Howard Chu <howardchu95@...il.com>
> Signed-off-by: Namhyung Kim <namhyung@...nel.org>
> ---
> v2)
>  * rebased on top of Ian's e_machine changes
>  * add --bpf-summary option
>  * support per-thread summary
>  * add stddev calculation  (Howard)
>
>  tools/perf/Documentation/perf-trace.txt       |   6 +
>  tools/perf/Makefile.perf                      |   2 +-
>  tools/perf/builtin-trace.c                    |  43 ++-
>  tools/perf/util/Build                         |   1 +
>  tools/perf/util/bpf-trace-summary.c           | 334 ++++++++++++++++++
>  .../perf/util/bpf_skel/syscall_summary.bpf.c  | 118 +++++++
>  tools/perf/util/bpf_skel/syscall_summary.h    |  25 ++
>  tools/perf/util/trace.h                       |  37 ++
>  8 files changed, 553 insertions(+), 13 deletions(-)
>  create mode 100644 tools/perf/util/bpf-trace-summary.c
>  create mode 100644 tools/perf/util/bpf_skel/syscall_summary.bpf.c
>  create mode 100644 tools/perf/util/bpf_skel/syscall_summary.h
>  create mode 100644 tools/perf/util/trace.h
>
> diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
> index 887dc37773d0f4d6..a8a0d8c33438fef7 100644
> --- a/tools/perf/Documentation/perf-trace.txt
> +++ b/tools/perf/Documentation/perf-trace.txt
> @@ -251,6 +251,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
>         pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
>         better pretty-print integer flags and struct pointers.
>
> +--bpf-summary::
> +       Collect system call statistics in BPF.  This is only for live mode and
> +       works well with -s/--summary option where no argument information is
> +       required.
> +
> +
>  PAGEFAULTS
>  ----------
>
> diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
> index f3cd8de15d1a2681..d7a7e0c68fc10b8b 100644
> --- a/tools/perf/Makefile.perf
> +++ b/tools/perf/Makefile.perf
> @@ -1206,7 +1206,7 @@ SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
>  SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
>  SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
>  SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
> -SKELETONS += $(SKEL_OUT)/kwork_top.skel.h
> +SKELETONS += $(SKEL_OUT)/kwork_top.skel.h $(SKEL_OUT)/syscall_summary.skel.h
>  SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
>  SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
>
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index 7fef59306db2891f..deeb7250e8c52354 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -55,6 +55,7 @@
>  #include "util/thread_map.h"
>  #include "util/stat.h"
>  #include "util/tool.h"
> +#include "util/trace.h"
>  #include "util/util.h"
>  #include "trace/beauty/beauty.h"
>  #include "trace-event.h"
> @@ -141,12 +142,6 @@ struct syscall_fmt {
>         bool       hexret;
>  };
>
> -enum summary_mode {
> -       SUMMARY__NONE = 0,
> -       SUMMARY__BY_TOTAL,
> -       SUMMARY__BY_THREAD,
> -};
> -
>  struct trace {
>         struct perf_tool        tool;
>         struct {
> @@ -205,7 +200,7 @@ struct trace {
>         } stats;
>         unsigned int            max_stack;
>         unsigned int            min_stack;
> -       enum summary_mode       summary_mode;
> +       enum trace_summary_mode summary_mode;
>         int                     raw_augmented_syscalls_args_size;
>         bool                    raw_augmented_syscalls;
>         bool                    fd_path_disabled;
> @@ -234,6 +229,7 @@ struct trace {
>         bool                    force;
>         bool                    vfs_getname;
>         bool                    force_btf;
> +       bool                    summary_bpf;
>         int                     trace_pgfaults;
>         char                    *perfconfig_events;
>         struct {
> @@ -4356,6 +4352,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>
>         trace->live = true;
>
> +       if (trace->summary_bpf) {
> +               if (trace_prepare_bpf_summary(trace->summary_mode) < 0)
> +                       goto out_delete_evlist;
> +
> +               goto create_maps;
> +       }
> +
>         if (!trace->raw_augmented_syscalls) {
>                 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
>                         goto out_error_raw_syscalls;
> @@ -4414,6 +4417,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>         if (trace->cgroup)
>                 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
>
> +create_maps:
>         err = evlist__create_maps(evlist, &trace->opts.target);
>         if (err < 0) {
>                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
> @@ -4426,7 +4430,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>                 goto out_delete_evlist;
>         }
>
> -       if (trace->summary_mode == SUMMARY__BY_TOTAL) {
> +       if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
>                 trace->syscall_stats = alloc_syscall_stats();
>                 if (trace->syscall_stats == NULL)
>                         goto out_delete_evlist;
> @@ -4512,9 +4516,11 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>         if (err < 0)
>                 goto out_error_apply_filters;
>
> -       err = evlist__mmap(evlist, trace->opts.mmap_pages);
> -       if (err < 0)
> -               goto out_error_mmap;
> +       if (!trace->summary_bpf) {
> +               err = evlist__mmap(evlist, trace->opts.mmap_pages);
> +               if (err < 0)
> +                       goto out_error_mmap;
> +       }
>
>         if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
>                 evlist__enable(evlist);
> @@ -4527,6 +4533,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>                 evlist__enable(evlist);
>         }
>
> +       if (trace->summary_bpf)
> +               trace_start_bpf_summary();
> +
>         trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 ||
>                 perf_thread_map__nr(evlist->core.threads) > 1 ||
>                 evlist__first(evlist)->core.attr.inherit;
> @@ -4594,12 +4603,17 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>
>         evlist__disable(evlist);
>
> +       if (trace->summary_bpf)
> +               trace_end_bpf_summary();
> +
>         if (trace->sort_events)
>                 ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
>
>         if (!err) {
>                 if (trace->summary) {
> -                       if (trace->summary_mode == SUMMARY__BY_TOTAL)
> +                       if (trace->summary_bpf)
> +                               trace_print_bpf_summary(trace->output);
> +                       else if (trace->summary_mode == SUMMARY__BY_TOTAL)
>                                 trace__fprintf_total_summary(trace, trace->output);
>                         else
>                                 trace__fprintf_thread_summary(trace, trace->output);
> @@ -4615,6 +4629,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
>         }
>
>  out_delete_evlist:
> +       trace_cleanup_bpf_summary();
>         delete_syscall_stats(trace->syscall_stats);
>         trace__symbols__exit(trace);
>         evlist__free_syscall_tp_fields(evlist);
> @@ -5444,6 +5459,7 @@ int cmd_trace(int argc, const char **argv)
>                      "start"),
>         OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer"
>                        "to customized ones"),
> +       OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"),
>         OPTS_EVSWITCH(&trace.evswitch),
>         OPT_END()
>         };
> @@ -5535,6 +5551,9 @@ int cmd_trace(int argc, const char **argv)
>                 goto skip_augmentation;
>         }
>
> +       if (trace.summary_only && trace.summary_bpf)
> +               goto skip_augmentation;
> +
>         trace.skel = augmented_raw_syscalls_bpf__open();
>         if (!trace.skel) {
>                 pr_debug("Failed to open augmented syscalls BPF skeleton");
> diff --git a/tools/perf/util/Build b/tools/perf/util/Build
> index 034a6603d5a8e8b0..ba4201a6f3c69753 100644
> --- a/tools/perf/util/Build
> +++ b/tools/perf/util/Build
> @@ -171,6 +171,7 @@ perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-flex.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-filter-bison.o
> +perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf-trace-summary.o
>  perf-util-$(CONFIG_PERF_BPF_SKEL) += btf.o
>
>  ifeq ($(CONFIG_LIBTRACEEVENT),y)
> diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c
> new file mode 100644
> index 0000000000000000..5ae9feca244d5b22
> --- /dev/null
> +++ b/tools/perf/util/bpf-trace-summary.c
> @@ -0,0 +1,334 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <inttypes.h>
> +#include <math.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +#include "dwarf-regs.h" /* for EM_HOST */
> +#include "syscalltbl.h"
> +#include "util/hashmap.h"
> +#include "util/trace.h"
> +#include "util/util.h"
> +#include <bpf/bpf.h>
> +#include <linux/time64.h>
> +#include <tools/libc_compat.h> /* reallocarray */
> +
> +#include "bpf_skel/syscall_summary.h"
> +#include "bpf_skel/syscall_summary.skel.h"
> +
> +
> +static struct syscall_summary_bpf *skel;
> +
> +int trace_prepare_bpf_summary(enum trace_summary_mode mode)
> +{
> +       skel = syscall_summary_bpf__open();
> +       if (skel == NULL) {
> +               fprintf(stderr, "failed to open syscall summary bpf skeleton\n");
> +               return -1;
> +       }
> +
> +       if (mode == SUMMARY__BY_THREAD)
> +               skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
> +       else
> +               skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
> +
> +       if (syscall_summary_bpf__load(skel) < 0) {
> +               fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
> +               return -1;
> +       }
> +
> +       if (syscall_summary_bpf__attach(skel) < 0) {
> +               fprintf(stderr, "failed to attach syscall summary bpf skeleton\n");
> +               return -1;
> +       }
> +
> +       return 0;
> +}
> +
> +void trace_start_bpf_summary(void)
> +{
> +       skel->bss->enabled = 1;
> +}
> +
> +void trace_end_bpf_summary(void)
> +{
> +       skel->bss->enabled = 0;
> +}
> +
> +struct syscall_node {
> +       int syscall_nr;
> +       struct syscall_stats stats;
> +};
> +
> +static double rel_stddev(struct syscall_stats *stat)
> +{
> +       double variance, average;
> +
> +       if (stat->count < 2)
> +               return 0;
> +
> +       average = (double)stat->total_time / stat->count;
> +
> +       variance = stat->squared_sum;
> +       variance -= (stat->total_time * stat->total_time) / stat->count;
> +       variance /= stat->count;
> +
> +       return 100 * sqrt(variance) / average;
> +}
> +
> +struct syscall_data {
> +       int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
> +       int nr_events;
> +       int nr_nodes;
> +       u64 total_time;
> +       struct syscall_node *nodes;
> +};
> +
> +static int datacmp(const void *a, const void *b)
> +{
> +       const struct syscall_data * const *sa = a;
> +       const struct syscall_data * const *sb = b;
> +
> +       return (*sa)->total_time > (*sb)->total_time ? -1 : 1;
> +}
> +
> +static int nodecmp(const void *a, const void *b)
> +{
> +       const struct syscall_node *na = a;
> +       const struct syscall_node *nb = b;
> +
> +       return na->stats.total_time > nb->stats.total_time ? -1 : 1;
> +}
> +
> +static size_t sc_node_hash(long key, void *ctx __maybe_unused)
> +{
> +       return key;
> +}
> +
> +static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused)
> +{
> +       return key1 == key2;
> +}
> +
> +static int print_common_stats(struct syscall_data *data, FILE *fp)
> +{
> +       int printed = 0;
> +
> +       for (int i = 0; i < data->nr_nodes; i++) {
> +               struct syscall_node *node = &data->nodes[i];
> +               struct syscall_stats *stat = &node->stats;
> +               double total = (double)(stat->total_time) / NSEC_PER_MSEC;
> +               double min = (double)(stat->min_time) / NSEC_PER_MSEC;
> +               double max = (double)(stat->max_time) / NSEC_PER_MSEC;
> +               double avg = total / stat->count;
> +               const char *name;
> +
> +               /* TODO: support other ABIs */
> +               name = syscalltbl__name(EM_HOST, node->syscall_nr);
> +               if (name)
> +                       printed += fprintf(fp, "   %-15s", name);
> +               else
> +                       printed += fprintf(fp, "   syscall:%-7d", node->syscall_nr);
> +
> +               printed += fprintf(fp, " %8u %6u %9.3f %9.3f %9.3f %9.3f %9.2f%%\n",
> +                                  stat->count, stat->error, total, min, avg, max,
> +                                  rel_stddev(stat));
> +       }
> +       return printed;
> +}
> +
> +static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key,
> +                              struct syscall_stats *map_data)
> +{
> +       struct syscall_data *data;
> +       struct syscall_node *nodes;
> +
> +       if (!hashmap__find(hash, map_key->cpu_or_tid, &data)) {
> +               data = zalloc(sizeof(*data));
> +               if (data == NULL)
> +                       return -ENOMEM;
> +
> +               data->key = map_key->cpu_or_tid;
> +               if (hashmap__add(hash, data->key, data) < 0) {
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       /* update thread total stats */
> +       data->nr_events += map_data->count;
> +       data->total_time += map_data->total_time;
> +
> +       nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
> +       if (nodes == NULL)
> +               return -ENOMEM;
> +
> +       data->nodes = nodes;
> +       nodes = &data->nodes[data->nr_nodes++];
> +       nodes->syscall_nr = map_key->nr;
> +
> +       /* each thread has an entry for each syscall, just use the stat */
> +       memcpy(&nodes->stats, map_data, sizeof(*map_data));
> +       return 0;
> +}
> +
> +static int print_thread_stat(struct syscall_data *data, FILE *fp)
> +{
> +       int printed = 0;
> +
> +       qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
> +
> +       printed += fprintf(fp, " thread (%d), ", data->key);
> +       printed += fprintf(fp, "%d events\n\n", data->nr_events);
> +
> +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");

The code LGTM and follows the code base's conventions, but it seems odd
to put very specific output like this inside of util/ rather than
builtin-trace.c. stat-display.c is similar; it just seems the boundary
between util/ and the builtin-*.c files is blurry.

Thanks,
Ian

> +
> +       printed += print_common_stats(data, fp);
> +       printed += fprintf(fp, "\n\n");
> +
> +       return printed;
> +}
> +
> +static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> +       int printed = 0;
> +
> +       for (int i = 0; i < nr_data; i++)
> +               printed += print_thread_stat(data[i], fp);
> +
> +       return printed;
> +}
> +
> +static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key,
> +                             struct syscall_stats *map_data)
> +{
> +       struct syscall_data *data;
> +       struct syscall_stats *stat;
> +
> +       if (!hashmap__find(hash, map_key, &data)) {
> +               data = zalloc(sizeof(*data));
> +               if (data == NULL)
> +                       return -ENOMEM;
> +
> +               data->nodes = zalloc(sizeof(*data->nodes));
> +               if (data->nodes == NULL) {
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +
> +               data->nr_nodes = 1;
> +               data->key = map_key->nr;
> +               data->nodes->syscall_nr = data->key;
> +
> +               if (hashmap__add(hash, data->key, data) < 0) {
> +                       free(data->nodes);
> +                       free(data);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       /* update total stats for this syscall */
> +       data->nr_events += map_data->count;
> +       data->total_time += map_data->total_time;
> +
> +       /* This is sum of the same syscall from different CPUs */
> +       stat = &data->nodes->stats;
> +
> +       stat->total_time += map_data->total_time;
> +       stat->squared_sum += map_data->squared_sum;
> +       stat->count += map_data->count;
> +       stat->error += map_data->error;
> +
> +       if (stat->max_time < map_data->max_time)
> +               stat->max_time = map_data->max_time;
> +       if (stat->min_time > map_data->min_time)
> +               stat->min_time = map_data->min_time;
> +
> +       return 0;
> +}
> +
> +static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
> +{
> +       int printed = 0;
> +       int nr_events = 0;
> +
> +       for (int i = 0; i < nr_data; i++)
> +               nr_events += data[i]->nr_events;
> +
> +       printed += fprintf(fp, " total, %d events\n\n", nr_events);
> +
> +       printed += fprintf(fp, "   syscall            calls  errors  total       min       avg       max       stddev\n");
> +       printed += fprintf(fp, "                                     (msec)    (msec)    (msec)    (msec)        (%%)\n");
> +       printed += fprintf(fp, "   --------------- --------  ------ -------- --------- --------- ---------     ------\n");
> +
> +       for (int i = 0; i < nr_data; i++)
> +               printed += print_common_stats(data[i], fp);
> +
> +       printed += fprintf(fp, "\n\n");
> +       return printed;
> +}
> +
> +int trace_print_bpf_summary(FILE *fp)
> +{
> +       struct bpf_map *map = skel->maps.syscall_stats_map;
> +       struct syscall_key *prev_key, key;
> +       struct syscall_data **data = NULL;
> +       struct hashmap schash;
> +       struct hashmap_entry *entry;
> +       int nr_data = 0;
> +       int printed = 0;
> +       int i;
> +       size_t bkt;
> +
> +       hashmap__init(&schash, sc_node_hash, sc_node_equal, /*ctx=*/NULL);
> +
> +       printed = fprintf(fp, "\n Summary of events:\n\n");
> +
> +       /* get stats from the bpf map */
> +       prev_key = NULL;
> +       while (!bpf_map__get_next_key(map, prev_key, &key, sizeof(key))) {
> +               struct syscall_stats stat;
> +
> +               if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
> +                       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> +                               update_thread_stats(&schash, &key, &stat);
> +                       else
> +                               update_total_stats(&schash, &key, &stat);
> +               }
> +
> +               prev_key = &key;
> +       }
> +
> +       nr_data = hashmap__size(&schash);
> +       data = calloc(nr_data, sizeof(*data));
> +       if (data == NULL)
> +               goto out;
> +
> +       i = 0;
> +       hashmap__for_each_entry(&schash, entry, bkt)
> +               data[i++] = entry->pvalue;
> +
> +       qsort(data, nr_data, sizeof(*data), datacmp);
> +
> +       if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
> +               printed += print_thread_stats(data, nr_data, fp);
> +       else
> +               printed += print_total_stats(data, nr_data, fp);
> +
> +       for (i = 0; i < nr_data && data; i++) {
> +               free(data[i]->nodes);
> +               free(data[i]);
> +       }
> +       free(data);
> +
> +out:
> +       hashmap__clear(&schash);
> +       return printed;
> +}
> +
> +void trace_cleanup_bpf_summary(void)
> +{
> +       syscall_summary_bpf__destroy(skel);
> +}
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> new file mode 100644
> index 0000000000000000..b25f53b3c1351392
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
> @@ -0,0 +1,118 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Trace raw_syscalls tracepoints to collect system call statistics.
> + */
> +
> +#include "vmlinux.h"
> +#include "syscall_summary.h"
> +
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +/* This is to calculate a delta between sys-enter and sys-exit for each thread */
> +struct syscall_trace {
> +       int nr; /* syscall number is only available at sys-enter */
> +       int unused;
> +       u64 timestamp;
> +};
> +
> +#define MAX_ENTRIES    (128 * 1024)
> +
> +struct syscall_trace_map {
> +       __uint(type, BPF_MAP_TYPE_HASH);
> +       __type(key, int); /* tid */
> +       __type(value, struct syscall_trace);
> +       __uint(max_entries, MAX_ENTRIES);
> +} syscall_trace_map SEC(".maps");
> +
> +struct syscall_stats_map {
> +       __uint(type, BPF_MAP_TYPE_HASH);
> +       __type(key, struct syscall_key);
> +       __type(value, struct syscall_stats);
> +       __uint(max_entries, MAX_ENTRIES);
> +} syscall_stats_map SEC(".maps");
> +
> +int enabled; /* controlled from userspace */
> +
> +const volatile enum syscall_aggr_mode aggr_mode;
> +
> +static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
> +{
> +       struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
> +       struct syscall_stats *stats;
> +
> +       stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> +       if (stats == NULL) {
> +               struct syscall_stats zero = {};
> +
> +               bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
> +               stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
> +               if (stats == NULL)
> +                       return;
> +       }
> +
> +       __sync_fetch_and_add(&stats->count, 1);
> +       if (ret < 0)
> +               __sync_fetch_and_add(&stats->error, 1);
> +
> +       if (duration > 0) {
> +               __sync_fetch_and_add(&stats->total_time, duration);
> +               __sync_fetch_and_add(&stats->squared_sum, duration * duration);
> +               if (stats->max_time < duration)
> +                       stats->max_time = duration;
> +               if (stats->min_time > duration || stats->min_time == 0)
> +                       stats->min_time = duration;
> +       }
> +
> +       return;
> +}
> +
> +SEC("tp_btf/sys_enter")
> +int sys_enter(u64 *ctx)
> +{
> +       int tid;
> +       struct syscall_trace st;
> +
> +       if (!enabled)
> +               return 0;
> +
> +       st.nr = ctx[1]; /* syscall number */
> +       st.unused = 0;
> +       st.timestamp = bpf_ktime_get_ns();
> +
> +       tid = bpf_get_current_pid_tgid();
> +       bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
> +
> +       return 0;
> +}
> +
> +SEC("tp_btf/sys_exit")
> +int sys_exit(u64 *ctx)
> +{
> +       int tid;
> +       int key;
> +       long ret = ctx[1]; /* return value of the syscall */
> +       struct syscall_trace *st;
> +       s64 delta;
> +
> +       if (!enabled)
> +               return 0;
> +
> +       tid = bpf_get_current_pid_tgid();
> +       st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
> +       if (st == NULL)
> +               return 0;
> +
> +       if (aggr_mode == SYSCALL_AGGR_THREAD)
> +               key = tid;
> +       else
> +               key = bpf_get_smp_processor_id();
> +
> +       delta = bpf_ktime_get_ns() - st->timestamp;
> +       update_stats(key, st->nr, delta, ret);
> +
> +       bpf_map_delete_elem(&syscall_trace_map, &tid);
> +       return 0;
> +}
> +
> +char _license[] SEC("license") = "GPL";
> diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
> new file mode 100644
> index 0000000000000000..17f9ecba657088aa
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/syscall_summary.h
> @@ -0,0 +1,25 @@
> +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +/* Data structures shared between BPF and tools. */
> +#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
> +#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
> +
> +enum syscall_aggr_mode {
> +       SYSCALL_AGGR_THREAD,
> +       SYSCALL_AGGR_CPU,
> +};
> +
> +struct syscall_key {
> +       int cpu_or_tid;
> +       int nr;
> +};
> +
> +struct syscall_stats {
> +       u64 total_time;
> +       u64 squared_sum;
> +       u64 max_time;
> +       u64 min_time;
> +       u32 count;
> +       u32 error;
> +};
> +
> +#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */
> diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h
> new file mode 100644
> index 0000000000000000..ef8361ed12c4edc1
> --- /dev/null
> +++ b/tools/perf/util/trace.h
> @@ -0,0 +1,37 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef UTIL_TRACE_H
> +#define UTIL_TRACE_H
> +
> +#include <stdio.h>  /* for FILE */
> +
> +enum trace_summary_mode {
> +       SUMMARY__NONE = 0,
> +       SUMMARY__BY_TOTAL,
> +       SUMMARY__BY_THREAD,
> +};
> +
> +#ifdef HAVE_BPF_SKEL
> +
> +int trace_prepare_bpf_summary(enum trace_summary_mode mode);
> +void trace_start_bpf_summary(void);
> +void trace_end_bpf_summary(void);
> +int trace_print_bpf_summary(FILE *fp);
> +void trace_cleanup_bpf_summary(void);
> +
> +#else /* !HAVE_BPF_SKEL */
> +
> +static inline int trace_prepare_bpf_summary(enum trace_summary_mode mode __maybe_unused)
> +{
> +       return -1;
> +}
> +static inline void trace_start_bpf_summary(void) {}
> +static inline void trace_end_bpf_summary(void) {}
> +static inline int trace_print_bpf_summary(FILE *fp __maybe_unused)
> +{
> +       return 0;
> +}
> +static inline void trace_cleanup_bpf_summary(void) {}
> +
> +#endif /* HAVE_BPF_SKEL */
> +
> +#endif /* UTIL_TRACE_H */
> --
> 2.49.0.rc1.451.g8f38331e32-goog
>