Message-ID: <CAP-5=fWpGm9OhWFjrm9qZL8Ja-uqU76x60gB9MUU++JX6Nuyug@mail.gmail.com>
Date: Sat, 25 Oct 2025 13:29:48 -0700
From: Ian Rogers <irogers@...gle.com>
To: Namhyung Kim <namhyung@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>, Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>, Adrian Hunter <adrian.hunter@...el.com>,
linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2] perf stat: Add/fix bperf cgroup max events workarounds
On Sat, Oct 25, 2025 at 10:37 AM Namhyung Kim <namhyung@...nel.org> wrote:
>
> Hi Ian,
>
> On Sat, Oct 25, 2025 at 09:50:19AM -0700, Ian Rogers wrote:
> > Commit b8308511f6e0 bumped the max events to 1024, but this results in
> > BPF verifier issues if the number of command-line events is too
> > large. Work around this by:
> >
> > 1) moving the constants to a header file to share between BPF and perf
> > C code,
> > 2) testing that the maximum number of events doesn't cause BPF
> > verifier issues in debug builds,
> > 3) lowering the max events from 1024 to 128,
> > 4) in perf stat, disabling BPF counter usage if there are more events
> > than the BPF counters can support.
> >
> > The rodata setup is factored into its own function to avoid
> > duplicating it in the testing code.
> >
> > Signed-off-by: Ian Rogers <irogers@...gle.com>
> > Fixes: b8308511f6e0 ("perf stat bperf cgroup: Increase MAX_EVENTS from 32 to 1024")
> > ---
> > v2: Add missing header file
> > ---
> > tools/perf/builtin-stat.c | 13 +++-
> > tools/perf/util/bpf_counter_cgroup.c | 79 +++++++++++++++------
> > tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 18 +++--
> > tools/perf/util/bpf_skel/bperf_cgroup.h | 15 ++++
> > 4 files changed, 91 insertions(+), 34 deletions(-)
> > create mode 100644 tools/perf/util/bpf_skel/bperf_cgroup.h
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 3c3188a57016..130515f87ee0 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -96,6 +96,10 @@
> > #include <perf/evlist.h>
> > #include <internal/threadmap.h>
> >
> > +#ifdef HAVE_BPF_SKEL
> > +#include "util/bpf_skel/bperf_cgroup.h"
> > +#endif
> > +
> > #define DEFAULT_SEPARATOR " "
> > #define FREEZE_ON_SMI_PATH "bus/event_source/devices/cpu/freeze_on_smi"
> >
> > @@ -2852,7 +2856,14 @@ int cmd_stat(int argc, const char **argv)
> > goto out;
> > }
> > }
> > -
> > +#ifdef HAVE_BPF_SKEL
> > + if (target.use_bpf &&
> > + (evsel_list->core.nr_entries / nr_cgroups) > BPERF_CGROUP__MAX_EVENTS) {
>
> I guess you also need to check that nr_cgroups is not zero to avoid a
> division by zero. Otherwise it looks good to me.
Good catch! Fixed in v3.
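
For reference, v3 just adds the missing guard, i.e. something like
(a sketch; the exact v3 wording may differ):

    if (target.use_bpf && nr_cgroups &&
        (evsel_list->core.nr_entries / nr_cgroups) > BPERF_CGROUP__MAX_EVENTS) {
            ...
    }
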
Thanks,
Ian
> Thanks,
> Namhyung
>
> > + pr_warning("Disabling BPF counters due to more events (%d) than the max (%d)\n",
> > + evsel_list->core.nr_entries / nr_cgroups, BPERF_CGROUP__MAX_EVENTS);
> > + target.use_bpf = false;
> > + }
> > +#endif // HAVE_BPF_SKEL
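
(Illustrative numbers for the fallback, not from the patch: with
--for-each-cgroup expanding to nr_cgroups = 3 and 600 total evsel
entries, 600 / 3 = 200 events per cgroup exceeds the max of 128, so perf
prints the warning above and falls back to non-BPF counters.)
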
> > evlist__warn_user_requested_cpus(evsel_list, target.cpu_list);
> >
> > evlist__for_each_entry(evsel_list, counter) {
> > diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c
> > index 690be3ce3e11..68bd994c8880 100644
> > --- a/tools/perf/util/bpf_counter_cgroup.c
> > +++ b/tools/perf/util/bpf_counter_cgroup.c
> > @@ -27,6 +27,7 @@
> > #include "cpumap.h"
> > #include "thread_map.h"
> >
> > +#include "bpf_skel/bperf_cgroup.h"
> > #include "bpf_skel/bperf_cgroup.skel.h"
> >
> > static struct perf_event_attr cgrp_switch_attr = {
> > @@ -42,6 +43,55 @@ static struct bperf_cgroup_bpf *skel;
> >
> > #define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))
> >
> > +static void setup_rodata(struct bperf_cgroup_bpf *sk, int evlist_size)
> > +{
> > + int map_size, total_cpus = cpu__max_cpu().cpu;
> > +
> > + sk->rodata->num_cpus = total_cpus;
> > + sk->rodata->num_events = evlist_size / nr_cgroups;
> > +
> > + if (cgroup_is_v2("perf_event") > 0)
> > + sk->rodata->use_cgroup_v2 = 1;
> > +
> > + BUG_ON(evlist_size % nr_cgroups != 0);
> > +
> > + /* we need one copy of events per cpu for reading */
> > + map_size = total_cpus * evlist_size / nr_cgroups;
> > + bpf_map__set_max_entries(sk->maps.events, map_size);
> > + bpf_map__set_max_entries(sk->maps.cgrp_idx, nr_cgroups);
> > + /* previous result is saved in a per-cpu array */
> > + map_size = evlist_size / nr_cgroups;
> > + bpf_map__set_max_entries(sk->maps.prev_readings, map_size);
> > + /* cgroup result needs all events (per-cpu) */
> > + map_size = evlist_size;
> > + bpf_map__set_max_entries(sk->maps.cgrp_readings, map_size);
> > +}
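
To make the map sizing concrete (illustrative numbers, not from the
patch): with 16 possible CPUs, 2 cgroups and 8 total evsel entries
(8 / 2 = 4 events per cgroup), this sizes the maps as:

    events:         16 * 8 / 2 = 64 entries (each event, per cpu)
    cgrp_idx:       2 entries (one per cgroup)
    prev_readings:  8 / 2 = 4 entries (per-event, per-cpu array)
    cgrp_readings:  8 entries (all events, per cgroup, per cpu)
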
> > +
> > +static void test_max_events_program_load(void)
> > +{
> > +#ifndef NDEBUG
> > + /*
> > + * Test that the program verifies with the maximum number of events. If
> > + * this test fails then, unfortunately, perf needs recompiling with a
> > + * lower BPERF_CGROUP__MAX_EVENTS to avoid BPF verifier issues.
> > + */
> > + int err, max_events = BPERF_CGROUP__MAX_EVENTS * nr_cgroups;
> > + struct bperf_cgroup_bpf *test_skel = bperf_cgroup_bpf__open();
> > +
> > + if (!test_skel) {
> > + pr_err("Failed to open cgroup skeleton\n");
> > + return;
> > + }
> > + setup_rodata(test_skel, max_events);
> > + err = bperf_cgroup_bpf__load(test_skel);
> > + if (err) {
> > + pr_err("Failed to load cgroup skeleton with max events %d.\n",
> > + BPERF_CGROUP__MAX_EVENTS);
> > + }
> > + bperf_cgroup_bpf__destroy(test_skel);
> > +#endif
> > +}
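
Note why max_events multiplies by nr_cgroups: setup_rodata() divides the
evlist size back down, so the throwaway skeleton is loaded with

    num_events = (BPERF_CGROUP__MAX_EVENTS * nr_cgroups) / nr_cgroups
               = BPERF_CGROUP__MAX_EVENTS

i.e. the verifier is exercised exactly at the limit. And since the body
is under #ifndef NDEBUG, the whole check compiles away in release builds.
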
> > +
> > static int bperf_load_program(struct evlist *evlist)
> > {
> > struct bpf_link *link;
> > @@ -50,35 +100,18 @@ static int bperf_load_program(struct evlist *evlist)
> > int i, j;
> > struct perf_cpu cpu;
> > int total_cpus = cpu__max_cpu().cpu;
> > - int map_size, map_fd;
> > - int prog_fd, err;
> > + int map_fd, prog_fd, err;
> > +
> > + set_max_rlimit();
> > +
> > + test_max_events_program_load();
> >
> > skel = bperf_cgroup_bpf__open();
> > if (!skel) {
> > pr_err("Failed to open cgroup skeleton\n");
> > return -1;
> > }
> > -
> > - skel->rodata->num_cpus = total_cpus;
> > - skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups;
> > -
> > - if (cgroup_is_v2("perf_event") > 0)
> > - skel->rodata->use_cgroup_v2 = 1;
> > -
> > - BUG_ON(evlist->core.nr_entries % nr_cgroups != 0);
> > -
> > - /* we need one copy of events per cpu for reading */
> > - map_size = total_cpus * evlist->core.nr_entries / nr_cgroups;
> > - bpf_map__set_max_entries(skel->maps.events, map_size);
> > - bpf_map__set_max_entries(skel->maps.cgrp_idx, nr_cgroups);
> > - /* previous result is saved in a per-cpu array */
> > - map_size = evlist->core.nr_entries / nr_cgroups;
> > - bpf_map__set_max_entries(skel->maps.prev_readings, map_size);
> > - /* cgroup result needs all events (per-cpu) */
> > - map_size = evlist->core.nr_entries;
> > - bpf_map__set_max_entries(skel->maps.cgrp_readings, map_size);
> > -
> > - set_max_rlimit();
> > + setup_rodata(skel, evlist->core.nr_entries);
> >
> > err = bperf_cgroup_bpf__load(skel);
> > if (err) {
> > diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > index 18ab4d9b49ff..c2298a2decc9 100644
> > --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> > @@ -1,14 +1,12 @@
> > // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > // Copyright (c) 2021 Facebook
> > // Copyright (c) 2021 Google
> > +#include "bperf_cgroup.h"
> > #include "vmlinux.h"
> > #include <bpf/bpf_helpers.h>
> > #include <bpf/bpf_tracing.h>
> > #include <bpf/bpf_core_read.h>
> >
> > -#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary
> > -#define MAX_EVENTS 1024 // max events per cgroup: arbitrary
> > -
> > // NOTE: many of the maps and global data will be modified before loading
> > // from userspace (the perf tool) using the skeleton helpers.
> >
> > @@ -97,7 +95,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> > cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
> > level = BPF_CORE_READ(cgrp, level);
> >
> > - for (cnt = 0; i < MAX_LEVELS; i++) {
> > + for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
> > __u64 cgrp_id;
> >
> > if (i > level)
> > @@ -123,7 +121,7 @@ static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
> > __u32 *elem;
> > int cnt;
> >
> > - for (cnt = 0; i < MAX_LEVELS; i++) {
> > + for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
> > __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);
> >
> > if (cgrp_id == 0)
> > @@ -148,17 +146,17 @@ static int bperf_cgroup_count(void)
> > register int c = 0;
> > struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
> > __u32 cpu = bpf_get_smp_processor_id();
> > - __u32 cgrp_idx[MAX_LEVELS];
> > + __u32 cgrp_idx[BPERF_CGROUP__MAX_LEVELS];
> > int cgrp_cnt;
> > __u32 key, cgrp;
> > long err;
> >
> > if (use_cgroup_v2)
> > - cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
> > + cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
> > else
> > - cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);
> > + cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
> >
> > - for ( ; idx < MAX_EVENTS; idx++) {
> > + for ( ; idx < BPERF_CGROUP__MAX_EVENTS; idx++) {
> > if (idx == num_events)
> > break;
> >
> > @@ -186,7 +184,7 @@ static int bperf_cgroup_count(void)
> > delta.enabled = val.enabled - prev_val->enabled;
> > delta.running = val.running - prev_val->running;
> >
> > - for (c = 0; c < MAX_LEVELS; c++) {
> > + for (c = 0; c < BPERF_CGROUP__MAX_LEVELS; c++) {
> > if (c == cgrp_cnt)
> > break;
> >
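
FWIW, the reason these constants matter to the verifier: the loops keep a
compile-time constant bound and break early at the runtime count, and
verification cost grows with that constant bound. A minimal sketch of the
pattern used above:

    for (idx = 0; idx < BPERF_CGROUP__MAX_EVENTS; idx++) {
            /* constant bound lets the verifier prove termination */
            if (idx == num_events)
                    /* runtime event count limits the real work */
                    break;
            /* per-event read/accumulate happens here */
    }

Dropping the bound from 1024 to 128 shrinks that worst case by 8x.
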
> > diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.h b/tools/perf/util/bpf_skel/bperf_cgroup.h
> > new file mode 100644
> > index 000000000000..3fb84b19d39a
> > --- /dev/null
> > +++ b/tools/perf/util/bpf_skel/bperf_cgroup.h
> > @@ -0,0 +1,15 @@
> > +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
> > +/* Data structures shared between BPF and tools. */
> > +#ifndef __BPERF_CGROUP_H
> > +#define __BPERF_CGROUP_H
> > +
> > +// These constants impact the code size of bperf_cgroup.bpf.c, which may result
> > +// in BPF verifier issues. They are exposed to control the size and also to
> > +// disable BPF counters when the number of user events is too large.
> > +
> > +// max cgroup hierarchy level: arbitrary
> > +#define BPERF_CGROUP__MAX_LEVELS 10
> > +// max events per cgroup: arbitrary
> > +#define BPERF_CGROUP__MAX_EVENTS 128
> > +
> > +#endif /* __BPERF_CGROUP_H */
> > --
> > 2.51.1.821.gb6fe4d2222-goog
> >