Message-ID: <aP0KyW5MmTL3yTtu@google.com>
Date: Sat, 25 Oct 2025 10:37:13 -0700
From: Namhyung Kim <namhyung@...nel.org>
To: Ian Rogers <irogers@...gle.com>
Cc: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>,
Adrian Hunter <adrian.hunter@...el.com>,
linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2] perf stat: Add/fix bperf cgroup max events workarounds

Hi Ian,

On Sat, Oct 25, 2025 at 09:50:19AM -0700, Ian Rogers wrote:
> Commit b8308511f6e0 bumped the max events to 1024, but this results in
> BPF verifier issues if the number of command-line events is too
> large. Work around this by:
>
> 1) moving the constants to a header file to share between BPF and perf
> C code,
> 2) testing that the maximum number of events doesn't cause BPF
> verifier issues in debug builds,
> 3) lowering the max events from 1024 to 128,
> 4) in perf stat, disabling BPF counter usage if there are more events
>    than the BPF counters can support.
>
> The rodata setup is factored into its own function to avoid
> duplicating it in the testing code.
>
> Signed-off-by: Ian Rogers <irogers@...gle.com>
> Fixes: b8308511f6e0 ("perf stat bperf cgroup: Increase MAX_EVENTS from 32 to 1024")
> ---
> v2: Add missing header file
> ---
> tools/perf/builtin-stat.c | 13 +++-
> tools/perf/util/bpf_counter_cgroup.c | 79 +++++++++++++++------
> tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 18 +++--
> tools/perf/util/bpf_skel/bperf_cgroup.h | 15 ++++
> 4 files changed, 91 insertions(+), 34 deletions(-)
> create mode 100644 tools/perf/util/bpf_skel/bperf_cgroup.h
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 3c3188a57016..130515f87ee0 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -96,6 +96,10 @@
> #include <perf/evlist.h>
> #include <internal/threadmap.h>
>
> +#ifdef HAVE_BPF_SKEL
> +#include "util/bpf_skel/bperf_cgroup.h"
> +#endif
> +
> #define DEFAULT_SEPARATOR " "
> #define FREEZE_ON_SMI_PATH "bus/event_source/devices/cpu/freeze_on_smi"
>
> @@ -2852,7 +2856,14 @@ int cmd_stat(int argc, const char **argv)
> goto out;
> }
> }
> -
> +#ifdef HAVE_BPF_SKEL
> + if (target.use_bpf &&
> + (evsel_list->core.nr_entries / nr_cgroups) > BPERF_CGROUP__MAX_EVENTS) {

I guess you also need to check that nr_cgroups is not zero before
dividing, otherwise this looks good to me.
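
Something like this, maybe (an untested sketch on top of your change,
just adding the nr_cgroups guard):

#ifdef HAVE_BPF_SKEL
	if (target.use_bpf && nr_cgroups &&
	    (evsel_list->core.nr_entries / nr_cgroups) > BPERF_CGROUP__MAX_EVENTS) {
		/* nr_cgroups is non-zero here, so the division is safe. */
		pr_warning("Disabling BPF counters due to more events (%d) than the max (%d)\n",
			   evsel_list->core.nr_entries / nr_cgroups,
			   BPERF_CGROUP__MAX_EVENTS);
		target.use_bpf = false;
	}
#endif // HAVE_BPF_SKEL
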
Thanks,
Namhyung
> + pr_warning("Disabling BPF counters due to more events (%d) than the max (%d)\n",
> + evsel_list->core.nr_entries / nr_cgroups, BPERF_CGROUP__MAX_EVENTS);
> + target.use_bpf = false;
> + }
> +#endif // HAVE_BPF_SKEL
> evlist__warn_user_requested_cpus(evsel_list, target.cpu_list);
>
> evlist__for_each_entry(evsel_list, counter) {
> diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c
> index 690be3ce3e11..68bd994c8880 100644
> --- a/tools/perf/util/bpf_counter_cgroup.c
> +++ b/tools/perf/util/bpf_counter_cgroup.c
> @@ -27,6 +27,7 @@
> #include "cpumap.h"
> #include "thread_map.h"
>
> +#include "bpf_skel/bperf_cgroup.h"
> #include "bpf_skel/bperf_cgroup.skel.h"
>
> static struct perf_event_attr cgrp_switch_attr = {
> @@ -42,6 +43,55 @@ static struct bperf_cgroup_bpf *skel;
>
> #define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))
>
> +static void setup_rodata(struct bperf_cgroup_bpf *sk, int evlist_size)
> +{
> + int map_size, total_cpus = cpu__max_cpu().cpu;
> +
> + sk->rodata->num_cpus = total_cpus;
> + sk->rodata->num_events = evlist_size / nr_cgroups;
> +
> + if (cgroup_is_v2("perf_event") > 0)
> + sk->rodata->use_cgroup_v2 = 1;
> +
> + BUG_ON(evlist_size % nr_cgroups != 0);
> +
> + /* we need one copy of events per cpu for reading */
> + map_size = total_cpus * evlist_size / nr_cgroups;
> + bpf_map__set_max_entries(sk->maps.events, map_size);
> + bpf_map__set_max_entries(sk->maps.cgrp_idx, nr_cgroups);
> + /* previous result is saved in a per-cpu array */
> + map_size = evlist_size / nr_cgroups;
> + bpf_map__set_max_entries(sk->maps.prev_readings, map_size);
> + /* cgroup result needs all events (per-cpu) */
> + map_size = evlist_size;
> + bpf_map__set_max_entries(sk->maps.cgrp_readings, map_size);
> +}
> +
> +static void test_max_events_program_load(void)
> +{
> +#ifndef NDEBUG
> + /*
> + * Test that the program verifies with the maximum number of events. If
> +	 * this test fails, perf unfortunately needs recompiling with a lower
> + * BPERF_CGROUP__MAX_EVENTS to avoid BPF verifier issues.
> + */
> + int err, max_events = BPERF_CGROUP__MAX_EVENTS * nr_cgroups;
> + struct bperf_cgroup_bpf *test_skel = bperf_cgroup_bpf__open();
> +
> + if (!test_skel) {
> + pr_err("Failed to open cgroup skeleton\n");
> + return;
> + }
> + setup_rodata(test_skel, max_events);
> + err = bperf_cgroup_bpf__load(test_skel);
> + if (err) {
> + pr_err("Failed to load cgroup skeleton with max events %d.\n",
> + BPERF_CGROUP__MAX_EVENTS);
> + }
> + bperf_cgroup_bpf__destroy(test_skel);
> +#endif
> +}
> +
> static int bperf_load_program(struct evlist *evlist)
> {
> struct bpf_link *link;
> @@ -50,35 +100,18 @@ static int bperf_load_program(struct evlist *evlist)
> int i, j;
> struct perf_cpu cpu;
> int total_cpus = cpu__max_cpu().cpu;
> - int map_size, map_fd;
> - int prog_fd, err;
> + int map_fd, prog_fd, err;
> +
> + set_max_rlimit();
> +
> + test_max_events_program_load();
>
> skel = bperf_cgroup_bpf__open();
> if (!skel) {
> pr_err("Failed to open cgroup skeleton\n");
> return -1;
> }
> -
> - skel->rodata->num_cpus = total_cpus;
> - skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups;
> -
> - if (cgroup_is_v2("perf_event") > 0)
> - skel->rodata->use_cgroup_v2 = 1;
> -
> - BUG_ON(evlist->core.nr_entries % nr_cgroups != 0);
> -
> - /* we need one copy of events per cpu for reading */
> - map_size = total_cpus * evlist->core.nr_entries / nr_cgroups;
> - bpf_map__set_max_entries(skel->maps.events, map_size);
> - bpf_map__set_max_entries(skel->maps.cgrp_idx, nr_cgroups);
> - /* previous result is saved in a per-cpu array */
> - map_size = evlist->core.nr_entries / nr_cgroups;
> - bpf_map__set_max_entries(skel->maps.prev_readings, map_size);
> - /* cgroup result needs all events (per-cpu) */
> - map_size = evlist->core.nr_entries;
> - bpf_map__set_max_entries(skel->maps.cgrp_readings, map_size);
> -
> - set_max_rlimit();
> + setup_rodata(skel, evlist->core.nr_entries);
>
> err = bperf_cgroup_bpf__load(skel);
> if (err) {
> diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> index 18ab4d9b49ff..c2298a2decc9 100644
> --- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
> @@ -1,14 +1,12 @@
> // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> // Copyright (c) 2021 Facebook
> // Copyright (c) 2021 Google
> +#include "bperf_cgroup.h"
> #include "vmlinux.h"
> #include <bpf/bpf_helpers.h>
> #include <bpf/bpf_tracing.h>
> #include <bpf/bpf_core_read.h>
>
> -#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary
> -#define MAX_EVENTS 1024 // max events per cgroup: arbitrary
> -
> // NOTE: many of map and global data will be modified before loading
> // from the userspace (perf tool) using the skeleton helpers.
>
> @@ -97,7 +95,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
> cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
> level = BPF_CORE_READ(cgrp, level);
>
> - for (cnt = 0; i < MAX_LEVELS; i++) {
> + for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
> __u64 cgrp_id;
>
> if (i > level)
> @@ -123,7 +121,7 @@ static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
> __u32 *elem;
> int cnt;
>
> - for (cnt = 0; i < MAX_LEVELS; i++) {
> + for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
> __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);
>
> if (cgrp_id == 0)
> @@ -148,17 +146,17 @@ static int bperf_cgroup_count(void)
> register int c = 0;
> struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
> __u32 cpu = bpf_get_smp_processor_id();
> - __u32 cgrp_idx[MAX_LEVELS];
> + __u32 cgrp_idx[BPERF_CGROUP__MAX_LEVELS];
> int cgrp_cnt;
> __u32 key, cgrp;
> long err;
>
> if (use_cgroup_v2)
> - cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
> + cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
> else
> - cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);
> + cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
>
> - for ( ; idx < MAX_EVENTS; idx++) {
> + for ( ; idx < BPERF_CGROUP__MAX_EVENTS; idx++) {
> if (idx == num_events)
> break;
>
> @@ -186,7 +184,7 @@ static int bperf_cgroup_count(void)
> delta.enabled = val.enabled - prev_val->enabled;
> delta.running = val.running - prev_val->running;
>
> - for (c = 0; c < MAX_LEVELS; c++) {
> + for (c = 0; c < BPERF_CGROUP__MAX_LEVELS; c++) {
> if (c == cgrp_cnt)
> break;
>
> diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.h b/tools/perf/util/bpf_skel/bperf_cgroup.h
> new file mode 100644
> index 000000000000..3fb84b19d39a
> --- /dev/null
> +++ b/tools/perf/util/bpf_skel/bperf_cgroup.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
> +/* Data structures shared between BPF and tools. */
> +#ifndef __BPERF_CGROUP_H
> +#define __BPERF_CGROUP_H
> +
> +// These constants impact the code size of bperf_cgroup.bpf.c, which may result
> +// in BPF verifier issues. They are exposed to control the size and also to
> +// disable BPF counters when the number of user events is too large.
> +
> +// max cgroup hierarchy level: arbitrary
> +#define BPERF_CGROUP__MAX_LEVELS 10
> +// max events per cgroup: arbitrary
> +#define BPERF_CGROUP__MAX_EVENTS 128
> +
> +#endif /* __BPERF_CGROUP_H */
> --
> 2.51.1.821.gb6fe4d2222-goog
>