linux-kernel - Re: [RFC PATCH bpf-next v2 2/2] bpf: Pass external callchain entry to get_perf

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <aO4-jAA5RIUY2yxc@krava>
Date: Tue, 14 Oct 2025 14:14:04 +0200
From: Jiri Olsa <olsajiri@...il.com>
To: Tao Chen <chen.dylane@...ux.dev>
Cc: peterz@...radead.org, mingo@...hat.com, acme@...nel.org,
	namhyung@...nel.org, mark.rutland@....com,
	alexander.shishkin@...ux.intel.com, irogers@...gle.com,
	adrian.hunter@...el.com, kan.liang@...ux.intel.com, song@...nel.org,
	ast@...nel.org, daniel@...earbox.net, andrii@...nel.org,
	martin.lau@...ux.dev, eddyz87@...il.com, yonghong.song@...ux.dev,
	john.fastabend@...il.com, kpsingh@...nel.org, sdf@...ichev.me,
	haoluo@...gle.com, linux-perf-users@...r.kernel.org,
	linux-kernel@...r.kernel.org, bpf@...r.kernel.org
Subject: Re: [RFC PATCH bpf-next v2 2/2] bpf: Pass external callchain entry
 to get_perf_callchain

On Tue, Oct 14, 2025 at 06:01:28PM +0800, Tao Chen wrote:
> As Alexei noted, get_perf_callchain() return values may be reused
> if a task is preempted after the BPF program enters migrate disable
> mode. Drawing on the per-cpu design of bpf_perf_callchain_entries,
> stack-allocated memory of bpf_perf_callchain_entry is used here.
> 
> Signed-off-by: Tao Chen <chen.dylane@...ux.dev>
> ---
>  kernel/bpf/stackmap.c | 19 +++++++++++--------
>  1 file changed, 11 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 94e46b7f340..acd72c021c0 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -31,6 +31,11 @@ struct bpf_stack_map {
>  	struct stack_map_bucket *buckets[] __counted_by(n_buckets);
>  };
>  
> +struct bpf_perf_callchain_entry {
> +	u64 nr;
> +	u64 ip[PERF_MAX_STACK_DEPTH];
> +};
> +
>  static inline bool stack_map_use_build_id(struct bpf_map *map)
>  {
>  	return (map->map_flags & BPF_F_STACK_BUILD_ID);
> @@ -305,6 +310,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>  	bool user = flags & BPF_F_USER_STACK;
>  	struct perf_callchain_entry *trace;
>  	bool kernel = !user;
> +	struct bpf_perf_callchain_entry entry = { 0 };

so IIUC having entries on stack we do not need to do preempt_disable
you had in the previous version, right?

I saw Andrii's justification to have this on the stack, I think it's
fine, but does it have to be initialized? it seems that only used
entries are copied to map

jirka

>  
>  	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
>  			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
> @@ -314,12 +320,8 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
>  	if (max_depth > sysctl_perf_event_max_stack)
>  		max_depth = sysctl_perf_event_max_stack;
>  
> -	trace = get_perf_callchain(regs, NULL, kernel, user, max_depth,
> -				   false, false);
> -
> -	if (unlikely(!trace))
> -		/* couldn't fetch the stack trace */
> -		return -EFAULT;
> +	trace = get_perf_callchain(regs, (struct perf_callchain_entry *)&entry,
> +				   kernel, user, max_depth, false, false);
>  
>  	return __bpf_get_stackid(map, trace, flags);
>  }
> @@ -412,6 +414,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>  	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
>  	bool user = flags & BPF_F_USER_STACK;
>  	struct perf_callchain_entry *trace;
> +	struct bpf_perf_callchain_entry entry = { 0 };
>  	bool kernel = !user;
>  	int err = -EINVAL;
>  	u64 *ips;
> @@ -451,8 +454,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>  	else if (kernel && task)
>  		trace = get_callchain_entry_for_task(task, max_depth);
>  	else
> -		trace = get_perf_callchain(regs, NULL, kernel, user, max_depth,
> -					   crosstask, false);
> +		trace = get_perf_callchain(regs, (struct perf_callchain_entry *)&entry,
> +					   kernel, user, max_depth, crosstask, false);
>  
>  	if (unlikely(!trace) || trace->nr < skip) {
>  		if (may_fault)
> -- 
> 2.48.1
>