Message-ID: <bbf55749-3fba-47b6-8151-39e66fec8950@lucifer.local>
Date: Mon, 11 Aug 2025 14:56:26 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Thomas Gleixner <tglx@...utronix.de>
Cc: LKML <linux-kernel@...r.kernel.org>,
        Linus Torvalds <torvalds@...uxfoundation.org>,
        Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>,
        Namhyung Kim <namhyung@...nel.org>,
        Arnaldo Carvalho de Melo <acme@...hat.com>,
        Kees Cook <kees@...nel.org>
Subject: Re: [patch V2 RESEND 5/6] perf/core: Split the ringbuffer mmap() and
 allocation code out

On Mon, Aug 11, 2025 at 02:36:37PM +0200, Thomas Gleixner wrote:
> The code logic in perf_mmap() is incomprehensible and has been a source of
> subtle bugs in the past. It makes it impossible to convert the atomic_t
> reference counts to refcount_t.
>
> Now that the AUX buffer mapping and allocation code is in its own function
> apply the same treatment to the ringbuffer part and remove the temporary
> workarounds created by the AUX split out.
>
> No functional change intended.
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>

Aside from nits/code style stuff, this LGTM afaict.

I've also done a bunch of (basic) testing locally and hit no issues with
the series, so this all seems fine.

So:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>

>
> ---
>  kernel/events/core.c |  176 ++++++++++++++++++++++-----------------------------
>  1 file changed, 78 insertions(+), 98 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6970,6 +6970,70 @@ static void perf_mmap_account(struct vm_
>  	atomic64_add(extra, &vma->vm_mm->pinned_vm);
>  }
>
> +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
> +			unsigned long nr_pages)
> +{
> +	long user_extra = nr_pages, extra = 0;
> +	struct perf_buffer *rb = event->rb;
> +	int rb_flags = 0;
> +
> +	/*
> +	 * If we have rb pages ensure they're a power-of-two number, so we
> +	 * can do bitmasks instead of modulo.
> +	 */
> +	if (--nr_pages != 0 && !is_power_of_2(nr_pages))
> +		return -EINVAL;
> +
> +	WARN_ON_ONCE(event->ctx->parent_ctx);
> +
> +	if (rb) {
> +		/* Must have the same size */
> +		if (data_page_nr(rb) != nr_pages)
> +			return -EINVAL;
> +
> +		if (atomic_inc_not_zero(&event->rb->mmap_count)) {

NIT: Can't we use &rb->mmap_count here?
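i.e. just (since rb was loaded from event->rb above, the two are
equivalent at this point):

	if (atomic_inc_not_zero(&rb->mmap_count)) {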

> +			/*
> +			 * Success -- managed to mmap() the same buffer
> +			 * multiple times.
> +			 */
> +			atomic_inc(&event->mmap_count);
> +			return 0;
> +		}
> +		/*
> +		 * Raced against perf_mmap_close()'s
> +		 * atomic_dec_and_mutex_lock() remove the event and
> +		 * continue as if !event->rb
> +		 */
> +		ring_buffer_attach(event, NULL);
> +	}
> +
> +	if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
> +		return -EPERM;
> +
> +	if (vma->vm_flags & VM_WRITE)
> +		rb_flags |= RING_BUFFER_WRITABLE;
> +
> +	rb = rb_alloc(nr_pages, event->attr.watermark ? event->attr.wakeup_watermark : 0,
> +		      event->cpu, rb_flags);

Not sure I love us reusing 'rb' here; perhaps we should declare:

struct perf_buffer *prev_rb = event->rb;
struct perf_buffer *rb;

Then refactor the code above to refer to prev_rb, and assign rb here?
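
Roughly, an untested sketch of the shape I mean - same logic as the hunk
above, just with the renamed locals (early sanity checks elided):

	long user_extra = nr_pages, extra = 0;
	struct perf_buffer *prev_rb = event->rb;
	struct perf_buffer *rb;
	int rb_flags = 0;

	if (prev_rb) {
		/* Must have the same size */
		if (data_page_nr(prev_rb) != nr_pages)
			return -EINVAL;

		if (atomic_inc_not_zero(&prev_rb->mmap_count)) {
			/* Another mmap() of the same buffer. */
			atomic_inc(&event->mmap_count);
			return 0;
		}

		/*
		 * Raced against perf_mmap_close(); detach and continue as
		 * if there were no previous buffer.
		 */
		ring_buffer_attach(event, NULL);
	}

	if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
		return -EPERM;

	if (vma->vm_flags & VM_WRITE)
		rb_flags |= RING_BUFFER_WRITABLE;

	rb = rb_alloc(nr_pages, event->attr.watermark ? event->attr.wakeup_watermark : 0,
		      event->cpu, rb_flags);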

> +
> +	if (!rb)
> +		return -ENOMEM;
> +
> +	atomic_set(&rb->mmap_count, 1);
> +	rb->mmap_user = get_current_user();
> +	rb->mmap_locked = extra;
> +
> +	ring_buffer_attach(event, rb);
> +
> +	perf_event_update_time(event);
> +	perf_event_init_userpage(event);
> +	perf_event_update_userpage(event);
> +
> +	perf_mmap_account(vma, user_extra, extra);
> +	atomic_set(&event->mmap_count, 1);
> +	return 0;
> +}
> +
>  static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
>  			 unsigned long nr_pages)
>  {
> @@ -7042,10 +7106,8 @@ static int perf_mmap(struct file *file,
>  {
>  	struct perf_event *event = file->private_data;
>  	unsigned long vma_size, nr_pages;
> -	long user_extra = 0, extra = 0;
> -	struct perf_buffer *rb = NULL;
> -	int ret, flags = 0;
>  	mapped_f mapped;
> +	int ret;
>
>  	/*
>  	 * Don't allow mmap() of inherited per-task counters. This would
> @@ -7071,114 +7133,32 @@ static int perf_mmap(struct file *file,
>  	if (vma_size != PAGE_SIZE * nr_pages)
>  		return -EINVAL;
>
> -	user_extra = nr_pages;
> -
> -	mutex_lock(&event->mmap_mutex);
> -	ret = -EINVAL;
> -
> -	/*
> -	 * This relies on __pmu_detach_event() taking mmap_mutex after marking
> -	 * the event REVOKED. Either we observe the state, or __pmu_detach_event()
> -	 * will detach the rb created here.
> -	 */
> -	if (event->state <= PERF_EVENT_STATE_REVOKED) {
> -		ret = -ENODEV;
> -		goto unlock;
> -	}
> -
> -	if (vma->vm_pgoff == 0) {
> -		nr_pages -= 1;
> -
> +	scoped_guard(mutex, &event->mmap_mutex) {
>  		/*
> -		 * If we have rb pages ensure they're a power-of-two number, so we
> -		 * can do bitmasks instead of modulo.
> +		 * This relies on __pmu_detach_event() taking mmap_mutex
> +		 * after marking the event REVOKED. Either we observe the
> +		 * state, or __pmu_detach_event() will detach the rb
> +		 * created here.
>  		 */
> -		if (nr_pages != 0 && !is_power_of_2(nr_pages))
> -			goto unlock;
> -
> -		WARN_ON_ONCE(event->ctx->parent_ctx);
> +		if (event->state <= PERF_EVENT_STATE_REVOKED)
> +			return -ENODEV;
>
> -		if (event->rb) {
> -			if (data_page_nr(event->rb) != nr_pages)
> -				goto unlock;
> -
> -			if (atomic_inc_not_zero(&event->rb->mmap_count)) {
> -				/*
> -				 * Success -- managed to mmap() the same buffer
> -				 * multiple times.
> -				 */
> -				ret = 0;
> -				/* We need the rb to map pages. */
> -				rb = event->rb;
> -				goto unlock;
> -			}
> -
> -			/*
> -			 * Raced against perf_mmap_close()'s
> -			 * atomic_dec_and_mutex_lock() remove the
> -			 * event and continue as if !event->rb
> -			 */
> -			ring_buffer_attach(event, NULL);
> -		}
> -
> -	} else {
> -		if (!event->rb) {
> -			ret = -EINVAL;
> +		if (vma->vm_pgoff == 0) {
> +			ret = perf_mmap_rb(vma, event, nr_pages);
>  		} else {
> +			if (!event->rb)
> +				return -EINVAL;
>  			scoped_guard(mutex, &event->rb->aux_mutex)
>  				ret = perf_mmap_aux(vma, event, nr_pages);
>  		}
> -		// Temporary workaround to split out AUX handling first
> -		mutex_unlock(&event->mmap_mutex);
> -		goto out;
> -	}
> -
> -	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
> -		ret = -EPERM;
> -		goto unlock;
> -	}
> -
> -	WARN_ON(!rb && event->rb);
> -
> -	if (vma->vm_flags & VM_WRITE)
> -		flags |= RING_BUFFER_WRITABLE;
> -
> -	if (!rb) {
> -		rb = rb_alloc(nr_pages,
> -			      event->attr.watermark ? event->attr.wakeup_watermark : 0,
> -			      event->cpu, flags);
> -
> -		if (!rb) {
> -			ret = -ENOMEM;
> -			goto unlock;
> -		}
> -
> -		atomic_set(&rb->mmap_count, 1);
> -		rb->mmap_user = get_current_user();
> -		rb->mmap_locked = extra;
> -
> -		ring_buffer_attach(event, rb);
> -
> -		perf_event_update_time(event);
> -		perf_event_init_userpage(event);
> -		perf_event_update_userpage(event);
> -		ret = 0;
> -	}
> -unlock:
> -	if (!ret) {
> -		perf_mmap_account(vma, user_extra, extra);
> -		atomic_inc(&event->mmap_count);
>  	}
> -	mutex_unlock(&event->mmap_mutex);
>
> -// Temporary until RB allocation is split out.
> -out:
>  	if (ret)
>  		return ret;
>
>  	/*
>  	 * Since pinned accounting is per vm we cannot allow fork() to copy our
> -	 * vma.
> +	 * VMA. The VMA is fixed size and must not be included in dumps.
>  	 */
>  	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
>  	vma->vm_ops = &perf_mmap_vmops;
> @@ -7193,7 +7173,7 @@ static int perf_mmap(struct file *file,
>  	 * full cleanup in this case and therefore does not invoke
>  	 * vmops::close().
>  	 */
> -	ret = map_range(rb, vma);
> +	ret = map_range(event->rb, vma);

I've gone through the code and convinced myself we always set this by this
point :) I seem to remember there was some issue with using event->rb in a
previous version of my series that added map_range(). Perhaps it was
positioned differently... or maybe I'm misremembering.

At any rate, we used to have a WARN_ON(rb != event->rb) here, so it's
clearly an invariant; we're meant to have set this field by this point
anyway.
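
If we wanted belt and braces, with the rb local gone the old assertion
would reduce to something like (sketch only):

	WARN_ON_ONCE(!event->rb);
	ret = map_range(event->rb, vma);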

>  	if (ret)
>  		perf_mmap_close(vma);
>
>
