Message-ID: <cd206fce-0a91-4501-a863-d398dbca197f@lucifer.local>
Date: Thu, 7 Aug 2025 16:38:34 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Thomas Gleixner <tglx@...utronix.de>
Cc: LKML <linux-kernel@...r.kernel.org>,
        Linus Torvalds <torvalds@...uxfoundation.org>,
        Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>,
        Namhyung Kim <namhyung@...nel.org>,
        Arnaldo Carvalho de Melo <acme@...hat.com>,
        Kees Cook <kees@...nel.org>
Subject: Re: [patch 4/6] perf/core: Split out ringbuffer allocation

On Wed, Aug 06, 2025 at 10:12:58PM +0200, Thomas Gleixner wrote:
> The code logic in perf_mmap() is incomprehensible and has been source of
> subtle bugs in the past. It makes it impossible to convert the atomic_t
> reference counts to refcount_t.
>
> There is not really much, which is shared between the ringbuffer and AUX
> buffer allocation code since the mlock limit calculation and the
> accounting has been split out into helper functions.
>
> Move the AUX buffer allocation code out and integrate the call with a
> momentary workaround to allow skipping the remaining ringbuffer related
> code completely. That workaround will be removed once the ringbuffer
> allocation is moved to its own function as well.
>
> No functional change.
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>

It'd be easier if you'd separated out the move from the refactor, but I'm going
through it here just to check the logic:

- Separate out checks into perf_mmap_aux()
- Add some comments
- We use scoped_guard() for the &event->rb->aux_mutex lock (see the sketch just
  after this list)
- We actually return -EINVAL instead of relying on the default ret
  (such a footgun, that; I've hit issues with it before)
- Instead of proceeding with the rest of the code as before, we temporarily:
   - drop the event->mmap_mutex mutex
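
(As an aside, my reading of the scoped_guard() point above, going by the
cleanup.h helpers: the guard takes the mutex on entry and drops it
automatically when the guarded statement ends, so

	scoped_guard(mutex, &event->rb->aux_mutex)
		ret = perf_mmap_aux(vma, event, nr_pages);

should be roughly equivalent to

	/* lock taken by the guard on entry... */
	mutex_lock(&event->rb->aux_mutex);
	ret = perf_mmap_aux(vma, event, nr_pages);
	/* ...and released automatically once the guarded statement ends */
	mutex_unlock(&event->rb->aux_mutex);

with the unlock also firing on any early exit from the guarded scope.)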

And then we explore the other differences below.

There seem to be two bugs, I've noted them inline below. The most serious is an
inverted event->rb check; the logic below assumes that's fixed.

The second is that you don't seem to be doing:

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;

In the AUX code any more. Maybe the first is irrelevant, but surely the second isn't?

I've noted it inline below anyway.

OK, moving on: perf_mmap_aux() does the stuff in perf_mmap() that would
previously be executed, which would be:

~~~~~~~~~~ 1. ~~~~~~~~~~

-- BEFORE --

	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
		ret = -EPERM;
		goto unlock;
	}
...
unlock:
	if (!ret) {
		...
	} else if (rb) {
		/* AUX allocation failed */
		atomic_dec(&rb->mmap_count);
	}

aux_unlock:
	if (aux_mutex)
		mutex_unlock(aux_mutex);
	mutex_unlock(&event->mmap_mutex);

	if (ret)
		return ret;

-- AFTER --

(We already checked that event->rb is non-NULL)


	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
		atomic_dec(&rb->mmap_count);
		return -EPERM;
	}
	... < scoped_guard takes care of aux_mutex >

		// Temporary workaround to split out AUX handling first
		mutex_unlock(&event->mmap_mutex);
		goto out;
 	}

out:
 	if (ret)
 		return ret;

(seems equivalent)

~~~~~~~~~~ 2. ~~~~~~~~~~

-- BEFORE --

	WARN_ON(!rb && event->rb);

	if (vma->vm_flags & VM_WRITE)
		flags |= RING_BUFFER_WRITABLE;

	...

	} else {
		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
				   event->attr.aux_watermark, flags);
		if (!ret) {
			atomic_set(&rb->aux_mmap_count, 1);
			rb->aux_mmap_locked = extra;
		}
	}

unlock:
	if (!ret) {
		perf_mmap_account(vma, user_extra, extra);
		atomic_inc(&event->mmap_count);
	} else if (rb) {
		/* AUX allocation failed */
		atomic_dec(&rb->mmap_count);
	}

aux_unlock:
	if (aux_mutex)
		mutex_unlock(aux_mutex);
	mutex_unlock(&event->mmap_mutex);

	if (ret)
		return ret;

	... vm_flags_set() blah blah ...

-- AFTER --

	ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
			   event->attr.aux_watermark, rb_flags);
	if (ret) {
		atomic_dec(&rb->mmap_count);
		return ret;
	}

	atomic_set(&rb->aux_mmap_count, 1);
	rb->aux_mmap_locked = extra;
	perf_mmap_account(vma, user_extra, extra);
	atomic_inc(&event->mmap_count);
	return 0;

	... < scoped_guard takes care of aux_mutex >

		// Temporary workaround to split out AUX handling first
		mutex_unlock(&event->mmap_mutex);
		goto out;
 	}

out:
 	if (ret)
 		return ret;

	... vm_flags_set() blah blah ...


-- rb_alloc_aux() failure path --

BEFORE

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;
atomic_dec(&rb->mmap_count);
if (aux_mutex)
	mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
if (ret)
	return ret;

AFTER

atomic_dec(&rb->mmap_count);
.. < scoped_guard takes care of aux_mutex >
mutex_unlock(&event->mmap_mutex);
if (ret)
	return ret;

--  rb_alloc_aux() success path --

BEFORE

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;
atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
perf_mmap_account(vma, user_extra, extra);
atomic_inc(&event->mmap_count);
if (aux_mutex)
	mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
...vm_flags_set() blah blah  ...

AFTER

atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
.. < scoped_guard takes care of aux_mutex >

perf_mmap_account(vma, user_extra, extra);
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
...vm_flags_set() blah blah  ...


DIFFERENCES:

If we get to the rb_alloc_aux() bit, we're missing the:

WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
	flags |= RING_BUFFER_WRITABLE;

bit for the AUX case.
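
If that's not intentional, I'd have expected perf_mmap_aux() to do something
along these lines before calling rb_alloc_aux() (just a sketch, using the
rb_flags name from your patch):

	/* Propagate writability of the mapping to the AUX ring buffer */
	if (vma->vm_flags & VM_WRITE)
		rb_flags |= RING_BUFFER_WRITABLE;

	ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
			   event->attr.aux_watermark, rb_flags);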

Otherwise, it seems to be equivalent.

> ---
>  kernel/events/core.c |  134 ++++++++++++++++++++++++++++-----------------------
>  1 file changed, 75 insertions(+), 59 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6970,12 +6970,76 @@ static void perf_mmap_account(struct vm_
>  	atomic64_add(extra, &vma->vm_mm->pinned_vm);
>  }
>
> +static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
> +			 unsigned long nr_pages)
> +{
> +	long user_extra = nr_pages, extra = 0;
> +	struct perf_buffer *rb = event->rb;
> +	u64 aux_offset, aux_size;
> +	int ret, rb_flags = 0;
> +
> +	/*
> +	 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
> +	 * mapped, all subsequent mappings should have the same size
> +	 * and offset. Must be above the normal perf buffer.
> +	 */
> +	aux_offset = READ_ONCE(rb->user_page->aux_offset);
> +	aux_size = READ_ONCE(rb->user_page->aux_size);
> +
> +	if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
> +		return -EINVAL;
> +
> +	if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
> +		return -EINVAL;
> +
> +	/* Already mapped with a different offset */
> +	if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
> +		return -EINVAL;
> +
> +	if (aux_size != nr_pages * PAGE_SIZE)
> +		return -EINVAL;
> +
> +	/* Already mapped with a different size */
> +	if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
> +		return -EINVAL;
> +
> +	if (!is_power_of_2(nr_pages))
> +		return -EINVAL;
> +
> +	/* If this succeeds, subsequent failures have to undo it */
> +	if (!atomic_inc_not_zero(&rb->mmap_count))
> +		return -EINVAL;
> +
> +	/* If mapped, attach to it */
> +	if (rb_has_aux(rb)) {
> +		atomic_inc(&rb->aux_mmap_count);
> +		return 0;
> +	}
> +
> +	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
> +		atomic_dec(&rb->mmap_count);
> +		return -EPERM;
> +	}
> +
> +	ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
> +			   event->attr.aux_watermark, rb_flags);
> +	if (ret) {
> +		atomic_dec(&rb->mmap_count);
> +		return ret;
> +	}
> +
> +	atomic_set(&rb->aux_mmap_count, 1);
> +	rb->aux_mmap_locked = extra;
> +	perf_mmap_account(vma, user_extra, extra);
> +	atomic_inc(&event->mmap_count);
> +	return 0;
> +}
> +
>  static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>  	struct perf_event *event = file->private_data;
>  	unsigned long vma_size, nr_pages;
>  	long user_extra = 0, extra = 0;
> -	struct mutex *aux_mutex = NULL;
>  	struct perf_buffer *rb = NULL;
>  	int ret, flags = 0;
>  	mapped_f mapped;
> @@ -7055,51 +7119,15 @@ static int perf_mmap(struct file *file,
>  		}
>
>  	} else {
> -		/*
> -		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
> -		 * mapped, all subsequent mappings should have the same size
> -		 * and offset. Must be above the normal perf buffer.
> -		 */
> -		u64 aux_offset, aux_size;
> -
> -		rb = event->rb;
> -		if (!rb)
> -			goto aux_unlock;
> -
> -		aux_mutex = &rb->aux_mutex;
> -		mutex_lock(aux_mutex);
> -
> -		aux_offset = READ_ONCE(rb->user_page->aux_offset);
> -		aux_size = READ_ONCE(rb->user_page->aux_size);
> -
> -		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
> -			goto aux_unlock;
> -
> -		if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
> -			goto aux_unlock;
> -
> -		/* already mapped with a different offset */
> -		if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
> -			goto aux_unlock;
> -
> -		if (aux_size != nr_pages * PAGE_SIZE)
> -			goto aux_unlock;
> -
> -		/* already mapped with a different size */
> -		if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
> -			goto aux_unlock;
> -
> -		if (!is_power_of_2(nr_pages))
> -			goto aux_unlock;
> -
> -		if (!atomic_inc_not_zero(&rb->mmap_count))
> -			goto aux_unlock;
> -
> -		if (rb_has_aux(rb)) {
> -			atomic_inc(&rb->aux_mmap_count);
> -			ret = 0;
> -			goto unlock;
> +		if (event->rb) {
> +			ret = -EINVAL;

Shouldn't this be if (!event->rb) ?

> +		} else {

Because here you're dereffing event->rb in the branch where !event->rb?
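
I'm guessing the intent was something like this (just a sketch, assuming the
condition is simply inverted):

		if (!event->rb) {
			ret = -EINVAL;
		} else {
			/* perf_mmap_aux() dereferences event->rb, so only reach it here */
			scoped_guard(mutex, &event->rb->aux_mutex)
				ret = perf_mmap_aux(vma, event, nr_pages);
		}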

> +			scoped_guard(mutex, &event->rb->aux_mutex)
> +				ret = perf_mmap_aux(vma, event, nr_pages);
>  		}
> +		// Temporary workaround to split out AUX handling first
> +		mutex_unlock(&event->mmap_mutex);
> +		goto out;

Noted above but you're now skipping the:

	WARN_ON(!rb && event->rb);

	if (vma->vm_flags & VM_WRITE)
		flags |= RING_BUFFER_WRITABLE;

bit in the AUX case, no? Unless I'm missing something?

>  	}
>
>  	if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
> @@ -7132,28 +7160,16 @@ static int perf_mmap(struct file *file,
>  		perf_event_init_userpage(event);
>  		perf_event_update_userpage(event);
>  		ret = 0;
> -	} else {
> -		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
> -				   event->attr.aux_watermark, flags);
> -		if (!ret) {
> -			atomic_set(&rb->aux_mmap_count, 1);
> -			rb->aux_mmap_locked = extra;
> -		}
>  	}
> -
>  unlock:
>  	if (!ret) {
>  		perf_mmap_account(vma, user_extra, extra);
>  		atomic_inc(&event->mmap_count);
> -	} else if (rb) {
> -		/* AUX allocation failed */
> -		atomic_dec(&rb->mmap_count);
>  	}
> -aux_unlock:
> -	if (aux_mutex)
> -		mutex_unlock(aux_mutex);
>  	mutex_unlock(&event->mmap_mutex);
>
> +// Temporary until RB allocation is split out.
> +out:
>  	if (ret)
>  		return ret;
>
>

Cheers, Lorenzo
