Message-ID: <bbf55749-3fba-47b6-8151-39e66fec8950@lucifer.local>
Date: Mon, 11 Aug 2025 14:56:26 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Thomas Gleixner <tglx@...utronix.de>
Cc: LKML <linux-kernel@...r.kernel.org>,
Linus Torvalds <torvalds@...uxfoundation.org>,
Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Arnaldo Carvalho de Melo <acme@...hat.com>,
Kees Cook <kees@...nel.org>
Subject: Re: [patch V2 RESEND 5/6] perf/core: Split the ringbuffer mmap() and
allocation code out
On Mon, Aug 11, 2025 at 02:36:37PM +0200, Thomas Gleixner wrote:
> The code logic in perf_mmap() is incomprehensible and has been a source of
> subtle bugs in the past. It makes it impossible to convert the atomic_t
> reference counts to refcount_t.
>
> Now that the AUX buffer mapping and allocation code is in its own function,
> apply the same treatment to the ringbuffer part and remove the temporary
> workarounds created by the AUX split out.
>
> No functional change intended.
>
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
Aside from nits/code style stuff, this LGTM afaict.
I've also done a bunch of (basic) testing locally with the series and hit no
issues, so this all seems fine.
So:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
>
> ---
> kernel/events/core.c | 176 ++++++++++++++++++++++-----------------------------
> 1 file changed, 78 insertions(+), 98 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6970,6 +6970,70 @@ static void perf_mmap_account(struct vm_
> atomic64_add(extra, &vma->vm_mm->pinned_vm);
> }
>
> +static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
> + unsigned long nr_pages)
> +{
> + long user_extra = nr_pages, extra = 0;
> + struct perf_buffer *rb = event->rb;
> + int rb_flags = 0;
> +
> + /*
> + * If we have rb pages ensure they're a power-of-two number, so we
> + * can do bitmasks instead of modulo.
> + */
> + if (--nr_pages != 0 && !is_power_of_2(nr_pages))
> + return -EINVAL;
> +
> + WARN_ON_ONCE(event->ctx->parent_ctx);
> +
> + if (rb) {
> + /* Must have the same size */
> + if (data_page_nr(rb) != nr_pages)
> + return -EINVAL;
> +
> + if (atomic_inc_not_zero(&event->rb->mmap_count)) {
NIT: Can't we use &rb->mmap_count here?
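i.e. something like (untested, but at this point rb == event->rb so it should
be equivalent):

	if (atomic_inc_not_zero(&rb->mmap_count)) {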
> + /*
> + * Success -- managed to mmap() the same buffer
> + * multiple times.
> + */
> + atomic_inc(&event->mmap_count);
> + return 0;
> + }
> + /*
> + * Raced against perf_mmap_close()'s
> + * atomic_dec_and_mutex_lock() remove the event and
> + * continue as if !event->rb
> + */
> + ring_buffer_attach(event, NULL);
> + }
> +
> + if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
> + return -EPERM;
> +
> + if (vma->vm_flags & VM_WRITE)
> + rb_flags |= RING_BUFFER_WRITABLE;
> +
> + rb = rb_alloc(nr_pages, event->attr.watermark ? event->attr.wakeup_watermark : 0,
> + event->cpu, rb_flags);
Not sure I love us reusing 'rb' here, perhaps we should declare:
struct perf_buffer *prev_rb = event->rb;
struct perf_buffer *rb;
Then refactor above to refer to prev_rb and assign rb here?
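Roughly something like this (untested sketch just to illustrate, names up for
grabs):

	if (prev_rb) {
		/* Must have the same size */
		if (data_page_nr(prev_rb) != nr_pages)
			return -EINVAL;

		if (atomic_inc_not_zero(&prev_rb->mmap_count)) {
			/* mmap()'d the same buffer multiple times. */
			atomic_inc(&event->mmap_count);
			return 0;
		}

		/* Raced against perf_mmap_close(), detach and start over. */
		ring_buffer_attach(event, NULL);
	}
	...
	rb = rb_alloc(nr_pages, event->attr.watermark ?
		      event->attr.wakeup_watermark : 0, event->cpu, rb_flags);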
> +
> + if (!rb)
> + return -ENOMEM;
> +
> + atomic_set(&rb->mmap_count, 1);
> + rb->mmap_user = get_current_user();
> + rb->mmap_locked = extra;
> +
> + ring_buffer_attach(event, rb);
> +
> + perf_event_update_time(event);
> + perf_event_init_userpage(event);
> + perf_event_update_userpage(event);
> +
> + perf_mmap_account(vma, user_extra, extra);
> + atomic_set(&event->mmap_count, 1);
> + return 0;
> +}
> +
> static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
> unsigned long nr_pages)
> {
> @@ -7042,10 +7106,8 @@ static int perf_mmap(struct file *file,
> {
> struct perf_event *event = file->private_data;
> unsigned long vma_size, nr_pages;
> - long user_extra = 0, extra = 0;
> - struct perf_buffer *rb = NULL;
> - int ret, flags = 0;
> mapped_f mapped;
> + int ret;
>
> /*
> * Don't allow mmap() of inherited per-task counters. This would
> @@ -7071,114 +7133,32 @@ static int perf_mmap(struct file *file,
> if (vma_size != PAGE_SIZE * nr_pages)
> return -EINVAL;
>
> - user_extra = nr_pages;
> -
> - mutex_lock(&event->mmap_mutex);
> - ret = -EINVAL;
> -
> - /*
> - * This relies on __pmu_detach_event() taking mmap_mutex after marking
> - * the event REVOKED. Either we observe the state, or __pmu_detach_event()
> - * will detach the rb created here.
> - */
> - if (event->state <= PERF_EVENT_STATE_REVOKED) {
> - ret = -ENODEV;
> - goto unlock;
> - }
> -
> - if (vma->vm_pgoff == 0) {
> - nr_pages -= 1;
> -
> + scoped_guard(mutex, &event->mmap_mutex) {
> /*
> - * If we have rb pages ensure they're a power-of-two number, so we
> - * can do bitmasks instead of modulo.
> + * This relies on __pmu_detach_event() taking mmap_mutex
> + * after marking the event REVOKED. Either we observe the
> + * state, or __pmu_detach_event() will detach the rb
> + * created here.
> */
> - if (nr_pages != 0 && !is_power_of_2(nr_pages))
> - goto unlock;
> -
> - WARN_ON_ONCE(event->ctx->parent_ctx);
> + if (event->state <= PERF_EVENT_STATE_REVOKED)
> + return -ENODEV;
>
> - if (event->rb) {
> - if (data_page_nr(event->rb) != nr_pages)
> - goto unlock;
> -
> - if (atomic_inc_not_zero(&event->rb->mmap_count)) {
> - /*
> - * Success -- managed to mmap() the same buffer
> - * multiple times.
> - */
> - ret = 0;
> - /* We need the rb to map pages. */
> - rb = event->rb;
> - goto unlock;
> - }
> -
> - /*
> - * Raced against perf_mmap_close()'s
> - * atomic_dec_and_mutex_lock() remove the
> - * event and continue as if !event->rb
> - */
> - ring_buffer_attach(event, NULL);
> - }
> -
> - } else {
> - if (!event->rb) {
> - ret = -EINVAL;
> + if (vma->vm_pgoff == 0) {
> + ret = perf_mmap_rb(vma, event, nr_pages);
> } else {
> + if (!event->rb)
> + return -EINVAL;
> scoped_guard(mutex, &event->rb->aux_mutex)
> ret = perf_mmap_aux(vma, event, nr_pages);
> }
> - // Temporary workaround to split out AUX handling first
> - mutex_unlock(&event->mmap_mutex);
> - goto out;
> - }
> -
> - if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
> - ret = -EPERM;
> - goto unlock;
> - }
> -
> - WARN_ON(!rb && event->rb);
> -
> - if (vma->vm_flags & VM_WRITE)
> - flags |= RING_BUFFER_WRITABLE;
> -
> - if (!rb) {
> - rb = rb_alloc(nr_pages,
> - event->attr.watermark ? event->attr.wakeup_watermark : 0,
> - event->cpu, flags);
> -
> - if (!rb) {
> - ret = -ENOMEM;
> - goto unlock;
> - }
> -
> - atomic_set(&rb->mmap_count, 1);
> - rb->mmap_user = get_current_user();
> - rb->mmap_locked = extra;
> -
> - ring_buffer_attach(event, rb);
> -
> - perf_event_update_time(event);
> - perf_event_init_userpage(event);
> - perf_event_update_userpage(event);
> - ret = 0;
> - }
> -unlock:
> - if (!ret) {
> - perf_mmap_account(vma, user_extra, extra);
> - atomic_inc(&event->mmap_count);
> }
> - mutex_unlock(&event->mmap_mutex);
>
> -// Temporary until RB allocation is split out.
> -out:
> if (ret)
> return ret;
>
> /*
> * Since pinned accounting is per vm we cannot allow fork() to copy our
> - * vma.
> + * VMA. The VMA is fixed size and must not be included in dumps.
> */
> vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
> vma->vm_ops = &perf_mmap_vmops;
> @@ -7193,7 +7173,7 @@ static int perf_mmap(struct file *file,
> * full cleanup in this case and therefore does not invoke
> * vmops::close().
> */
> - ret = map_range(rb, vma);
> + ret = map_range(event->rb, vma);
I've gone through the code and convinced myself we always set this by this
point :) I seem to remember there was some issue with me using event->rb in a
previous version of my series that added map_range(). Perhaps it was
positioned differently, or maybe I'm misremembering.
At any rate, we used to have a WARN_ON(rb != event->rb) here, so this is
clearly an invariant - we're meant to have set this field by then anyway.
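Maybe it's worth keeping a cheap assert to document that invariant, e.g. (just
a thought, not a blocker):

	WARN_ON_ONCE(!event->rb);
	ret = map_range(event->rb, vma);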
> if (ret)
> perf_mmap_close(vma);
>
>