Message-ID: <2550a3c6-b0d9-4ad2-ae04-5735d7b4cf5f@lucifer.local>
Date: Thu, 28 Nov 2024 20:47:28 +0000
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Ingo Molnar <mingo@...hat.com>, Arnaldo Carvalho de Melo <acme@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>, Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>,
Kan Liang <kan.liang@...ux.intel.com>,
linux-perf-users@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, Matthew Wilcox <willy@...radead.org>
Subject: Re: [PATCH] perf: map pages in advance
Peter - not sure whether it's easy for you to make a simple adjustment to this
patch or if you want me to just send a v2, but I have to pop an #ifdef CONFIG_MMU
into the code.
I've indicated inline below where it's needed, but if it's a pain/not
possible for you to do a fixup in your tree I'm happy to just respin, let
me know what works for you.
Sorry about that!
On Thu, Nov 28, 2024 at 11:37:14AM +0000, Lorenzo Stoakes wrote:
> We are currently refactoring struct page to make it smaller, removing
> unneeded fields that correctly belong to struct folio.
>
> Two of those fields are page->index and page->mapping. Perf currently
> makes use of both of these, so this patch removes that usage, which turns
> out to be unnecessary.
>
> Perf establishes its own internally controlled memory-mapped pages using
> vm_ops hooks. The first page in the mapping is the read/write user control
> page, and the rest of the mapping consists of read-only pages.
>
> The VMA is backed by kernel memory either from the buddy allocator or
> vmalloc depending on configuration. It is intended to be mapped read/write,
> but because it has a page_mkwrite() hook, vma_wants_writenotify() indicates
> that it should be mapped read-only.
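(Aside for anyone not steeped in this bit of mm: vma_wants_writenotify()
boils down to roughly the following - a heavily simplified sketch, other
cases elided:

	/* Only shared, writable mappings are candidates for write-notify. */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) != (VM_WRITE | VM_SHARED))
		return 0;

	/* The backer wants to know when pages are first written to. */
	if (vma->vm_ops &&
	    (vma->vm_ops->page_mkwrite || vma->vm_ops->pfn_mkwrite))
		return 1;

So providing either hook is what causes the mapping to be installed
read-only in the first place.)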
>
> When a write fault occurs, the provided page_mkwrite() hook,
> perf_mmap_fault() (doing double duty handling faults as well) uses the
> vmf->pgoff field to determine if this is the first page, allowing for the
> desired read/write first page, read-only rest mapping.
>
> For this to work the implementation has to carefully work around faulting
> logic. When a page is write-faulted, the fault() hook is called first, then
> its page_mkwrite() hook is called (to allow for dirty tracking in file
> systems).
>
> On fault, we set the folio's mapping in perf_mmap_fault(); this is because
> when do_page_mkwrite() is subsequently invoked, it treats a missing mapping
> as an indication that the fault should be retried.
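(For reference, the relevant part of do_page_mkwrite() in mm/memory.c is
roughly the below - simplified sketch:

	vmf->flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE;
	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
	if (!(ret & VM_FAULT_LOCKED)) {
		folio_lock(folio);
		if (!folio->mapping) {
			/* Likely truncated under us - have caller retry. */
			folio_unlock(folio);
			return 0;
		}
		ret |= VM_FAULT_LOCKED;
	}

Hence the need to populate folio->mapping from the fault handler.)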
>
> We also set the folio's index so that, given the folio is being treated as
> faux user memory, it correctly references its offset within the VMA.
>
> This explains why the mapping and index fields are used - but none of this
> is actually necessary.
>
> We preallocate pages when perf_mmap() is called for the first time via
> rb_alloc(), and further allocate auxiliary pages via rb_alloc_aux() as
> needed if the mapping requires it.
>
> This allocation is done in the f_ops->mmap() hook provided in perf_mmap(),
> and so we can instead simply map all the memory right away here - there's
> no point in handling (read) page faults when we neither demand-page nor need
> to be notified about them (and perf does neither).
>
> This patch therefore changes this logic to map everything when the mmap()
> hook is called, establishing a PFN map. It implements vm_ops->pfn_mkwrite()
> to provide the required read/write vs. read-only behaviour, which does not
> require the previously implemented workarounds.
>
> It makes sense semantically to establish a PFN map too - we are managing
> the pages internally and so it is appropriate to mark this as a special
> mapping.
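(For reference, the write-notify path for a PFN map is roughly the below in
the write-protect fault handler, wp_pfn_shared() - simplified sketch:

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & VM_FAULT_ERROR)
			return ret;	/* perf: SIGBUS for any pgoff != 0 */
		/* otherwise the PTE is re-validated and made writable */
	}

No struct page metadata is touched anywhere in that path, which is what lets
us drop the page->mapping/index manipulation.)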
>
> There should be no change to actual functionality as a result of this
> change.
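(From userspace nothing changes. Illustratively - n, page_size and perf_fd
here standing in for the usual setup:

	/* 1 control page + 2^n data pages, as before */
	size_t len = (1 + (1UL << n)) * page_size;
	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			  perf_fd, 0);
	struct perf_event_mmap_page *meta = base;	/* page 0: read/write */

The first page remains the read/write control page, and a write to any other
page still raises SIGBUS, exactly as before.)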
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> ---
> kernel/events/core.c | 76 +++++++++++++++++++------------------
> kernel/events/ring_buffer.c | 19 +---------
> 2 files changed, 40 insertions(+), 55 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 5d4a54f50826..0754b070497f 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6284,41 +6284,6 @@ void perf_event_update_userpage(struct perf_event *event)
> }
> EXPORT_SYMBOL_GPL(perf_event_update_userpage);
>
> -static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
> -{
> - struct perf_event *event = vmf->vma->vm_file->private_data;
> - struct perf_buffer *rb;
> - vm_fault_t ret = VM_FAULT_SIGBUS;
> -
> - if (vmf->flags & FAULT_FLAG_MKWRITE) {
> - if (vmf->pgoff == 0)
> - ret = 0;
> - return ret;
> - }
> -
> - rcu_read_lock();
> - rb = rcu_dereference(event->rb);
> - if (!rb)
> - goto unlock;
> -
> - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
> - goto unlock;
> -
> - vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
> - if (!vmf->page)
> - goto unlock;
> -
> - get_page(vmf->page);
> - vmf->page->mapping = vmf->vma->vm_file->f_mapping;
> - vmf->page->index = vmf->pgoff;
> -
> - ret = 0;
> -unlock:
> - rcu_read_unlock();
> -
> - return ret;
> -}
> -
> static void ring_buffer_attach(struct perf_event *event,
> struct perf_buffer *rb)
> {
> @@ -6558,13 +6523,47 @@ static void perf_mmap_close(struct vm_area_struct *vma)
> ring_buffer_put(rb); /* could be last */
> }
>
> +static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
> +{
> + /* The first page is the user control page, others are read-only. */
> + return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
> +}
> +
> static const struct vm_operations_struct perf_mmap_vmops = {
> .open = perf_mmap_open,
> .close = perf_mmap_close, /* non mergeable */
> - .fault = perf_mmap_fault,
> - .page_mkwrite = perf_mmap_fault,
> + .pfn_mkwrite = perf_mmap_pfn_mkwrite,
> };
>
> +static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
> +{
> + unsigned long nr_pages = vma_pages(vma);
> + int err = 0;
> + unsigned long pgoff;
> +
> + for (pgoff = 0; pgoff < nr_pages; pgoff++) {
> + unsigned long va = vma->vm_start + PAGE_SIZE * pgoff;
> + struct page *page = perf_mmap_to_page(rb, pgoff);
> +
> + if (page == NULL) {
> + err = -EINVAL;
> + break;
> + }
> +
> + /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
> + err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
> + vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
> + if (err)
> + break;
> + }
> +
Need a:
#ifdef CONFIG_MMU
> + /* Clear any partial mappings on error. */
> + if (err)
> + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
#endif
Here to work around the wonders of nommu :) - zap_page_range_single() isn't
available there.
> +
> + return err;
> +}
> +
> static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> {
> struct perf_event *event = file->private_data;
> @@ -6783,6 +6782,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
> vma->vm_ops = &perf_mmap_vmops;
>
> + if (!ret)
> + ret = map_range(rb, vma);
> +
> if (event->pmu->event_mapped)
> event->pmu->event_mapped(event, vma->vm_mm);
>
> diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
> index 4f46f688d0d4..180509132d4b 100644
> --- a/kernel/events/ring_buffer.c
> +++ b/kernel/events/ring_buffer.c
> @@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
> struct page *page = virt_to_page(rb->aux_pages[idx]);
>
> ClearPagePrivate(page);
> - page->mapping = NULL;
> __free_page(page);
> }
>
> @@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr)
> {
> struct page *page = virt_to_page(addr);
>
> - page->mapping = NULL;
> __free_page(page);
> }
>
> @@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
> return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
> }
>
> -static void perf_mmap_unmark_page(void *addr)
> -{
> - struct page *page = vmalloc_to_page(addr);
> -
> - page->mapping = NULL;
> -}
> -
> static void rb_free_work(struct work_struct *work)
> {
> struct perf_buffer *rb;
> - void *base;
> - int i, nr;
>
> rb = container_of(work, struct perf_buffer, work);
> - nr = data_page_nr(rb);
> -
> - base = rb->user_page;
> - /* The '<=' counts in the user page. */
> - for (i = 0; i <= nr; i++)
> - perf_mmap_unmark_page(base + (i * PAGE_SIZE));
>
> - vfree(base);
> + vfree(rb->user_page);
> kfree(rb);
> }
>
> --
> 2.47.0
>