Message-ID: <35d63781-ea17-4472-87ec-541b6c8e0ecf@lucifer.local>
Date: Thu, 11 Sep 2025 10:53:13 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: "Liam R. Howlett" <Liam.Howlett@...cle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, maple-tree@...ts.infradead.org,
linux-mm@...ck.org, linux-kernel@...r.kernel.org,
David Hildenbrand <david@...hat.com>, Vlastimil Babka <vbabka@...e.cz>,
Suren Baghdasaryan <surenb@...gle.com>, Michal Hocko <mhocko@...e.com>,
Jann Horn <jannh@...gle.com>, Pedro Falcato <pfalcato@...e.de>,
Charan Teja Kalla <quic_charante@...cinc.com>,
shikemeng@...weicloud.com, kasong@...cent.com, nphamcs@...il.com,
bhe@...hat.com, baohua@...nel.org, chrisl@...nel.org,
Matthew Wilcox <willy@...radead.org>
Subject: Re: [PATCH v1 8/9] mm/vma: Use unmap_desc in vms_clear_ptes() and
exit_mmap()
On Tue, Sep 09, 2025 at 03:09:44PM -0400, Liam R. Howlett wrote:
> vms_clear_ptes() is slightly different from other callers of
> unmap_region() and so had the unmapping open-coded. Using the new
> structure, it is now possible to special-case the struct setup instead
> of having the open-coded function.
>
> exit_mmap() also calls unmap_vmas() with many arguments. Using the
> unmap_all_init() function to set the unmap descriptor for all vmas makes
> this a bit easier to read.
>
> An update to the vma test code is necessary to ensure testing continues
> to function.
>
> No functional changes intended.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@...cle.com>
> ---
> include/linux/mm.h | 3 ---
> mm/internal.h | 3 +++
> mm/memory.c | 24 ++++++++------------
> mm/mmap.c | 5 +++-
> mm/vma.c | 39 ++++++++++++++++++--------------
> mm/vma.h | 14 ++++++++++++
> tools/testing/vma/vma_internal.h | 14 ++++--------
> 7 files changed, 56 insertions(+), 46 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 892fe5dbf9de0..23eb59d543390 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2450,9 +2450,6 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
> zap_page_range_single(vma, vma->vm_start,
> vma->vm_end - vma->vm_start, NULL);
> }
> -void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> - struct vm_area_struct *start_vma, unsigned long start,
> - unsigned long end, unsigned long tree_end, bool mm_wr_locked);
>
> struct mmu_notifier_range;
>
> diff --git a/mm/internal.h b/mm/internal.h
> index d295252407fee..1239944f2830a 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -197,6 +197,9 @@ static inline void vma_close(struct vm_area_struct *vma)
> }
> }
>
> +/* unmap_vmas is in mm/memory.c */
> +void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
> +
> #ifdef CONFIG_MMU
>
> /* Flags for folio_pte_batch(). */
> diff --git a/mm/memory.c b/mm/memory.c
> index 829cd94950182..8d4d976311037 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2084,12 +2084,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
> /**
> * unmap_vmas - unmap a range of memory covered by a list of vma's
> * @tlb: address of the caller's struct mmu_gather
> - * @mas: the maple state
> - * @vma: the starting vma
> - * @start_addr: virtual address at which to start unmapping
> - * @end_addr: virtual address at which to end unmapping
> - * @tree_end: The maximum index to check
> - * @mm_wr_locked: lock flag
> + * @unmap: The unmap_desc
> *
> * Unmap all pages in the vma list.
> *
> @@ -2102,11 +2097,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
> * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
> * drops the lock and schedules.
> */
> -void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> - struct vm_area_struct *vma, unsigned long start_addr,
> - unsigned long end_addr, unsigned long tree_end,
> - bool mm_wr_locked)
> +void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
> {
> + struct vm_area_struct *vma;
> struct mmu_notifier_range range;
> struct zap_details details = {
> .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
> @@ -2114,17 +2107,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> .even_cows = true,
> };
>
> + vma = unmap->first;
> mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
> - start_addr, end_addr);
> + unmap->vma_min, unmap->vma_max);
> mmu_notifier_invalidate_range_start(&range);
> do {
> - unsigned long start = start_addr;
> - unsigned long end = end_addr;
> + unsigned long start = unmap->vma_min;
> + unsigned long end = unmap->vma_max;
> hugetlb_zap_begin(vma, &start, &end);
> unmap_single_vma(tlb, vma, start, end, &details,
> - mm_wr_locked);
> + unmap->mm_wr_locked);
> hugetlb_zap_end(vma, &details);
> - vma = mas_find(mas, tree_end - 1);
> + vma = mas_find(unmap->mas, unmap->tree_max - 1);
An aside, but I do kinda see David's point about min, max & start, end & floor,
ceiling - perhaps the prefix_ is enough to differentiate these, and we could be
as consistent as we can be?
> } while (vma);
> mmu_notifier_invalidate_range_end(&range);
> }
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5c9bd3f20e53f..6011f62b0a294 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1280,10 +1280,12 @@ void exit_mmap(struct mm_struct *mm)
> struct vm_area_struct *vma;
> unsigned long nr_accounted = 0;
> VMA_ITERATOR(vmi, mm, 0);
> + struct unmap_desc unmap;
Perhaps:

	struct unmap_desc unmap = {
		.mm_wr_locked = false,
	};

To be clear? I mean, it does mean we're initialising some fields twice
though... but at least it avoids uninitialised state if we add new fields.

However, I see you're already setting mm_wr_locked to false in
unmap_all_init() anyway?
Ideally we'd have something like UNMAP_STATE_VMA() here, but ofc you don't
have the VMA yet.

So probably something like what you've done here is the way to go, but I'd
rather we initialise this var in case we add fields later.

Then again... you would have to make sure the _init() function sets all
fields anyway, so maybe it's not necessary. Hmm.
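
To spell out what I mean by initialising at declaration (a sketch only - it
leans on the C rule that a designated initialiser zeroes every member you
don't name):

	struct unmap_desc unmap = {
		.mm_wr_locked = false,	/* every field not named here is zeroed */
	};

	...

	unmap_all_init(&unmap, &vmi, vma);	/* then overwrites the lot anyway */

So any field added later can't be left holding stack garbage, even if
unmap_all_init() forgets to set it.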
>
> /* mm's last user has gone, and its about to be pulled down */
> mmu_notifier_release(mm);
>
> + unmap.mm_wr_locked = false;
> mmap_read_lock(mm);
> arch_exit_mmap(mm);
>
> @@ -1295,11 +1297,12 @@ void exit_mmap(struct mm_struct *mm)
> goto destroy;
> }
>
> + unmap_all_init(&unmap, &vmi, vma);
> flush_cache_mm(mm);
> tlb_gather_mmu_fullmm(&tlb, mm);
> /* update_hiwater_rss(mm) here? but nobody should be looking */
> /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
> - unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
> + unmap_vmas(&tlb, &unmap);
> mmap_read_unlock(mm);
>
> /*
> diff --git a/mm/vma.c b/mm/vma.c
> index c92384975cbb2..ad64cd9795ef3 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -481,8 +481,7 @@ void unmap_region(struct unmap_desc *desc)
>
> tlb_gather_mmu(&tlb, mm);
> update_hiwater_rss(mm);
> - unmap_vmas(&tlb, mas, desc->first, desc->vma_min, desc->vma_max,
> - desc->vma_max, desc->mm_wr_locked);
> + unmap_vmas(&tlb, desc);
Nit - perhaps I just didn't notice this on a previous commit, but you're being
inconsistent between naming this 'desc' and 'unmap'. Let's choose one and
stick with it.
> mas_set(mas, desc->tree_reset);
> free_pgtables(&tlb, mas, desc->first, desc->first_pgaddr,
> desc->last_pgaddr, desc->tree_max,
> @@ -1213,26 +1212,32 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> struct ma_state *mas_detach, bool mm_wr_locked)
> {
> - struct mmu_gather tlb;
> + struct unmap_desc unmap = {
> + .mas = mas_detach,
> + .first = vms->vma,
> + /* start and end may be different if there is no prev or next vma. */
> + .first_pgaddr = vms->unmap_start,
> + .last_pgaddr = vms->unmap_end,
> + .vma_min = vms->start,
> + .vma_max = vms->end,
> + /*
> + * The tree limits and reset differ from the normal case since it's a
> + * side-tree
> + */
> + .tree_reset = 1,
> + .tree_max = vms->vma_count,
> + /*
> + * We can free page tables without write-locking mmap_lock because VMAs
> + * were isolated before we downgraded mmap_lock.
> + */
> + .mm_wr_locked = mm_wr_locked,
These comments are great, thanks! :)
> + };
>
> if (!vms->clear_ptes) /* Nothing to do */
> return;
>
> - /*
> - * We can free page tables without write-locking mmap_lock because VMAs
> - * were isolated before we downgraded mmap_lock.
> - */
> mas_set(mas_detach, 1);
> - tlb_gather_mmu(&tlb, vms->vma->vm_mm);
> - update_hiwater_rss(vms->vma->vm_mm);
> - unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
> - vms->vma_count, mm_wr_locked);
> -
> - mas_set(mas_detach, 1);
> - /* start and end may be different if there is no prev or next vma. */
> - free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
> - vms->unmap_end, vms->unmap_end, mm_wr_locked);
> - tlb_finish_mmu(&tlb);
> + unmap_region(&unmap);
Hmm, why are we removing all these? I'm guessing this is a separate
refactoring - could we perhaps break it out, as it doesn't seem to quite
belong in this patch?

I mean, this is really nice :) it's just that I think it belongs in another
patch, unless you feel it's really tied to this one/necessary here?

Sorry to be a pain... :)
> vms->clear_ptes = false;
> }
>
> diff --git a/mm/vma.h b/mm/vma.h
> index 4edd5d26ffcfc..8b55a0c73d097 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -164,6 +164,20 @@ struct unmap_desc {
> bool mm_wr_locked; /* If the mmap write lock is held */
> };
>
> +static inline void unmap_all_init(struct unmap_desc *desc,
> + struct vma_iterator *vmi, struct vm_area_struct *vma)
> +{
> + desc->mas = &vmi->mas;
> + desc->first = vma;
> + desc->first_pgaddr = FIRST_USER_ADDRESS;
> + desc->last_pgaddr = USER_PGTABLES_CEILING;
> + desc->vma_min = 0;
> + desc->vma_max = ULONG_MAX;
> + desc->tree_max = ULONG_MAX;
> + desc->tree_reset = vma->vm_end;
> + desc->mm_wr_locked = false;
> +}
> +
> #define UNMAP_REGION(name, _vmi, _vma, _vma_min, _vma_max, _prev, _next) \
> struct unmap_desc name = { \
> .mas = &(_vmi)->mas, \
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 823d379e1fac2..d73ad4747d40a 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -884,18 +884,12 @@ static inline void update_hiwater_vm(struct mm_struct *)
> {
> }
>
> -static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> - struct vm_area_struct *vma, unsigned long start_addr,
> - unsigned long end_addr, unsigned long tree_end,
> - bool mm_wr_locked)
> +struct unmap_desc;
> +
> +static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
> {
> (void)tlb;
> - (void)mas;
> - (void)vma;
> - (void)start_addr;
> - (void)end_addr;
> - (void)tree_end;
> - (void)mm_wr_locked;
> + (void)unmap;
> }
Hmm, why is this still here? I thought I'd removed all these (void)'s... I
think actually that's what's in Vlasta's tree, so this will conflict
unfortunately.

Anyway, please just remove all these (void)'s, they're unnecessary.

(Also I'm sorry for making updating this file a thing, but I'm not sure
there's a better way)
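
I.e. (a sketch only, assuming the test harness builds cleanly without the
casts) the stub can simply become:

	struct unmap_desc;

	/* Stub for the vma userland tests - parameters deliberately unused. */
	static inline void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap)
	{
	}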
>
> static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> --
> 2.47.2
>