Message-ID: <8f1f86df-f3b9-4e6d-832c-5c4578c68285@lucifer.local>
Date: Fri, 9 Aug 2024 18:20:42 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: "Liam R. Howlett" <Liam.Howlett@...cle.com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org,
Andrew Morton <akpm@...ux-foundation.org>,
Vlastimil Babka <vbabka@...e.cz>
Subject: Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs
On Fri, Aug 09, 2024 at 11:23:30AM GMT, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@...cle.com> [240805 08:14]:
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs to, rather than ultimately
> > use vma_merge(), rather use vma_expand().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases
> > avoiding confusion, bugs and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> > ---
> > mm/mmap.c | 79 ++---
> > mm/vma.c | 482 +++++++++++++++++++------------
> > mm/vma.h | 51 +---
> > tools/testing/vma/vma_internal.h | 6 +
> > 4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > {
> > struct mm_struct *mm = current->mm;
> > struct vm_area_struct *vma = NULL;
> > - struct vm_area_struct *next, *prev, *merge;
> > - pgoff_t pglen = len >> PAGE_SHIFT;
> > + struct vm_area_struct *merge;
> > unsigned long charged = 0;
> > unsigned long end = addr + len;
> > bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > vm_flags |= VM_ACCOUNT;
> > }
> >
> > - next = vmg.next = vma_next(&vmi);
> > - prev = vmg.prev = vma_prev(&vmi);
> > - if (vm_flags & VM_SPECIAL) {
> > - if (prev)
> > - vma_iter_next_range(&vmi);
> > - goto cannot_expand;
> > - }
> > -
> > - /* Attempt to expand an old mapping */
> > - /* Check next */
> > - if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > - /* We can adjust this as can_vma_merge_after() doesn't touch */
> > - vmg.end = next->vm_end;
> > - vma = vmg.vma = next;
> > - vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > - /* We may merge our NULL anon_vma with non-NULL in next. */
> > - vmg.anon_vma = vma->anon_vma;
> > - }
> > -
> > - /* Check prev */
> > - if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > - vmg.start = prev->vm_start;
> > - vma = vmg.vma = prev;
> > - vmg.pgoff = prev->vm_pgoff;
> > - } else if (prev) {
> > - vma_iter_next_range(&vmi);
> > - }
> > -
> > - /* Actually expand, if possible */
> > - if (vma && !vma_expand(&vmg)) {
> > - khugepaged_enter_vma(vma, vm_flags);
> > + vma = vma_merge_new_vma(&vmg);
> > + if (vma)
> > goto expanded;
> > - }
> > -
> > - if (vma == prev)
> > - vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> > /*
> > * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > * If vm_flags changed after call_mmap(), we should try merge
> > * vma again as we may succeed this time.
> > */
> > - if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > - merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > - vma->vm_start, vma->vm_end,
> > - vma->vm_pgoff);
> > + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > + merge = vma_merge_new_vma(&vmg);
> > +
> > if (merge) {
> > /*
> > * ->mmap() can change vma->vm_file and fput
> > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> > vma_iter_set(&vmi, vma->vm_end);
> > /* Undo any partial mapping done by a device driver. */
> > - unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> > + unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> > vma->vm_end, vma->vm_end, true);
> > }
> > if (writable_file_mapping)
> > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > unsigned long addr, unsigned long len, unsigned long flags)
> > {
> > struct mm_struct *mm = current->mm;
> > - struct vma_prepare vp;
> >
> > /*
> > * Check against address space limits by the changed size
> > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > */
> > if (vma && vma->vm_end == addr) {
> > struct vma_merge_struct vmg = {
> > + .vmi = vmi,
> > .prev = vma,
> > + .next = NULL,
> > + .start = addr,
> > + .end = addr + len,
> > .flags = flags,
> > .pgoff = addr >> PAGE_SHIFT,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > };
> >
> > - if (can_vma_merge_after(&vmg)) {
> > - vma_iter_config(vmi, vma->vm_start, addr + len);
> > - if (vma_iter_prealloc(vmi, vma))
> > - goto unacct_fail;
> > -
> > - vma_start_write(vma);
> > -
> > - init_vma_prep(&vp, vma);
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > - vma->vm_end = addr + len;
> > - vm_flags_set(vma, VM_SOFTDIRTY);
> > - vma_iter_store(vmi, vma);
> > -
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(vma, flags);
> > + if (vma_merge_new_vma(&vmg))
> > goto out;
>
> This is very convoluted to follow. It seems vma_merge_new_vma() will do
> what is necessary by finding out that it can merge after, then call
> vma_expand() which calls commit merge(), which sets the iterator to
> vmg->start, but vmg->start isn't set to vma->vm_start, it is set to addr
> here.. it's actually set to prev->vm_start in vma_merge_new_vma().
Sorry, it's kind of hard to make a change like this particularly easy to
follow.
The only extra checks before it tests mergeability are prev - which we set
to vma, so it is not NULL (except in the case of the first VMA, which is
wasteful, but a one-off) - and the is_special and next checks.
So this isn't _hugely_ terrible.
As to the vmi positioning... I thought there might be some things we could
improve here :)
However, we set prev == vma here, so vmg->start = vma->vm_start and vmg->end =
addr + len, which is the same as before, right?
I do notice that we've incorrectly removed the vm_flags_set(VM_SOFTDIRTY)
though... will add that back in. Ugh.
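Something like the below is what I have in mind for restoring it - just a
sketch against this patch, untested:

	if (vma_merge_new_vma(&vmg)) {
		/* Restore the vm_flags_set() this patch mistakenly dropped. */
		vm_flags_set(vma, VM_SOFTDIRTY);
		goto out;
	}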
Again, so frustrating to not have these functions testable. I'd like to
find a way to move things around if possible at some point. But if we're
worried about call stack depth that may not be feasible.
>
> This is getting really hard to trace what happens. I'm also concerned
> that the overhead of following all these checks will cost performance on
> the brk system call?
I'll take a look at the mm-tests results.
>
> Maybe we can have a way to set up the vmg and call the right function to
> just make the above happen? We know with the can_vma_merge_after() that
> it is going to work, so could we just call vma_start_write() and
> commit_merge()?
I'm happy to add an enum or something to set a specific mode if we want,
but maybe it's worth looking at the scalability results first to see if
there's really a regression?
I mean, from our discussions on IRC it sounds like this is very possible, so
we could figure something out.
>
> Also, vma_merge_new_vma() could fail because it's out of memory so it
> should goto unacct_fail.. but we now don't know if it's because the
> merge wasn't allowed or if we are out of memory..
Yes, this is a mistake, damn it. Will fix. Grumble about the untestability of
these functions x2.
As per your comment below, I think the simplest way may be to have an error
or outcome field or some such that we can check to see _why_ things failed.
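Roughly something like this, perhaps - the names and shape are entirely
hypothetical, just to illustrate the idea:

	/* Hypothetical: let callers see why a merge did not happen. */
	enum vma_merge_error {
		VMA_MERGE_ERROR_NONE,	/* merged, or simply not mergeable */
		VMA_MERGE_ERROR_NOMEM,	/* e.g. maple tree preallocation failed */
	};

	/* New field in struct vma_merge_struct. */
	enum vma_merge_error error;

Then do_brk_flags() could do:

	if (vma_merge_new_vma(&vmg))
		goto out;
	if (vmg.error == VMA_MERGE_ERROR_NOMEM)
		goto unacct_fail;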
>
> > - }
> > }
> >
> > if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 55615392e8d2..a404cf718f9e 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> > *
> > * We assume the vma may be removed as part of the merge.
> > */
> > -bool
> > -can_vma_merge_before(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> > {
> > pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> >
> > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> > *
> > * We assume that vma is not removed as part of the merge.
> > */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > {
> > if (is_mergeable_vma(vmg, false) &&
> > is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > return false;
> > }
> >
> > +static void __vma_link_file(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_allow_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_insert(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * Requires inode->i_mapping->i_mmap_rwsem
> > + */
> > +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_unmap_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_remove(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > + * @vp: The initialized vma_prepare struct
> > + */
> > +static void vma_prepare(struct vma_prepare *vp)
> > +{
> > + if (vp->file) {
> > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > +
> > + if (vp->adj_next)
> > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > + vp->adj_next->vm_end);
> > +
> > + i_mmap_lock_write(vp->mapping);
> > + if (vp->insert && vp->insert->vm_file) {
> > + /*
> > + * Put into interval tree now, so instantiated pages
> > + * are visible to arm/parisc __flush_dcache_page
> > + * throughout; but we cannot insert into address
> > + * space until vma start or end is updated.
> > + */
> > + __vma_link_file(vp->insert,
> > + vp->insert->vm_file->f_mapping);
> > + }
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_lock_write(vp->anon_vma);
> > + anon_vma_interval_tree_pre_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > + }
> > +
> > + if (vp->file) {
> > + flush_dcache_mmap_lock(vp->mapping);
> > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > + if (vp->adj_next)
> > + vma_interval_tree_remove(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + }
> > +
> > +}
> > +
> > +/*
> > + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > + * or for inserting a VMA.
> > + *
> > + * @vp: The vma_prepare struct
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + */
> > +static void vma_complete(struct vma_prepare *vp,
> > + struct vma_iterator *vmi, struct mm_struct *mm)
> > +{
> > + if (vp->file) {
> > + if (vp->adj_next)
> > + vma_interval_tree_insert(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > + flush_dcache_mmap_unlock(vp->mapping);
> > + }
> > +
> > + if (vp->remove && vp->file) {
> > + __remove_shared_vm_struct(vp->remove, vp->mapping);
> > + if (vp->remove2)
> > + __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > + } else if (vp->insert) {
> > + /*
> > + * split_vma has split insert from vma, and needs
> > + * us to insert it before dropping the locks
> > + * (it may either follow vma or precede it).
> > + */
> > + vma_iter_store(vmi, vp->insert);
> > + mm->map_count++;
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_interval_tree_post_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > + anon_vma_unlock_write(vp->anon_vma);
> > + }
> > +
> > + if (vp->file) {
> > + i_mmap_unlock_write(vp->mapping);
> > + uprobe_mmap(vp->vma);
> > +
> > + if (vp->adj_next)
> > + uprobe_mmap(vp->adj_next);
> > + }
> > +
> > + if (vp->remove) {
> > +again:
> > + vma_mark_detached(vp->remove, true);
> > + if (vp->file) {
> > + uprobe_munmap(vp->remove, vp->remove->vm_start,
> > + vp->remove->vm_end);
> > + fput(vp->file);
> > + }
> > + if (vp->remove->anon_vma)
> > + anon_vma_merge(vp->vma, vp->remove);
> > + mm->map_count--;
> > + mpol_put(vma_policy(vp->remove));
> > + if (!vp->remove2)
> > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > + vm_area_free(vp->remove);
> > +
> > + /*
> > + * In mprotect's case 6 (see comments on vma_merge),
> > + * we are removing both mid and next vmas
> > + */
> > + if (vp->remove2) {
> > + vp->remove = vp->remove2;
> > + vp->remove2 = NULL;
> > + goto again;
> > + }
> > + }
> > + if (vp->insert && vp->file)
> > + uprobe_mmap(vp->insert);
> > + validate_mm(mm);
> > +}
> > +
> > +/*
> > + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + */
> > +static void init_vma_prep(struct vma_prepare *vp,
> > + struct vm_area_struct *vma)
> > +{
> > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > +}
> > +
> > /*
> > * Close a vm structure and free it.
> > */
> > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> > vm_unacct_memory(nr_accounted);
> > }
> >
> > -/*
> > - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - */
> > -void init_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma)
> > -{
> > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > -}
> > -
> > -/*
> > - * Requires inode->i_mapping->i_mmap_rwsem
> > - */
> > -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_unmap_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_remove(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > /*
> > * vma has some anon_vma assigned, and is already inserted on that
> > * anon_vma's interval trees.
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> > anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> > }
> >
> > -static void __vma_link_file(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_allow_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_insert(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -/*
> > - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > - * @vp: The initialized vma_prepare struct
> > - */
> > -void vma_prepare(struct vma_prepare *vp)
> > -{
> > - if (vp->file) {
> > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > -
> > - if (vp->adj_next)
> > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > - vp->adj_next->vm_end);
> > -
> > - i_mmap_lock_write(vp->mapping);
> > - if (vp->insert && vp->insert->vm_file) {
> > - /*
> > - * Put into interval tree now, so instantiated pages
> > - * are visible to arm/parisc __flush_dcache_page
> > - * throughout; but we cannot insert into address
> > - * space until vma start or end is updated.
> > - */
> > - __vma_link_file(vp->insert,
> > - vp->insert->vm_file->f_mapping);
> > - }
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_lock_write(vp->anon_vma);
> > - anon_vma_interval_tree_pre_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > - }
> > -
> > - if (vp->file) {
> > - flush_dcache_mmap_lock(vp->mapping);
> > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > - if (vp->adj_next)
> > - vma_interval_tree_remove(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - }
> > -
> > -}
> > -
> > /*
> > * dup_anon_vma() - Helper function to duplicate anon_vma
> > * @dst: The destination VMA
> > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> > }
> > #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + * (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + * EXPAND that VMA:
> > + *
> > + * Proposed: |-----| or |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + * EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed: |-----|
> > + * Existing: |----| |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + * VMAs do not have identical attributes - NO MERGE POSSIBLE.
>
> We still have diagrams, that's too bad.
But they're cute ones! An upgrade, right?
>
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + * to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > + bool is_special = vmg->flags & VM_SPECIAL;
> > + struct vm_area_struct *prev = vmg->prev;
> > + struct vm_area_struct *next = vmg->next;
> > + unsigned long start = vmg->start;
> > + unsigned long end = vmg->end;
> > + pgoff_t pgoff = vmg->pgoff;
> > + pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > + VM_WARN_ON(vmg->vma);
> > +
> > + if (!prev && !next) {
> > + /*
> > + * Since the caller must have determined that the requested
> > + * range is empty, vmg->vmi will be left pointing at the VMA
> > + * immediately prior.
> > + */
> > + next = vmg->next = vma_next(vmg->vmi);
> > + prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > + /* Avoid maple tree re-walk. */
> > + if (is_special && prev)
> > + vma_iter_next_range(vmg->vmi);
> > + }
> > +
> > + /* If special mapping or no adjacent VMAs, nothing to merge. */
> > + if (is_special || (!prev && !next))
> > + return NULL;
> > +
> > + /* If we can merge with the following VMA, adjust vmg accordingly. */
> > + if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > + /*
> > + * We can adjust this here as can_vma_merge_after() doesn't
> > + * touch vmg->end.
> > + */
> > + vmg->end = next->vm_end;
> > + vmg->vma = next;
> > + vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > + vmg->anon_vma = next->anon_vma;
> > + }
> > +
> > + /* If we can merge with the previous VMA, adjust vmg accordingly. */
> > + if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > + vmg->start = prev->vm_start;
> > + vmg->vma = prev;
> > + vmg->pgoff = prev->vm_pgoff;
> > + } else if (prev) {
> > + vma_iter_next_range(vmg->vmi);
> > + }
> > +
> > + /*
> > + * Now try to expand adjacent VMA(s). This takes care of removing the
> > + * following VMA if we have VMAs on both sides.
> > + */
> > + if (vmg->vma && !vma_expand(vmg)) {
> > + khugepaged_enter_vma(vmg->vma, vmg->flags);
> > + return vmg->vma;
> > + }
> > +
> > + /* If expansion failed, reset state. Allows us to retry merge later. */
> > + vmg->vma = NULL;
> > + vmg->anon_vma = NULL;
> > + vmg->start = start;
> > + vmg->end = end;
> > + vmg->pgoff = pgoff;
> > + if (vmg->vma == prev)
> > + vma_iter_set(vmg->vmi, start);
> > +
> > + return NULL;
> > +}
>
> Can we split this up a bit? I was thinking that, for the brk() case, we
> need to know if we can merge prev and if that merge fails. I was
> thinking something that you create a vmg with whatever, then call
> can_merge_prev, and that'd do the block above and change the vmg as
> required. We could have a can_merge_next that does the same, then we
> need to prepare the change (dup anon vma, preallocate for maple tree,
> locking, whatever), then commit.
Yeah, that's not a bad idea - that could actually really help clarify
what's going on.
Then we could have a sort of state machine that indicates we've already
adjusted the vmg parameters for the merge.
I'm thinking, though, of a vma_merge_new_vma() / vma_merge_modified_vma()
split that invokes different code to figure out how to expand.
I will have a fiddle around and see what I can figure out that makes sense.
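For the state-machine idea, I'm imagining something vaguely like this - the
field and flag names are purely hypothetical:

	/* Hypothetical: record which adjustments have been applied to vmg. */
	#define VMG_STATE_ADJUSTED_PREV	(1 << 0)
	#define VMG_STATE_ADJUSTED_NEXT	(1 << 1)

	/* New field in struct vma_merge_struct. */
	unsigned long state;

That way the smaller helpers you suggest below could set these bits, and a
later prepare/commit step would know exactly what has already been adjusted
rather than re-deriving it.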
>
> There could still be the function above, but with smaller widgets to do
> what we need so we gain flexibility in what we decide to check - prev
> only in brk().
>
> I'm not sure if we'd need one for expanding vs existing or if we could
> check !vmg->vma to figure that out..
>
> This would also have the effect of self-documenting what is going on.
> For brk, it would look like this:
>
> if (vmg_expand_prev()) {
> if (vmg_prepare())
> goto no_mem;
> vmg_commit();
> }
>
> I think this would change your exposed interface, at least for brk() -
> or a wrapper for this, but small widgets may gain us some
> self-documented code?
>
> If you really don't like the exposure of the interface, the vmg could
> have a return so we can see if we ran out of memory?
I really don't like can_vma_merge_xxx() being exposed - it's very clearly an
internal interface.
As mentioned above, we can have some kind of way of passing back an error
code.
Obviously, if testing indicates stack size/perf is a problem, we can
begrudgingly accept the interface leak :'(. Will check that.
>
> > +
> > /*
> > * vma_expand - Expand an existing VMA
> > *
> > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> > * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
> > * vmg->next needs to be handled by the caller.
> > *
> > - * Returns: 0 on success
> > + * Returns: 0 on success.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> > + * - The caller must have set @vmg->prev and @vmg->next.
> > */
> > int vma_expand(struct vma_merge_struct *vmg)
> > {
> > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> > return 0;
> > }
> >
> > -/*
> > - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > - * or for inserting a VMA.
> > - *
> > - * @vp: The vma_prepare struct
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - */
> > -void vma_complete(struct vma_prepare *vp,
> > - struct vma_iterator *vmi, struct mm_struct *mm)
> > -{
> > - if (vp->file) {
> > - if (vp->adj_next)
> > - vma_interval_tree_insert(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > - flush_dcache_mmap_unlock(vp->mapping);
> > - }
> > -
> > - if (vp->remove && vp->file) {
> > - __remove_shared_vm_struct(vp->remove, vp->mapping);
> > - if (vp->remove2)
> > - __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > - } else if (vp->insert) {
> > - /*
> > - * split_vma has split insert from vma, and needs
> > - * us to insert it before dropping the locks
> > - * (it may either follow vma or precede it).
> > - */
> > - vma_iter_store(vmi, vp->insert);
> > - mm->map_count++;
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_interval_tree_post_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > - anon_vma_unlock_write(vp->anon_vma);
> > - }
> > -
> > - if (vp->file) {
> > - i_mmap_unlock_write(vp->mapping);
> > - uprobe_mmap(vp->vma);
> > -
> > - if (vp->adj_next)
> > - uprobe_mmap(vp->adj_next);
> > - }
> > -
> > - if (vp->remove) {
> > -again:
> > - vma_mark_detached(vp->remove, true);
> > - if (vp->file) {
> > - uprobe_munmap(vp->remove, vp->remove->vm_start,
> > - vp->remove->vm_end);
> > - fput(vp->file);
> > - }
> > - if (vp->remove->anon_vma)
> > - anon_vma_merge(vp->vma, vp->remove);
> > - mm->map_count--;
> > - mpol_put(vma_policy(vp->remove));
> > - if (!vp->remove2)
> > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > - vm_area_free(vp->remove);
> > -
> > - /*
> > - * In mprotect's case 6 (see comments on vma_merge),
> > - * we are removing both mid and next vmas
> > - */
> > - if (vp->remove2) {
> > - vp->remove = vp->remove2;
> > - vp->remove2 = NULL;
> > - goto again;
> > - }
> > - }
> > - if (vp->insert && vp->file)
> > - uprobe_mmap(vp->insert);
> > - validate_mm(mm);
> > -}
> > -
> > /*
> > * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> > * @vmi: The vma iterator
> > @@ -1261,20 +1378,6 @@ struct vm_area_struct
> > return vma_modify(&vmg);
> > }
> >
> > -/*
> > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > - * must ensure that [start, end) does not overlap any existing VMA.
> > - */
> > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > -{
> > - if (!vmg->prev) {
> > - vmg->prev = vma_prev(vmg->vmi);
> > - vma_iter_set(vmg->vmi, vmg->start);
> > - }
> > -
> > - return vma_merge(vmg);
> > -}
> > -
> > /*
> > * Expand vma by delta bytes, potentially merging with an immediately adjacent
> > * VMA with identical properties.
> > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > .anon_name = anon_vma_name(vma),
> > };
> >
> > - /* vma is specified as prev, so case 1 or 2 will apply. */
> > - return vma_merge(&vmg);
> > + return vma_merge_new_vma(&vmg);
> > }
> >
> > void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > struct vm_area_struct *vma = *vmap;
> > unsigned long vma_start = vma->vm_start;
> > struct mm_struct *mm = vma->vm_mm;
> > - struct vm_area_struct *new_vma, *prev;
> > + struct vm_area_struct *new_vma;
> > bool faulted_in_anon_vma = true;
> > VMA_ITERATOR(vmi, mm, addr);
> > + struct vma_merge_struct vmg = {
> > + .vmi = &vmi,
> > + .start = addr,
> > + .end = addr + len,
> > + .flags = vma->vm_flags,
> > + .pgoff = pgoff,
> > + .file = vma->vm_file,
> > + .anon_vma = vma->anon_vma,
> > + .policy = vma_policy(vma),
> > + .uffd_ctx = vma->vm_userfaultfd_ctx,
> > + .anon_name = anon_vma_name(vma),
> > + };
> >
> > /*
> > * If anonymous vma has not yet been faulted, update new pgoff
> > * to match new location, to increase its chance of merging.
> > */
> > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > - pgoff = addr >> PAGE_SHIFT;
> > + pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> > faulted_in_anon_vma = false;
> > }
> >
> > - new_vma = find_vma_prev(mm, addr, &prev);
> > + new_vma = find_vma_prev(mm, addr, &vmg.prev);
> > if (new_vma && new_vma->vm_start < addr + len)
> > return NULL; /* should never get here */
> >
> > - new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> > + vmg.next = vma_next(&vmi);
> > + vma_prev(&vmi);
> > +
> > + new_vma = vma_merge_new_vma(&vmg);
> > +
> > if (new_vma) {
> > /*
> > * Source vma may have been merged into new_vma
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 50459f9e4c7f..bbb173053f34 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> > /* Required for expand_downwards(). */
> > void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
> >
> > -/* Required for do_brk_flags(). */
> > -void vma_prepare(struct vma_prepare *vp);
> > -
> > -/* Required for do_brk_flags(). */
> > -void init_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma);
> > -
> > -/* Required for do_brk_flags(). */
> > -void vma_complete(struct vma_prepare *vp,
> > - struct vma_iterator *vmi, struct mm_struct *mm);
> > -
> > int vma_expand(struct vma_merge_struct *vmg);
> > int vma_shrink(struct vma_merge_struct *vmg);
> >
> > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > struct vm_area_struct *next, unsigned long start,
> > unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/*
> > - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > - *
> > - * Required by mmap_region().
> > - */
> > -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > -
> > -/*
> > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > - *
> > - * Required by mmap_region() and do_brk_flags().
> > - */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> > -
> > /* We are about to modify the VMA's flags. */
> > struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> > struct vm_area_struct *prev,
> > @@ -133,31 +108,7 @@ struct vm_area_struct
> > unsigned long new_flags,
> > struct vm_userfaultfd_ctx new_ctx);
> >
> > -struct vm_area_struct
> > -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> > -
> > -/* Temporary convenience wrapper. */
> > -static inline struct vm_area_struct
> > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > - struct vm_area_struct *vma, unsigned long start,
> > - unsigned long end, pgoff_t pgoff)
> > -{
> > - struct vma_merge_struct vmg = {
> > - .vmi = vmi,
> > - .prev = prev,
> > - .start = start,
> > - .end = end,
> > - .flags = vma->vm_flags,
> > - .file = vma->vm_file,
> > - .anon_vma = vma->anon_vma,
> > - .pgoff = pgoff,
> > - .policy = vma_policy(vma),
> > - .uffd_ctx = vma->vm_userfaultfd_ctx,
> > - .anon_name = anon_vma_name(vma),
> > - };
> > -
> > - return vma_merge_new_vma(&vmg);
> > -}
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
> >
> > /*
> > * Temporary wrapper around vma_merge() so we can have a common interface for
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 40797a819d3d..a39a734282d0 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> > mas_destroy(&vmi->mas);
> > }
> >
> > +static inline
> > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> > +{
> > + return mas_next_range(&vmi->mas, ULONG_MAX);
> > +}
> > +
> > static inline void vm_acct_memory(long pages)
> > {
> > }
> > --
> > 2.45.2
> >