lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8f1f86df-f3b9-4e6d-832c-5c4578c68285@lucifer.local>
Date: Fri, 9 Aug 2024 18:20:42 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: "Liam R. Howlett" <Liam.Howlett@...cle.com>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org,
        Andrew Morton <akpm@...ux-foundation.org>,
        Vlastimil Babka <vbabka@...e.cz>
Subject: Re: [PATCH 07/10] mm: avoid using vma_merge() for new VMAs

On Fri, Aug 09, 2024 at 11:23:30AM GMT, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@...cle.com> [240805 08:14]:
> > In mmap_region() and do_brk_flags() we open code scenarios where we prefer
> > to use vma_expand() rather than invoke a full vma_merge() operation.
> >
> > Abstract this logic and eliminate all of the open-coding, and also use the
> > same logic for all cases where we add new VMAs to, rather than ultimately
> > use vma_merge(), rather use vma_expand().
> >
> > We implement this by replacing vma_merge_new_vma() with this newly
> > abstracted logic.
> >
> > Doing so removes duplication and simplifies VMA merging in all such cases,
> > laying the ground for us to eliminate the merging of new VMAs in
> > vma_merge() altogether.
> >
> > This makes it far easier to understand what is happening in these cases
> > avoiding confusion, bugs and allowing for future optimisation.
> >
> > As a result of this change we are also able to make vma_prepare(),
> > init_vma_prep(), vma_complete(), can_vma_merge_before() and
> > can_vma_merge_after() static and internal to vma.c.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> > ---
> >  mm/mmap.c                        |  79 ++---
> >  mm/vma.c                         | 482 +++++++++++++++++++------------
> >  mm/vma.h                         |  51 +---
> >  tools/testing/vma/vma_internal.h |   6 +
> >  4 files changed, 324 insertions(+), 294 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f6593a81f73d..c03f50f46396 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1363,8 +1363,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  {
> >  	struct mm_struct *mm = current->mm;
> >  	struct vm_area_struct *vma = NULL;
> > -	struct vm_area_struct *next, *prev, *merge;
> > -	pgoff_t pglen = len >> PAGE_SHIFT;
> > +	struct vm_area_struct *merge;
> >  	unsigned long charged = 0;
> >  	unsigned long end = addr + len;
> >  	bool writable_file_mapping = false;
> > @@ -1411,44 +1410,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		vm_flags |= VM_ACCOUNT;
> >  	}
> >
> > -	next = vmg.next = vma_next(&vmi);
> > -	prev = vmg.prev = vma_prev(&vmi);
> > -	if (vm_flags & VM_SPECIAL) {
> > -		if (prev)
> > -			vma_iter_next_range(&vmi);
> > -		goto cannot_expand;
> > -	}
> > -
> > -	/* Attempt to expand an old mapping */
> > -	/* Check next */
> > -	if (next && next->vm_start == end && can_vma_merge_before(&vmg)) {
> > -		/* We can adjust this as can_vma_merge_after() doesn't touch */
> > -		vmg.end = next->vm_end;
> > -		vma = vmg.vma = next;
> > -		vmg.pgoff = next->vm_pgoff - pglen;
> > -
> > -		/* We may merge our NULL anon_vma with non-NULL in next. */
> > -		vmg.anon_vma = vma->anon_vma;
> > -	}
> > -
> > -	/* Check prev */
> > -	if (prev && prev->vm_end == addr && can_vma_merge_after(&vmg)) {
> > -		vmg.start = prev->vm_start;
> > -		vma = vmg.vma = prev;
> > -		vmg.pgoff = prev->vm_pgoff;
> > -	} else if (prev) {
> > -		vma_iter_next_range(&vmi);
> > -	}
> > -
> > -	/* Actually expand, if possible */
> > -	if (vma && !vma_expand(&vmg)) {
> > -		khugepaged_enter_vma(vma, vm_flags);
> > +	vma = vma_merge_new_vma(&vmg);
> > +	if (vma)
> >  		goto expanded;
> > -	}
> > -
> > -	if (vma == prev)
> > -		vma_iter_set(&vmi, addr);
> > -cannot_expand:
> >
> >  	/*
> >  	 * Determine the object being mapped and call the appropriate
> > @@ -1493,10 +1457,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >  		 * If vm_flags changed after call_mmap(), we should try merge
> >  		 * vma again as we may succeed this time.
> >  		 */
> > -		if (unlikely(vm_flags != vma->vm_flags && prev)) {
> > -			merge = vma_merge_new_vma_wrapper(&vmi, prev, vma,
> > -							  vma->vm_start, vma->vm_end,
> > -							  vma->vm_pgoff);
> > +		if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > +			merge = vma_merge_new_vma(&vmg);
> > +
> >  			if (merge) {
> >  				/*
> >  				 * ->mmap() can change vma->vm_file and fput
> > @@ -1596,7 +1559,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> >  		vma_iter_set(&vmi, vma->vm_end);
> >  		/* Undo any partial mapping done by a device driver. */
> > -		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
> > +		unmap_region(mm, &vmi.mas, vma, vmg.prev, vmg.next, vma->vm_start,
> >  			     vma->vm_end, vma->vm_end, true);
> >  	}
> >  	if (writable_file_mapping)
> > @@ -1773,7 +1736,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  		unsigned long addr, unsigned long len, unsigned long flags)
> >  {
> >  	struct mm_struct *mm = current->mm;
> > -	struct vma_prepare vp;
> >
> >  	/*
> >  	 * Check against address space limits by the changed size
> > @@ -1795,29 +1757,22 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >  	 */
> >  	if (vma && vma->vm_end == addr) {
> >  		struct vma_merge_struct vmg = {
> > +			.vmi = vmi,
> >  			.prev = vma,
> > +			.next = NULL,
> > +			.start = addr,
> > +			.end = addr + len,
> >  			.flags = flags,
> >  			.pgoff = addr >> PAGE_SHIFT,
> > +			.file = vma->vm_file,
> > +			.anon_vma = vma->anon_vma,
> > +			.policy = vma_policy(vma),
> > +			.uffd_ctx = vma->vm_userfaultfd_ctx,
> > +			.anon_name = anon_vma_name(vma),
> >  		};
> >
> > -		if (can_vma_merge_after(&vmg)) {
> > -			vma_iter_config(vmi, vma->vm_start, addr + len);
> > -			if (vma_iter_prealloc(vmi, vma))
> > -				goto unacct_fail;
> > -
> > -			vma_start_write(vma);
> > -
> > -			init_vma_prep(&vp, vma);
> > -			vma_prepare(&vp);
> > -			vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
> > -			vma->vm_end = addr + len;
> > -			vm_flags_set(vma, VM_SOFTDIRTY);
> > -			vma_iter_store(vmi, vma);
> > -
> > -			vma_complete(&vp, vmi, mm);
> > -			khugepaged_enter_vma(vma, flags);
> > +		if (vma_merge_new_vma(&vmg))
> >  			goto out;
>
> This is very convoluted to follow.  It seems vma_merge_new_vma() will do
> what is necessary by finding out that it can merge after, then call
> vma_expand() which calls commit merge(), which sets the iterator to
> vmg->start, but vmg->start isn't set to vma->vm_start, it is set to addr
> here.. it's actually set to prev->vm_start in vma_merge_new_vma().

Sorry, it's kind of hard to make a change like this all that lovely to
follow.

The only extra checks before it checks mergeability are prev - which we set
to vma, so is not NULL (except in the case of first vma, which is wasteful,
but a one-off) and an is_special and next check.

So this isn't _hugely_ terrible.

As to the vmi positioning... I thought there might be some things that we
could improve on this :)

However, we set prev == vma here, so vmg->start = vma->vm_start, vmg->end =
addr + len which is the same as before right?

I do notice that we've incorrectly removed the vm_flags_set(VM_SOFTDIRTY)
though... will add that back in. Ugh.

Again, so frustrating to not have these functions testable. I'd like to
find a way to move things around if possible at some point. But if we're
worried about call stack maybe not feasible.

>
> This is getting really hard to trace what happens.  I'm also concerned
> that the overhead of following all these checks will cost performance on
> the brk system call?

I'll take a look at mm-tests results.

>
> Maybe we can have a way to set up the vmg and call the right function to
> just make the above happen?  We know with the can_vma_merge_after() that
> it is going to work, so could we just call vma_start_write() and
> commit_merge()?

I'm happy to add an enum or something to set a specific mode if we want,
but maybe worth looking at scalability results first to see if there's
really a regression?

I mean from our discussions on irc, it sounds like this is very possible so
we could figure something out.

>
> Also, vma_merge_new_vma() could fail because it's out of memory so it
> should goto unacct_fail.. but we now don't know if it's because the
> merge wasn't allowed or if we are out of memory..

Yes this is a mistake, damn it. Will fix. Grumble about untestability of
these functions x2.

As per your comment below I think simplest way may be to have an error or
outcome field or some such that we can check to see _why_ things failed.

>
> > -		}
> >  	}
> >
> >  	if (vma)
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 55615392e8d2..a404cf718f9e 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -97,8 +97,7 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
> >   *
> >   * We assume the vma may be removed as part of the merge.
> >   */
> > -bool
> > -can_vma_merge_before(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_before(struct vma_merge_struct *vmg)
> >  {
> >  	pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
> >
> > @@ -120,7 +119,7 @@ can_vma_merge_before(struct vma_merge_struct *vmg)
> >   *
> >   * We assume that vma is not removed as part of the merge.
> >   */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg)
> > +static bool can_vma_merge_after(struct vma_merge_struct *vmg)
> >  {
> >  	if (is_mergeable_vma(vmg, false) &&
> >  	    is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
> > @@ -130,6 +129,164 @@ bool can_vma_merge_after(struct vma_merge_struct *vmg)
> >  	return false;
> >  }
> >
> > +static void __vma_link_file(struct vm_area_struct *vma,
> > +			    struct address_space *mapping)
> > +{
> > +	if (vma_is_shared_maywrite(vma))
> > +		mapping_allow_writable(mapping);
> > +
> > +	flush_dcache_mmap_lock(mapping);
> > +	vma_interval_tree_insert(vma, &mapping->i_mmap);
> > +	flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * Requires inode->i_mapping->i_mmap_rwsem
> > + */
> > +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > +				      struct address_space *mapping)
> > +{
> > +	if (vma_is_shared_maywrite(vma))
> > +		mapping_unmap_writable(mapping);
> > +
> > +	flush_dcache_mmap_lock(mapping);
> > +	vma_interval_tree_remove(vma, &mapping->i_mmap);
> > +	flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > + * @vp: The initialized vma_prepare struct
> > + */
> > +static void vma_prepare(struct vma_prepare *vp)
> > +{
> > +	if (vp->file) {
> > +		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > +
> > +		if (vp->adj_next)
> > +			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > +				      vp->adj_next->vm_end);
> > +
> > +		i_mmap_lock_write(vp->mapping);
> > +		if (vp->insert && vp->insert->vm_file) {
> > +			/*
> > +			 * Put into interval tree now, so instantiated pages
> > +			 * are visible to arm/parisc __flush_dcache_page
> > +			 * throughout; but we cannot insert into address
> > +			 * space until vma start or end is updated.
> > +			 */
> > +			__vma_link_file(vp->insert,
> > +					vp->insert->vm_file->f_mapping);
> > +		}
> > +	}
> > +
> > +	if (vp->anon_vma) {
> > +		anon_vma_lock_write(vp->anon_vma);
> > +		anon_vma_interval_tree_pre_update_vma(vp->vma);
> > +		if (vp->adj_next)
> > +			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > +	}
> > +
> > +	if (vp->file) {
> > +		flush_dcache_mmap_lock(vp->mapping);
> > +		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > +		if (vp->adj_next)
> > +			vma_interval_tree_remove(vp->adj_next,
> > +						 &vp->mapping->i_mmap);
> > +	}
> > +
> > +}
> > +
> > +/*
> > + * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > + * or for inserting a VMA.
> > + *
> > + * @vp: The vma_prepare struct
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + */
> > +static void vma_complete(struct vma_prepare *vp,
> > +			 struct vma_iterator *vmi, struct mm_struct *mm)
> > +{
> > +	if (vp->file) {
> > +		if (vp->adj_next)
> > +			vma_interval_tree_insert(vp->adj_next,
> > +						 &vp->mapping->i_mmap);
> > +		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > +		flush_dcache_mmap_unlock(vp->mapping);
> > +	}
> > +
> > +	if (vp->remove && vp->file) {
> > +		__remove_shared_vm_struct(vp->remove, vp->mapping);
> > +		if (vp->remove2)
> > +			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> > +	} else if (vp->insert) {
> > +		/*
> > +		 * split_vma has split insert from vma, and needs
> > +		 * us to insert it before dropping the locks
> > +		 * (it may either follow vma or precede it).
> > +		 */
> > +		vma_iter_store(vmi, vp->insert);
> > +		mm->map_count++;
> > +	}
> > +
> > +	if (vp->anon_vma) {
> > +		anon_vma_interval_tree_post_update_vma(vp->vma);
> > +		if (vp->adj_next)
> > +			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > +		anon_vma_unlock_write(vp->anon_vma);
> > +	}
> > +
> > +	if (vp->file) {
> > +		i_mmap_unlock_write(vp->mapping);
> > +		uprobe_mmap(vp->vma);
> > +
> > +		if (vp->adj_next)
> > +			uprobe_mmap(vp->adj_next);
> > +	}
> > +
> > +	if (vp->remove) {
> > +again:
> > +		vma_mark_detached(vp->remove, true);
> > +		if (vp->file) {
> > +			uprobe_munmap(vp->remove, vp->remove->vm_start,
> > +				      vp->remove->vm_end);
> > +			fput(vp->file);
> > +		}
> > +		if (vp->remove->anon_vma)
> > +			anon_vma_merge(vp->vma, vp->remove);
> > +		mm->map_count--;
> > +		mpol_put(vma_policy(vp->remove));
> > +		if (!vp->remove2)
> > +			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > +		vm_area_free(vp->remove);
> > +
> > +		/*
> > +		 * In mprotect's case 6 (see comments on vma_merge),
> > +		 * we are removing both mid and next vmas
> > +		 */
> > +		if (vp->remove2) {
> > +			vp->remove = vp->remove2;
> > +			vp->remove2 = NULL;
> > +			goto again;
> > +		}
> > +	}
> > +	if (vp->insert && vp->file)
> > +		uprobe_mmap(vp->insert);
> > +	validate_mm(mm);
> > +}
> > +
> > +/*
> > + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + */
> > +static void init_vma_prep(struct vma_prepare *vp,
> > +			  struct vm_area_struct *vma)
> > +{
> > +	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > +}
> > +
> >  /*
> >   * Close a vm structure and free it.
> >   */
> > @@ -292,31 +449,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> >  	vm_unacct_memory(nr_accounted);
> >  }
> >
> > -/*
> > - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - */
> > -void init_vma_prep(struct vma_prepare *vp,
> > -		   struct vm_area_struct *vma)
> > -{
> > -	init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > -}
> > -
> > -/*
> > - * Requires inode->i_mapping->i_mmap_rwsem
> > - */
> > -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > -				      struct address_space *mapping)
> > -{
> > -	if (vma_is_shared_maywrite(vma))
> > -		mapping_unmap_writable(mapping);
> > -
> > -	flush_dcache_mmap_lock(mapping);
> > -	vma_interval_tree_remove(vma, &mapping->i_mmap);
> > -	flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> >  /*
> >   * vma has some anon_vma assigned, and is already inserted on that
> >   * anon_vma's interval trees.
> > @@ -349,60 +481,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> >  		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> >  }
> >
> > -static void __vma_link_file(struct vm_area_struct *vma,
> > -			    struct address_space *mapping)
> > -{
> > -	if (vma_is_shared_maywrite(vma))
> > -		mapping_allow_writable(mapping);
> > -
> > -	flush_dcache_mmap_lock(mapping);
> > -	vma_interval_tree_insert(vma, &mapping->i_mmap);
> > -	flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -/*
> > - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > - * @vp: The initialized vma_prepare struct
> > - */
> > -void vma_prepare(struct vma_prepare *vp)
> > -{
> > -	if (vp->file) {
> > -		uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > -
> > -		if (vp->adj_next)
> > -			uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > -				      vp->adj_next->vm_end);
> > -
> > -		i_mmap_lock_write(vp->mapping);
> > -		if (vp->insert && vp->insert->vm_file) {
> > -			/*
> > -			 * Put into interval tree now, so instantiated pages
> > -			 * are visible to arm/parisc __flush_dcache_page
> > -			 * throughout; but we cannot insert into address
> > -			 * space until vma start or end is updated.
> > -			 */
> > -			__vma_link_file(vp->insert,
> > -					vp->insert->vm_file->f_mapping);
> > -		}
> > -	}
> > -
> > -	if (vp->anon_vma) {
> > -		anon_vma_lock_write(vp->anon_vma);
> > -		anon_vma_interval_tree_pre_update_vma(vp->vma);
> > -		if (vp->adj_next)
> > -			anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > -	}
> > -
> > -	if (vp->file) {
> > -		flush_dcache_mmap_lock(vp->mapping);
> > -		vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > -		if (vp->adj_next)
> > -			vma_interval_tree_remove(vp->adj_next,
> > -						 &vp->mapping->i_mmap);
> > -	}
> > -
> > -}
> > -
> >  /*
> >   * dup_anon_vma() - Helper function to duplicate anon_vma
> >   * @dst: The destination VMA
> > @@ -486,6 +564,120 @@ void validate_mm(struct mm_struct *mm)
> >  }
> >  #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> >
> > +/*
> > + * vma_merge_new_vma - Attempt to merge a new VMA into address space
> > + *
> > + * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
> > + *       (exclusive), which we try to merge with any adjacent VMAs if possible.
> > + *
> > + * We are about to add a VMA to the address space starting at @vmg->start and
> > + * ending at @vmg->end. There are three different possible scenarios:
> > + *
> > + * 1. There is a VMA with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
> > + *    EXPAND that VMA:
> > + *
> > + * Proposed:       |-----|  or  |-----|
> > + * Existing:  |----|                  |----|
> > + *
> > + * 2. There are VMAs with identical properties immediately adjacent to the
> > + *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
> > + *    EXPAND the former and REMOVE the latter:
> > + *
> > + * Proposed:       |-----|
> > + * Existing:  |----|     |----|
> > + *
> > + * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
> > + *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
>
> We still have diagrams, that's too bad.

But they're cute ones! Upgrade right?

>
> > + *
> > + * In instances where we can merge, this function returns the expanded VMA which
> > + * will have its range adjusted accordingly and the underlying maple tree also
> > + * adjusted.
> > + *
> > + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
> > + *          to the VMA we expanded.
> > + *
> > + * This function also adjusts @vmg to provide @vmg->prev and @vmg->next if
> > + * neither already specified, and adjusts [@vmg->start, @vmg->end) to span the
> > + * expanded range.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
> > + * - The caller must have determined that [@vmg->start, @vmg->end) is empty.
> > + */
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > +{
> > +	bool is_special = vmg->flags & VM_SPECIAL;
> > +	struct vm_area_struct *prev = vmg->prev;
> > +	struct vm_area_struct *next = vmg->next;
> > +	unsigned long start = vmg->start;
> > +	unsigned long end = vmg->end;
> > +	pgoff_t pgoff = vmg->pgoff;
> > +	pgoff_t pglen = PHYS_PFN(end - start);
> > +
> > +	VM_WARN_ON(vmg->vma);
> > +
> > +	if (!prev && !next) {
> > +		/*
> > +		 * Since the caller must have determined that the requested
> > +		 * range is empty, vmg->vmi will be left pointing at the VMA
> > +		 * immediately prior.
> > +		 */
> > +		next = vmg->next = vma_next(vmg->vmi);
> > +		prev = vmg->prev = vma_prev(vmg->vmi);
> > +
> > +		/* Avoid maple tree re-walk. */
> > +		if (is_special && prev)
> > +			vma_iter_next_range(vmg->vmi);
> > +	}
> > +
> > +	/* If special mapping or no adjacent VMAs, nothing to merge. */
> > +	if (is_special || (!prev && !next))
> > +		return NULL;
> > +
> > +	/* If we can merge with the following VMA, adjust vmg accordingly. */
> > +	if (next && next->vm_start == end && can_vma_merge_before(vmg)) {
> > +		/*
> > +		 * We can adjust this here as can_vma_merge_after() doesn't
> > +		 * touch vmg->end.
> > +		 */
> > +		vmg->end = next->vm_end;
> > +		vmg->vma = next;
> > +		vmg->pgoff = next->vm_pgoff - pglen;
> > +
> > +		vmg->anon_vma = next->anon_vma;
> > +	}
> > +
> > +	/* If we can merge with the previous VMA, adjust vmg accordingly. */
> > +	if (prev && prev->vm_end == start && can_vma_merge_after(vmg)) {
> > +		vmg->start = prev->vm_start;
> > +		vmg->vma = prev;
> > +		vmg->pgoff = prev->vm_pgoff;
> > +	} else if (prev) {
> > +		vma_iter_next_range(vmg->vmi);
> > +	}
> > +
> > +	/*
> > +	 * Now try to expand adjacent VMA(s). This takes care of removing the
> > +	 * following VMA if we have VMAs on both sides.
> > +	 */
> > +	if (vmg->vma && !vma_expand(vmg)) {
> > +		khugepaged_enter_vma(vmg->vma, vmg->flags);
> > +		return vmg->vma;
> > +	}
> > +
> > +	/* If expansion failed, reset state. Allows us to retry merge later. */
> > +	vmg->vma = NULL;
> > +	vmg->anon_vma = NULL;
> > +	vmg->start = start;
> > +	vmg->end = end;
> > +	vmg->pgoff = pgoff;
> > +	if (vmg->vma == prev)
> > +		vma_iter_set(vmg->vmi, start);
> > +
> > +	return NULL;
> > +}
>
> Can we split this up a bit?  I was thinking that, for the brk() case, we
> need to know if we can merge prev and if that merge fails.  I was
> thinking something that you create a vmg with whatever, then call
> can_merge_prev, and that'd do the block above and change the vmg as
> required.  We could have a can_merge_next that does the same, then we
> need to prepare the change (dup anon vma, preallocate for maple tree,
> locking, whatever), then commit.

Yeah that's not a bad idea, that could actually help really help clarify
what's going on.

Then could have a sort of state machine that indicates that we've already
adjusted vmg parameters for the merge.

I'm thinking though of a vma_merge_new_vma() / vma_merge_modified_vma()
that invokes different code to figure out how to expand.

I will have a fiddle around and see what I can figure out that makes sense.

>
> There could still be the function above, but with smaller widgets to do
> what we need so we gain flexibility in what we decide to check - prev
> only in brk().
>
> I'm not sure if we'd need one for expanding vs existing or if we could
> check !vmg->vma to figure that out..
>
> This would also have the effect of self-documenting what is going on.
> For brk, it would look like this:
>
> if (vmg_expand_prev()) {
> 	if (vmg_prepare())
> 		goto no_mem;
> 	vmg_commit();
> }
>
> I think this would change your exposed interface, at least for brk() -
> or a wrapper for this, but small widgets may gain us some
> self-documented code?
>
> If you really don't like the exposure of the interface, the vmg could
> have a return so we can see if we ran out of memory?

I really don't like can_vma_merge_xxx() being exposed, it's very clearly an
internal interface.

As mentioned above we can have some kind of way of passing back an error
code.

Obviously if testing indicates stack size/perf is a problem we can
begrudgingly accept the interface leak :'(. Will check that.

>
> > +
> >  /*
> >   * vma_expand - Expand an existing VMA
> >   *
> > @@ -496,7 +688,11 @@ void validate_mm(struct mm_struct *mm)
> >   * vmg->next->vm_end.  Checking if the vmg->vma can expand and merge with
> >   * vmg->next needs to be handled by the caller.
> >   *
> > - * Returns: 0 on success
> > + * Returns: 0 on success.
> > + *
> > + * ASSUMPTIONS:
> > + * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
> > + * - The caller must have set @vmg->prev and @vmg->next.
> >   */
> >  int vma_expand(struct vma_merge_struct *vmg)
> >  {
> > @@ -576,85 +772,6 @@ int vma_shrink(struct vma_merge_struct *vmg)
> >  	return 0;
> >  }
> >
> > -/*
> > - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > - * or for inserting a VMA.
> > - *
> > - * @vp: The vma_prepare struct
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - */
> > -void vma_complete(struct vma_prepare *vp,
> > -		  struct vma_iterator *vmi, struct mm_struct *mm)
> > -{
> > -	if (vp->file) {
> > -		if (vp->adj_next)
> > -			vma_interval_tree_insert(vp->adj_next,
> > -						 &vp->mapping->i_mmap);
> > -		vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > -		flush_dcache_mmap_unlock(vp->mapping);
> > -	}
> > -
> > -	if (vp->remove && vp->file) {
> > -		__remove_shared_vm_struct(vp->remove, vp->mapping);
> > -		if (vp->remove2)
> > -			__remove_shared_vm_struct(vp->remove2, vp->mapping);
> > -	} else if (vp->insert) {
> > -		/*
> > -		 * split_vma has split insert from vma, and needs
> > -		 * us to insert it before dropping the locks
> > -		 * (it may either follow vma or precede it).
> > -		 */
> > -		vma_iter_store(vmi, vp->insert);
> > -		mm->map_count++;
> > -	}
> > -
> > -	if (vp->anon_vma) {
> > -		anon_vma_interval_tree_post_update_vma(vp->vma);
> > -		if (vp->adj_next)
> > -			anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > -		anon_vma_unlock_write(vp->anon_vma);
> > -	}
> > -
> > -	if (vp->file) {
> > -		i_mmap_unlock_write(vp->mapping);
> > -		uprobe_mmap(vp->vma);
> > -
> > -		if (vp->adj_next)
> > -			uprobe_mmap(vp->adj_next);
> > -	}
> > -
> > -	if (vp->remove) {
> > -again:
> > -		vma_mark_detached(vp->remove, true);
> > -		if (vp->file) {
> > -			uprobe_munmap(vp->remove, vp->remove->vm_start,
> > -				      vp->remove->vm_end);
> > -			fput(vp->file);
> > -		}
> > -		if (vp->remove->anon_vma)
> > -			anon_vma_merge(vp->vma, vp->remove);
> > -		mm->map_count--;
> > -		mpol_put(vma_policy(vp->remove));
> > -		if (!vp->remove2)
> > -			WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > -		vm_area_free(vp->remove);
> > -
> > -		/*
> > -		 * In mprotect's case 6 (see comments on vma_merge),
> > -		 * we are removing both mid and next vmas
> > -		 */
> > -		if (vp->remove2) {
> > -			vp->remove = vp->remove2;
> > -			vp->remove2 = NULL;
> > -			goto again;
> > -		}
> > -	}
> > -	if (vp->insert && vp->file)
> > -		uprobe_mmap(vp->insert);
> > -	validate_mm(mm);
> > -}
> > -
> >  /*
> >   * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> >   * @vmi: The vma iterator
> > @@ -1261,20 +1378,6 @@ struct vm_area_struct
> >  	return vma_modify(&vmg);
> >  }
> >
> > -/*
> > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > - * must ensure that [start, end) does not overlap any existing VMA.
> > - */
> > -struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg)
> > -{
> > -	if (!vmg->prev) {
> > -		vmg->prev = vma_prev(vmg->vmi);
> > -		vma_iter_set(vmg->vmi, vmg->start);
> > -	}
> > -
> > -	return vma_merge(vmg);
> > -}
> > -
> >  /*
> >   * Expand vma by delta bytes, potentially merging with an immediately adjacent
> >   * VMA with identical properties.
> > @@ -1297,8 +1400,7 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> >  		.anon_name = anon_vma_name(vma),
> >  	};
> >
> > -	/* vma is specified as prev, so case 1 or 2 will apply. */
> > -	return vma_merge(&vmg);
> > +	return vma_merge_new_vma(&vmg);
> >  }
> >
> >  void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > @@ -1399,24 +1501,40 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> >  	struct vm_area_struct *vma = *vmap;
> >  	unsigned long vma_start = vma->vm_start;
> >  	struct mm_struct *mm = vma->vm_mm;
> > -	struct vm_area_struct *new_vma, *prev;
> > +	struct vm_area_struct *new_vma;
> >  	bool faulted_in_anon_vma = true;
> >  	VMA_ITERATOR(vmi, mm, addr);
> > +	struct vma_merge_struct vmg = {
> > +		.vmi = &vmi,
> > +		.start = addr,
> > +		.end = addr + len,
> > +		.flags = vma->vm_flags,
> > +		.pgoff = pgoff,
> > +		.file = vma->vm_file,
> > +		.anon_vma = vma->anon_vma,
> > +		.policy = vma_policy(vma),
> > +		.uffd_ctx = vma->vm_userfaultfd_ctx,
> > +		.anon_name = anon_vma_name(vma),
> > +	};
> >
> >  	/*
> >  	 * If anonymous vma has not yet been faulted, update new pgoff
> >  	 * to match new location, to increase its chance of merging.
> >  	 */
> >  	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > -		pgoff = addr >> PAGE_SHIFT;
> > +		pgoff = vmg.pgoff = addr >> PAGE_SHIFT;
> >  		faulted_in_anon_vma = false;
> >  	}
> >
> > -	new_vma = find_vma_prev(mm, addr, &prev);
> > +	new_vma = find_vma_prev(mm, addr, &vmg.prev);
> >  	if (new_vma && new_vma->vm_start < addr + len)
> >  		return NULL;	/* should never get here */
> >
> > -	new_vma = vma_merge_new_vma_wrapper(&vmi, prev, vma, addr, addr + len, pgoff);
> > +	vmg.next = vma_next(&vmi);
> > +	vma_prev(&vmi);
> > +
> > +	new_vma = vma_merge_new_vma(&vmg);
> > +
> >  	if (new_vma) {
> >  		/*
> >  		 * Source vma may have been merged into new_vma
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 50459f9e4c7f..bbb173053f34 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -55,17 +55,6 @@ void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> >  /* Required for expand_downwards(). */
> >  void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
> >
> > -/* Required for do_brk_flags(). */
> > -void vma_prepare(struct vma_prepare *vp);
> > -
> > -/* Required for do_brk_flags(). */
> > -void init_vma_prep(struct vma_prepare *vp,
> > -		   struct vm_area_struct *vma);
> > -
> > -/* Required for do_brk_flags(). */
> > -void vma_complete(struct vma_prepare *vp,
> > -		  struct vma_iterator *vmi, struct mm_struct *mm);
> > -
> >  int vma_expand(struct vma_merge_struct *vmg);
> >  int vma_shrink(struct vma_merge_struct *vmg);
> >
> > @@ -85,20 +74,6 @@ void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> >  		struct vm_area_struct *next, unsigned long start,
> >  		unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> >
> > -/*
> > - * Can we merge the VMA described by vmg into the following VMA vmg->next?
> > - *
> > - * Required by mmap_region().
> > - */
> > -bool can_vma_merge_before(struct vma_merge_struct *vmg);
> > -
> > -/*
> > - * Can we merge the VMA described by vmg into the preceding VMA vmg->prev?
> > - *
> > - * Required by mmap_region() and do_brk_flags().
> > - */
> > -bool can_vma_merge_after(struct vma_merge_struct *vmg);
> > -
> >  /* We are about to modify the VMA's flags. */
> >  struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
> >  					struct vm_area_struct *prev,
> > @@ -133,31 +108,7 @@ struct vm_area_struct
> >  		       unsigned long new_flags,
> >  		       struct vm_userfaultfd_ctx new_ctx);
> >
> > -struct vm_area_struct
> > -*vma_merge_new_vma(struct vma_merge_struct *vmg);
> > -
> > -/* Temporary convenience wrapper. */
> > -static inline struct vm_area_struct
> > -*vma_merge_new_vma_wrapper(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > -			   struct vm_area_struct *vma, unsigned long start,
> > -			   unsigned long end, pgoff_t pgoff)
> > -{
> > -	struct vma_merge_struct vmg = {
> > -		.vmi = vmi,
> > -		.prev = prev,
> > -		.start = start,
> > -		.end = end,
> > -		.flags = vma->vm_flags,
> > -		.file = vma->vm_file,
> > -		.anon_vma = vma->anon_vma,
> > -		.pgoff = pgoff,
> > -		.policy = vma_policy(vma),
> > -		.uffd_ctx = vma->vm_userfaultfd_ctx,
> > -		.anon_name = anon_vma_name(vma),
> > -	};
> > -
> > -	return vma_merge_new_vma(&vmg);
> > -}
> > +struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg);
> >
> >  /*
> >   * Temporary wrapper around vma_merge() so we can have a common interface for
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 40797a819d3d..a39a734282d0 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -709,6 +709,12 @@ static inline void vma_iter_free(struct vma_iterator *vmi)
> >  	mas_destroy(&vmi->mas);
> >  }
> >
> > +static inline
> > +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
> > +{
> > +	return mas_next_range(&vmi->mas, ULONG_MAX);
> > +}
> > +
> >  static inline void vm_acct_memory(long pages)
> >  {
> >  }
> > --
> > 2.45.2
> >

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ