[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <919071f2-0e2b-47d7-afee-1dfedc8aaca4@lucifer.local>
Date: Thu, 27 Jun 2024 20:41:04 +0100
From: Lorenzo Stoakes <lstoakes@...il.com>
To: "Liam R. Howlett" <Liam.Howlett@...cle.com>,
Andrew Morton <akpm@...ux-foundation.org>,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, Vlastimil Babka <vbabka@...e.cz>,
Matthew Wilcox <willy@...radead.org>,
Alexander Viro <viro@...iv.linux.org.uk>,
Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>,
Eric Biederman <ebiederm@...ssion.com>, Kees Cook <kees@...nel.org>,
Suren Baghdasaryan <surenb@...gle.com>
Subject: Re: [RFC PATCH 4/7] mm: move internal core VMA manipulation
functions to own file
On Thu, Jun 27, 2024 at 01:56:12PM -0400, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lstoakes@...il.com> [240627 06:39]:
> > This patch introduces vma.c and moves internal core VMA manipulation
> > functions to this file from mmap.c.
> >
> > This allows us to isolate VMA functionality in a single place such that we
> > can create userspace testing code that invokes this functionality in an
> > environment where we can implement simple unit tests of core functionality.
> >
> > This patch ensures that core VMA functionality is explicitly marked as such
> > by its presence in mm/vma.h.
> >
> > It also places the header includes required by vma.c in vma_internal.h,
> > which is simply imported by vma.c. This makes the VMA functionality
> > testable, as userland testing code can simply stub out functionality
> > as required.
>
> My initial thought on vma_internal.h would be to contain the number of
> 'helper' functions and internal structures while mm/vma.h would have the
> interface.
This is what I've done though?
>
> In this way, we could include mm/vma.h into mm/internal.h (which most
> files you've edited already has included), and any special cases
> (mmu_notifier.c, etc) would need the addition. vma_internal.h would
> have only things needed in the vma.c file.
Happy to change it so mm/internal.h imports mm/vma_internal.h; it does make
sense for one to include the other.
>
> On testing, we could use the header guards to exclude what we wanted by
> either just #defining the right guard, or by making an entirely new
> header with a duplicate guard with the necessary stubs/functions.
Yeah, this is a nice idea; as mentioned on the other patch, I will be doing this.
>
> >
> > Signed-off-by: Lorenzo Stoakes <lstoakes@...il.com>
> > ---
> > include/linux/mm.h | 35 -
> > mm/Makefile | 2 +-
> > mm/gup.c | 1 +
> > mm/huge_memory.c | 1 +
> > mm/internal.h | 227 +----
> > mm/madvise.c | 1 +
> > mm/memory.c | 1 +
> > mm/mempolicy.c | 1 +
> > mm/mlock.c | 1 +
> > mm/mmap.c | 1983 +++-----------------------------------------
> > mm/mmu_notifier.c | 2 +
> > mm/mprotect.c | 1 +
> > mm/mremap.c | 1 +
> > mm/mseal.c | 2 +
> > mm/rmap.c | 1 +
> > mm/userfaultfd.c | 2 +
> > mm/vma.c | 1766 +++++++++++++++++++++++++++++++++++++++
> > mm/vma.h | 356 ++++++++
> > mm/vma_internal.h | 143 ++++
> > 19 files changed, 2389 insertions(+), 2138 deletions(-)
> > create mode 100644 mm/vma.c
> > create mode 100644 mm/vma.h
> > create mode 100644 mm/vma_internal.h
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index e3220439cf75..31f85db029b8 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1004,21 +1004,6 @@ struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
> > return mas_prev_range(&vmi->mas, 0);
> > }
> >
> > -static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
> > -{
> > - return vmi->mas.index;
> > -}
> > -
> > -static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
> > -{
> > - return vmi->mas.last + 1;
> > -}
> > -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
> > - unsigned long count)
> > -{
> > - return mas_expected_entries(&vmi->mas, count);
> > -}
> > -
> > static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
> > unsigned long start, unsigned long end, gfp_t gfp)
> > {
> > @@ -2548,21 +2533,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
> > #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
> > MM_CP_UFFD_WP_RESOLVE)
> >
> > -bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
> > -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
> > -static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
> > -{
> > - /*
> > - * We want to check manually if we can change individual PTEs writable
> > - * if we can't do that automatically for all PTEs in a mapping. For
> > - * private mappings, that's always the case when we have write
> > - * permissions as we properly have to handle COW.
> > - */
> > - if (vma->vm_flags & VM_SHARED)
> > - return vma_wants_writenotify(vma, vma->vm_page_prot);
> > - return !!(vma->vm_flags & VM_WRITE);
> > -
> > -}
> > bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> > pte_t pte);
> > extern long change_protection(struct mmu_gather *tlb,
> > @@ -3277,12 +3247,7 @@ extern int vma_expand_bottom(struct vma_iterator *vmi, struct vm_area_struct *vm
> > unsigned long shift, struct vm_area_struct **next);
> > extern int vma_shrink_top(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > unsigned long shift);
> > -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
> > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
> > -extern void unlink_file_vma(struct vm_area_struct *);
> > -extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
> > - unsigned long addr, unsigned long len, pgoff_t pgoff,
> > - bool *need_rmap_locks);
> > extern void exit_mmap(struct mm_struct *);
> >
> > static inline int check_data_rlimit(unsigned long rlim,
> > diff --git a/mm/Makefile b/mm/Makefile
> > index d2915f8c9dc0..140a22654dde 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -37,7 +37,7 @@ mmu-y := nommu.o
> > mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
> > mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
> > msync.o page_vma_mapped.o pagewalk.o \
> > - pgtable-generic.o rmap.o vmalloc.o
> > + pgtable-generic.o rmap.o vmalloc.o vma.o
> >
> >
> > ifdef CONFIG_CROSS_MEMORY_ATTACH
> > diff --git a/mm/gup.c b/mm/gup.c
> > index 8bea9ad80984..34b846352679 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -26,6 +26,7 @@
> > #include <asm/tlbflush.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> >
> > struct follow_page_context {
> > struct dev_pagemap *pgmap;
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index c7ce28f6b7f3..de6f150ed97b 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -44,6 +44,7 @@
> > #include <asm/tlb.h>
> > #include <asm/pgalloc.h>
> > #include "internal.h"
> > +#include "vma.h"
> > #include "swap.h"
> >
> > #define CREATE_TRACE_POINTS
> > diff --git a/mm/internal.h b/mm/internal.h
> > index f7779727bb78..76b4821cd751 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -8,7 +8,9 @@
> > #define __MM_INTERNAL_H
> >
> > #include <linux/fs.h>
> > +#include <linux/khugepaged.h>
> > #include <linux/mm.h>
> > +#include <linux/mm_inline.h>
> > #include <linux/pagemap.h>
> > #include <linux/rmap.h>
> > #include <linux/swap.h>
> > @@ -778,37 +780,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
> > return list_empty(&area->free_list[migratetype]);
> > }
> >
> > -/*
> > - * These three helpers classifies VMAs for virtual memory accounting.
> > - */
> > -
> > -/*
> > - * Executable code area - executable, not writable, not stack
> > - */
> > -static inline bool is_exec_mapping(vm_flags_t flags)
> > -{
> > - return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
> > -}
> > -
> > -/*
> > - * Stack area (including shadow stacks)
> > - *
> > - * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
> > - * do_mmap() forbids all other combinations.
> > - */
> > -static inline bool is_stack_mapping(vm_flags_t flags)
> > -{
> > - return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
> > -}
> > -
> > -/*
> > - * Data area - private, writable, not stack
> > - */
> > -static inline bool is_data_mapping(vm_flags_t flags)
> > -{
> > - return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
> > -}
> > -
> > /* mm/util.c */
> > struct anon_vma *folio_anon_vma(struct folio *folio);
> >
> > @@ -1237,80 +1208,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
> > void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
> > pmd_t *pmd, bool write);
> >
> > -/*
> > - * mm/mmap.c
> > - */
> > -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > - struct vm_area_struct *vma,
> > - unsigned long delta);
> > -
> > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long vm_flags,
> > - struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx uffd_ctx,
> > - struct anon_vma_name *anon_name);
> > -
> > -/* We are about to modify the VMA's flags. */
> > -static inline struct vm_area_struct
> > -*vma_modify_flags(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long new_flags)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx,
> > - anon_vma_name(vma));
> > -}
> > -
> > -/* We are about to modify the VMA's flags and/or anon_name. */
> > -static inline struct vm_area_struct
> > -*vma_modify_flags_name(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start,
> > - unsigned long end,
> > - unsigned long new_flags,
> > - struct anon_vma_name *new_name)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
> > -}
> > -
> > -/* We are about to modify the VMA's memory policy. */
> > -static inline struct vm_area_struct
> > -*vma_modify_policy(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - struct mempolicy *new_pol)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
> > - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > -}
> > -
> > -/* We are about to modify the VMA's flags and/or uffd context. */
> > -static inline struct vm_area_struct
> > -*vma_modify_flags_uffd(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long new_flags,
> > - struct vm_userfaultfd_ctx new_ctx)
> > -{
> > - return vma_modify(vmi, prev, vma, start, end, new_flags,
> > - vma_policy(vma), new_ctx, anon_vma_name(vma));
> > -}
> > -
> > -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end, pgoff_t pgoff,
> > - struct vm_area_struct *next);
> > -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end, pgoff_t pgoff);
> > -
> > enum {
> > /* mark page accessed */
> > FOLL_TOUCH = 1 << 16,
> > @@ -1437,117 +1334,6 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte
> > return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte);
> > }
> >
> > -static inline void vma_iter_config(struct vma_iterator *vmi,
> > - unsigned long index, unsigned long last)
> > -{
> > - __mas_set_range(&vmi->mas, index, last - 1);
> > -}
> > -
> > -static inline void vma_iter_reset(struct vma_iterator *vmi)
> > -{
> > - mas_reset(&vmi->mas);
> > -}
> > -
> > -static inline
> > -struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
> > -{
> > - return mas_prev_range(&vmi->mas, min);
> > -}
> > -
> > -static inline
> > -struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
> > -{
> > - return mas_next_range(&vmi->mas, max);
> > -}
> > -
> > -static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
> > - unsigned long max, unsigned long size)
> > -{
> > - return mas_empty_area(&vmi->mas, min, max - 1, size);
> > -}
> > -
> > -static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
> > - unsigned long max, unsigned long size)
> > -{
> > - return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
> > -}
> > -
> > -/*
> > - * VMA Iterator functions shared between nommu and mmap
> > - */
> > -static inline int vma_iter_prealloc(struct vma_iterator *vmi,
> > - struct vm_area_struct *vma)
> > -{
> > - return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
> > -}
> > -
> > -static inline void vma_iter_clear(struct vma_iterator *vmi)
> > -{
> > - mas_store_prealloc(&vmi->mas, NULL);
> > -}
> > -
> > -static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
> > -{
> > - return mas_walk(&vmi->mas);
> > -}
> > -
> > -/* Store a VMA with preallocated memory */
> > -static inline void vma_iter_store(struct vma_iterator *vmi,
> > - struct vm_area_struct *vma)
> > -{
> > -
> > -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
> > - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
> > - vmi->mas.index > vma->vm_start)) {
> > - pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
> > - vmi->mas.index, vma->vm_start, vma->vm_start,
> > - vma->vm_end, vmi->mas.index, vmi->mas.last);
> > - }
> > - if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
> > - vmi->mas.last < vma->vm_start)) {
> > - pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
> > - vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
> > - vmi->mas.index, vmi->mas.last);
> > - }
> > -#endif
> > -
> > - if (vmi->mas.status != ma_start &&
> > - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> > - vma_iter_invalidate(vmi);
> > -
> > - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> > - mas_store_prealloc(&vmi->mas, vma);
> > -}
> > -
> > -static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
> > - struct vm_area_struct *vma, gfp_t gfp)
> > -{
> > - if (vmi->mas.status != ma_start &&
> > - ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> > - vma_iter_invalidate(vmi);
> > -
> > - __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> > - mas_store_gfp(&vmi->mas, vma, gfp);
> > - if (unlikely(mas_is_err(&vmi->mas)))
> > - return -ENOMEM;
> > -
> > - return 0;
> > -}
> > -
> > -/*
> > - * VMA lock generalization
> > - */
> > -struct vma_prepare {
> > - struct vm_area_struct *vma;
> > - struct vm_area_struct *adj_next;
> > - struct file *file;
> > - struct address_space *mapping;
> > - struct anon_vma *anon_vma;
> > - struct vm_area_struct *insert;
> > - struct vm_area_struct *remove;
> > - struct vm_area_struct *remove2;
> > -};
> > -
> > void __meminit __init_single_page(struct page *page, unsigned long pfn,
> > unsigned long zone, int nid);
> >
> > @@ -1636,13 +1422,4 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
> > void workingset_update_node(struct xa_node *node);
> > extern struct list_lru shadow_nodes;
> >
> > -struct unlink_vma_file_batch {
> > - int count;
> > - struct vm_area_struct *vmas[8];
> > -};
> > -
> > -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
> > -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
> > -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
> > -
> > #endif /* __MM_INTERNAL_H */
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 96c026fe0c99..42f62a8efd71 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -35,6 +35,7 @@
> > #include <asm/tlb.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> > #include "swap.h"
> >
> > struct madvise_walk_private {
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 0a769f34bbb2..a2ca7df9d2cf 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -90,6 +90,7 @@
> >
> > #include "pgalloc-track.h"
> > #include "internal.h"
> > +#include "vma.h"
> > #include "swap.h"
> >
> > #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index f73acb01ad45..3dad2b52f319 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -115,6 +115,7 @@
> > #include <linux/uaccess.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> >
> > /* Internal flags */
> > #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
> > diff --git a/mm/mlock.c b/mm/mlock.c
> > index 52d6e401ad67..ac84378bb796 100644
> > --- a/mm/mlock.c
> > +++ b/mm/mlock.c
> > @@ -27,6 +27,7 @@
> > #include <linux/secretmem.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> >
> > struct mlock_fbatch {
> > local_lock_t lock;
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 574e69a04ebe..b4f7c1ea3f0f 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -57,6 +57,7 @@
> > #include <trace/events/mmap.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> >
> > #ifndef arch_mmap_check
> > #define arch_mmap_check(addr, len, flags) (0)
> > @@ -76,16 +77,6 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
> > static bool ignore_rlimit_data;
> > core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
> >
> > -static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > - struct vm_area_struct *vma, struct vm_area_struct *prev,
> > - struct vm_area_struct *next, unsigned long start,
> > - unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> > -
> > -static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
> > -{
> > - return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
> > -}
> > -
> > /* Update vma->vm_page_prot to reflect vma->vm_flags. */
> > void vma_set_page_prot(struct vm_area_struct *vma)
> > {
> > @@ -101,100 +92,6 @@ void vma_set_page_prot(struct vm_area_struct *vma)
> > WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
> > }
> >
> > -/*
> > - * Requires inode->i_mapping->i_mmap_rwsem
> > - */
> > -static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_unmap_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_remove(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -/*
> > - * Unlink a file-based vm structure from its interval tree, to hide
> > - * vma from rmap and vmtruncate before freeing its page tables.
> > - */
> > -void unlink_file_vma(struct vm_area_struct *vma)
> > -{
> > - struct file *file = vma->vm_file;
> > -
> > - if (file) {
> > - struct address_space *mapping = file->f_mapping;
> > - i_mmap_lock_write(mapping);
> > - __remove_shared_vm_struct(vma, mapping);
> > - i_mmap_unlock_write(mapping);
> > - }
> > -}
> > -
> > -void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > -{
> > - vb->count = 0;
> > -}
> > -
> > -static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
> > -{
> > - struct address_space *mapping;
> > - int i;
> > -
> > - mapping = vb->vmas[0]->vm_file->f_mapping;
> > - i_mmap_lock_write(mapping);
> > - for (i = 0; i < vb->count; i++) {
> > - VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
> > - __remove_shared_vm_struct(vb->vmas[i], mapping);
> > - }
> > - i_mmap_unlock_write(mapping);
> > -
> > - unlink_file_vma_batch_init(vb);
> > -}
> > -
> > -void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
> > - struct vm_area_struct *vma)
> > -{
> > - if (vma->vm_file == NULL)
> > - return;
> > -
> > - if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
> > - vb->count == ARRAY_SIZE(vb->vmas))
> > - unlink_file_vma_batch_process(vb);
> > -
> > - vb->vmas[vb->count] = vma;
> > - vb->count++;
> > -}
> > -
> > -void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
> > -{
> > - if (vb->count > 0)
> > - unlink_file_vma_batch_process(vb);
> > -}
> > -
> > -/*
> > - * Close a vm structure and free it.
> > - */
> > -static void remove_vma(struct vm_area_struct *vma, bool unreachable)
> > -{
> > - might_sleep();
> > - if (vma->vm_ops && vma->vm_ops->close)
> > - vma->vm_ops->close(vma);
> > - if (vma->vm_file)
> > - fput(vma->vm_file);
> > - mpol_put(vma_policy(vma));
> > - if (unreachable)
> > - __vm_area_free(vma);
> > - else
> > - vm_area_free(vma);
> > -}
> > -
> > -static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
> > - unsigned long min)
> > -{
> > - return mas_prev(&vmi->mas, min);
> > -}
> > -
> > /*
> > * check_brk_limits() - Use platform specific check of range & verify mlock
> > * limits.
> > @@ -298,893 +195,24 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
> > brkvma = vma_prev_limit(&vmi, mm->start_brk);
> > /* Ok, looks good - let it rip. */
> > if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
> > - goto out;
> > -
> > - mm->brk = brk;
> > - if (mm->def_flags & VM_LOCKED)
> > - populate = true;
> > -
> > -success:
> > - mmap_write_unlock(mm);
> > -success_unlocked:
> > - userfaultfd_unmap_complete(mm, &uf);
> > - if (populate)
> > - mm_populate(oldbrk, newbrk - oldbrk);
> > - return brk;
> > -
> > -out:
> > - mm->brk = origbrk;
> > - mmap_write_unlock(mm);
> > - return origbrk;
> > -}
> > -
> > -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
> > -static void validate_mm(struct mm_struct *mm)
> > -{
> > - int bug = 0;
> > - int i = 0;
> > - struct vm_area_struct *vma;
> > - VMA_ITERATOR(vmi, mm, 0);
> > -
> > - mt_validate(&mm->mm_mt);
> > - for_each_vma(vmi, vma) {
> > -#ifdef CONFIG_DEBUG_VM_RB
> > - struct anon_vma *anon_vma = vma->anon_vma;
> > - struct anon_vma_chain *avc;
> > -#endif
> > - unsigned long vmi_start, vmi_end;
> > - bool warn = 0;
> > -
> > - vmi_start = vma_iter_addr(&vmi);
> > - vmi_end = vma_iter_end(&vmi);
> > - if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
> > - warn = 1;
> > -
> > - if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
> > - warn = 1;
> > -
> > - if (warn) {
> > - pr_emerg("issue in %s\n", current->comm);
> > - dump_stack();
> > - dump_vma(vma);
> > - pr_emerg("tree range: %px start %lx end %lx\n", vma,
> > - vmi_start, vmi_end - 1);
> > - vma_iter_dump_tree(&vmi);
> > - }
> > -
> > -#ifdef CONFIG_DEBUG_VM_RB
> > - if (anon_vma) {
> > - anon_vma_lock_read(anon_vma);
> > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > - anon_vma_interval_tree_verify(avc);
> > - anon_vma_unlock_read(anon_vma);
> > - }
> > -#endif
> > - i++;
> > - }
> > - if (i != mm->map_count) {
> > - pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
> > - bug = 1;
> > - }
> > - VM_BUG_ON_MM(bug, mm);
> > -}
> > -
> > -#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
> > -#define validate_mm(mm) do { } while (0)
> > -#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> > -
> > -/*
> > - * vma has some anon_vma assigned, and is already inserted on that
> > - * anon_vma's interval trees.
> > - *
> > - * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
> > - * vma must be removed from the anon_vma's interval trees using
> > - * anon_vma_interval_tree_pre_update_vma().
> > - *
> > - * After the update, the vma will be reinserted using
> > - * anon_vma_interval_tree_post_update_vma().
> > - *
> > - * The entire update must be protected by exclusive mmap_lock and by
> > - * the root anon_vma's mutex.
> > - */
> > -static inline void
> > -anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
> > -{
> > - struct anon_vma_chain *avc;
> > -
> > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > - anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
> > -}
> > -
> > -static inline void
> > -anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> > -{
> > - struct anon_vma_chain *avc;
> > -
> > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > - anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> > -}
> > -
> > -static unsigned long count_vma_pages_range(struct mm_struct *mm,
> > - unsigned long addr, unsigned long end)
> > -{
> > - VMA_ITERATOR(vmi, mm, addr);
> > - struct vm_area_struct *vma;
> > - unsigned long nr_pages = 0;
> > -
> > - for_each_vma_range(vmi, vma, end) {
> > - unsigned long vm_start = max(addr, vma->vm_start);
> > - unsigned long vm_end = min(end, vma->vm_end);
> > -
> > - nr_pages += PHYS_PFN(vm_end - vm_start);
> > - }
> > -
> > - return nr_pages;
> > -}
> > -
> > -static void __vma_link_file(struct vm_area_struct *vma,
> > - struct address_space *mapping)
> > -{
> > - if (vma_is_shared_maywrite(vma))
> > - mapping_allow_writable(mapping);
> > -
> > - flush_dcache_mmap_lock(mapping);
> > - vma_interval_tree_insert(vma, &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > -}
> > -
> > -static void vma_link_file(struct vm_area_struct *vma)
> > -{
> > - struct file *file = vma->vm_file;
> > - struct address_space *mapping;
> > -
> > - if (file) {
> > - mapping = file->f_mapping;
> > - i_mmap_lock_write(mapping);
> > - __vma_link_file(vma, mapping);
> > - i_mmap_unlock_write(mapping);
> > - }
> > -}
> > -
> > -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
> > -{
> > - VMA_ITERATOR(vmi, mm, 0);
> > -
> > - vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
> > - if (vma_iter_prealloc(&vmi, vma))
> > - return -ENOMEM;
> > -
> > - vma_start_write(vma);
> > - vma_iter_store(&vmi, vma);
> > - vma_link_file(vma);
> > - mm->map_count++;
> > - validate_mm(mm);
> > - return 0;
> > -}
> > -
> > -/*
> > - * init_multi_vma_prep() - Initializer for struct vma_prepare
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - * @next: The next vma if it is to be adjusted
> > - * @remove: The first vma to be removed
> > - * @remove2: The second vma to be removed
> > - */
> > -static inline void init_multi_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma, struct vm_area_struct *next,
> > - struct vm_area_struct *remove, struct vm_area_struct *remove2)
> > -{
> > - memset(vp, 0, sizeof(struct vma_prepare));
> > - vp->vma = vma;
> > - vp->anon_vma = vma->anon_vma;
> > - vp->remove = remove;
> > - vp->remove2 = remove2;
> > - vp->adj_next = next;
> > - if (!vp->anon_vma && next)
> > - vp->anon_vma = next->anon_vma;
> > -
> > - vp->file = vma->vm_file;
> > - if (vp->file)
> > - vp->mapping = vma->vm_file->f_mapping;
> > -
> > -}
> > -
> > -/*
> > - * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > - * @vp: The vma_prepare struct
> > - * @vma: The vma that will be altered once locked
> > - */
> > -static inline void init_vma_prep(struct vma_prepare *vp,
> > - struct vm_area_struct *vma)
> > -{
> > - init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > -}
> > -
> > -
> > -/*
> > - * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > - * @vp: The initialized vma_prepare struct
> > - */
> > -static inline void vma_prepare(struct vma_prepare *vp)
> > -{
> > - if (vp->file) {
> > - uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > -
> > - if (vp->adj_next)
> > - uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > - vp->adj_next->vm_end);
> > -
> > - i_mmap_lock_write(vp->mapping);
> > - if (vp->insert && vp->insert->vm_file) {
> > - /*
> > - * Put into interval tree now, so instantiated pages
> > - * are visible to arm/parisc __flush_dcache_page
> > - * throughout; but we cannot insert into address
> > - * space until vma start or end is updated.
> > - */
> > - __vma_link_file(vp->insert,
> > - vp->insert->vm_file->f_mapping);
> > - }
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_lock_write(vp->anon_vma);
> > - anon_vma_interval_tree_pre_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > - }
> > -
> > - if (vp->file) {
> > - flush_dcache_mmap_lock(vp->mapping);
> > - vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > - if (vp->adj_next)
> > - vma_interval_tree_remove(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - }
> > -
> > -}
> > -
> > -/*
> > - * vma_complete- Helper function for handling the unlocking after altering VMAs,
> > - * or for inserting a VMA.
> > - *
> > - * @vp: The vma_prepare struct
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - */
> > -static inline void vma_complete(struct vma_prepare *vp,
> > - struct vma_iterator *vmi, struct mm_struct *mm)
> > -{
> > - if (vp->file) {
> > - if (vp->adj_next)
> > - vma_interval_tree_insert(vp->adj_next,
> > - &vp->mapping->i_mmap);
> > - vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > - flush_dcache_mmap_unlock(vp->mapping);
> > - }
> > -
> > - if (vp->remove && vp->file) {
> > - __remove_shared_vm_struct(vp->remove, vp->mapping);
> > - if (vp->remove2)
> > - __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > - } else if (vp->insert) {
> > - /*
> > - * split_vma has split insert from vma, and needs
> > - * us to insert it before dropping the locks
> > - * (it may either follow vma or precede it).
> > - */
> > - vma_iter_store(vmi, vp->insert);
> > - mm->map_count++;
> > - }
> > -
> > - if (vp->anon_vma) {
> > - anon_vma_interval_tree_post_update_vma(vp->vma);
> > - if (vp->adj_next)
> > - anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > - anon_vma_unlock_write(vp->anon_vma);
> > - }
> > -
> > - if (vp->file) {
> > - i_mmap_unlock_write(vp->mapping);
> > - uprobe_mmap(vp->vma);
> > -
> > - if (vp->adj_next)
> > - uprobe_mmap(vp->adj_next);
> > - }
> > -
> > - if (vp->remove) {
> > -again:
> > - vma_mark_detached(vp->remove, true);
> > - if (vp->file) {
> > - uprobe_munmap(vp->remove, vp->remove->vm_start,
> > - vp->remove->vm_end);
> > - fput(vp->file);
> > - }
> > - if (vp->remove->anon_vma)
> > - anon_vma_merge(vp->vma, vp->remove);
> > - mm->map_count--;
> > - mpol_put(vma_policy(vp->remove));
> > - if (!vp->remove2)
> > - WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > - vm_area_free(vp->remove);
> > -
> > - /*
> > - * In mprotect's case 6 (see comments on vma_merge),
> > - * we are removing both mid and next vmas
> > - */
> > - if (vp->remove2) {
> > - vp->remove = vp->remove2;
> > - vp->remove2 = NULL;
> > - goto again;
> > - }
> > - }
> > - if (vp->insert && vp->file)
> > - uprobe_mmap(vp->insert);
> > - validate_mm(mm);
> > -}
> > -
> > -/*
> > - * dup_anon_vma() - Helper function to duplicate anon_vma
> > - * @dst: The destination VMA
> > - * @src: The source VMA
> > - * @dup: Pointer to the destination VMA when successful.
> > - *
> > - * Returns: 0 on success.
> > - */
> > -static inline int dup_anon_vma(struct vm_area_struct *dst,
> > - struct vm_area_struct *src, struct vm_area_struct **dup)
> > -{
> > - /*
> > - * Easily overlooked: when mprotect shifts the boundary, make sure the
> > - * expanding vma has anon_vma set if the shrinking vma had, to cover any
> > - * anon pages imported.
> > - */
> > - if (src->anon_vma && !dst->anon_vma) {
> > - int ret;
> > -
> > - vma_assert_write_locked(dst);
> > - dst->anon_vma = src->anon_vma;
> > - ret = anon_vma_clone(dst, src);
> > - if (ret)
> > - return ret;
> > -
> > - *dup = dst;
> > - }
> > -
> > - return 0;
> > -}
> > -
> > -/*
> > - * vma_expand - Expand an existing VMA
> > - *
> > - * @vmi: The vma iterator
> > - * @vma: The vma to expand
> > - * @start: The start of the vma
> > - * @end: The exclusive end of the vma
> > - * @pgoff: The page offset of vma
> > - * @next: The current of next vma.
> > - *
> > - * Expand @vma to @start and @end. Can expand off the start and end. Will
> > - * expand over @next if it's different from @vma and @end == @next->vm_end.
> > - * Checking if the @vma can expand and merge with @next needs to be handled by
> > - * the caller.
> > - *
> > - * Returns: 0 on success
> > - */
> > -int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end, pgoff_t pgoff,
> > - struct vm_area_struct *next)
> > -{
> > - struct vm_area_struct *anon_dup = NULL;
> > - bool remove_next = false;
> > - struct vma_prepare vp;
> > -
> > - vma_start_write(vma);
> > - if (next && (vma != next) && (end == next->vm_end)) {
> > - int ret;
> > -
> > - remove_next = true;
> > - vma_start_write(next);
> > - ret = dup_anon_vma(vma, next, &anon_dup);
> > - if (ret)
> > - return ret;
> > - }
> > -
> > - init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
> > - /* Not merging but overwriting any part of next is not handled. */
> > - VM_WARN_ON(next && !vp.remove &&
> > - next != vma && end > next->vm_start);
> > - /* Only handles expanding */
> > - VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
> > -
> > - /* Note: vma iterator must be pointing to 'start' */
> > - vma_iter_config(vmi, start, end);
> > - if (vma_iter_prealloc(vmi, vma))
> > - goto nomem;
> > -
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, start, end, 0);
> > - vma_set_range(vma, start, end, pgoff);
> > - vma_iter_store(vmi, vma);
> > -
> > - vma_complete(&vp, vmi, vma->vm_mm);
> > - return 0;
> > -
> > -nomem:
> > - if (anon_dup)
> > - unlink_anon_vmas(anon_dup);
> > - return -ENOMEM;
> > -}
> > -
> > -/*
> > - * vma_shrink() - Reduce an existing VMAs memory area
> > - * @vmi: The vma iterator
> > - * @vma: The VMA to modify
> > - * @start: The new start
> > - * @end: The new end
> > - *
> > - * Returns: 0 on success, -ENOMEM otherwise
> > - */
> > -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end, pgoff_t pgoff)
> > -{
> > - struct vma_prepare vp;
> > -
> > - WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
> > -
> > - if (vma->vm_start < start)
> > - vma_iter_config(vmi, vma->vm_start, start);
> > - else
> > - vma_iter_config(vmi, end, vma->vm_end);
> > -
> > - if (vma_iter_prealloc(vmi, NULL))
> > - return -ENOMEM;
> > -
> > - vma_start_write(vma);
> > -
> > - init_vma_prep(&vp, vma);
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, start, end, 0);
> > -
> > - vma_iter_clear(vmi);
> > - vma_set_range(vma, start, end, pgoff);
> > - vma_complete(&vp, vmi, vma->vm_mm);
> > - return 0;
> > -}
> > -
> > -/*
> > - * If the vma has a ->close operation then the driver probably needs to release
> > - * per-vma resources, so we don't attempt to merge those if the caller indicates
> > - * the current vma may be removed as part of the merge.
> > - */
> > -static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > - struct file *file, unsigned long vm_flags,
> > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name, bool may_remove_vma)
> > -{
> > - /*
> > - * VM_SOFTDIRTY should not prevent from VMA merging, if we
> > - * match the flags but dirty bit -- the caller should mark
> > - * merged VMA as dirty. If dirty bit won't be excluded from
> > - * comparison, we increase pressure on the memory system forcing
> > - * the kernel to generate new VMAs when old one could be
> > - * extended instead.
> > - */
> > - if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> > - return false;
> > - if (vma->vm_file != file)
> > - return false;
> > - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> > - return false;
> > - if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> > - return false;
> > - if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> > - return false;
> > - return true;
> > -}
> > -
> > -static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
> > - struct anon_vma *anon_vma2, struct vm_area_struct *vma)
> > -{
> > - /*
> > - * The list_is_singular() test is to avoid merging VMA cloned from
> > - * parents. This can improve scalability caused by anon_vma lock.
> > - */
> > - if ((!anon_vma1 || !anon_vma2) && (!vma ||
> > - list_is_singular(&vma->anon_vma_chain)))
> > - return true;
> > - return anon_vma1 == anon_vma2;
> > -}
> > -
> > -/*
> > - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
> > - * in front of (at a lower virtual address and file offset than) the vma.
> > - *
> > - * We cannot merge two vmas if they have differently assigned (non-NULL)
> > - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
> > - *
> > - * We don't check here for the merged mmap wrapping around the end of pagecache
> > - * indices (16TB on ia32) because do_mmap() does not permit mmap's which
> > - * wrap, nor mmaps which cover the final page at index -1UL.
> > - *
> > - * We assume the vma may be removed as part of the merge.
> > - */
> > -static bool
> > -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > -{
> > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > - if (vma->vm_pgoff == vm_pgoff)
> > - return true;
> > - }
> > - return false;
> > -}
> > -
> > -/*
> > - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
> > - * beyond (at a higher virtual address and file offset than) the vma.
> > - *
> > - * We cannot merge two vmas if they have differently assigned (non-NULL)
> > - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
> > - *
> > - * We assume that vma is not removed as part of the merge.
> > - */
> > -static bool
> > -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > - struct anon_vma *anon_vma, struct file *file,
> > - pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > -{
> > - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> > - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > - pgoff_t vm_pglen;
> > - vm_pglen = vma_pages(vma);
> > - if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> > - return true;
> > - }
> > - return false;
> > -}
> > -
> > -/*
> > - * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> > - * figure out whether that can be merged with its predecessor or its
> > - * successor. Or both (it neatly fills a hole).
> > - *
> > - * In most cases - when called for mmap, brk or mremap - [addr,end) is
> > - * certain not to be mapped by the time vma_merge is called; but when
> > - * called for mprotect, it is certain to be already mapped (either at
> > - * an offset within prev, or at the start of next), and the flags of
> > - * this area are about to be changed to vm_flags - and the no-change
> > - * case has already been eliminated.
> > - *
> > - * The following mprotect cases have to be considered, where **** is
> > - * the area passed down from mprotect_fixup, never extending beyond one
> > - * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
> > - * at the same address as **** and is of the same or larger span, and
> > - * NNNN the next vma after ****:
> > - *
> > - * **** **** ****
> > - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
> > - * cannot merge might become might become
> > - * PPNNNNNNNNNN PPPPPPPPPPCC
> > - * mmap, brk or case 4 below case 5 below
> > - * mremap move:
> > - * **** ****
> > - * PPPP NNNN PPPPCCCCNNNN
> > - * might become might become
> > - * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
> > - * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
> > - * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
> > - *
> > - * It is important for case 8 that the vma CCCC overlapping the
> > - * region **** is never going to extended over NNNN. Instead NNNN must
> > - * be extended in region **** and CCCC must be removed. This way in
> > - * all cases where vma_merge succeeds, the moment vma_merge drops the
> > - * rmap_locks, the properties of the merged vma will be already
> > - * correct for the whole merged range. Some of those properties like
> > - * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
> > - * be correct for the whole merged range immediately after the
> > - * rmap_locks are released. Otherwise if NNNN would be removed and
> > - * CCCC would be extended over the NNNN range, remove_migration_ptes
> > - * or other rmap walkers (if working on addresses beyond the "end"
> > - * parameter) may establish ptes with the wrong permissions of CCCC
> > - * instead of the right permissions of NNNN.
> > - *
> > - * In the code below:
> > - * PPPP is represented by *prev
> > - * CCCC is represented by *curr or not represented at all (NULL)
> > - * NNNN is represented by *next or not represented at all (NULL)
> > - * **** is not represented - it will be merged and the vma containing the
> > - * area is returned, or the function will return NULL
> > - */
> > -static struct vm_area_struct
> > -*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > - struct vm_area_struct *src, unsigned long addr, unsigned long end,
> > - unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > - struct anon_vma_name *anon_name)
> > -{
> > - struct mm_struct *mm = src->vm_mm;
> > - struct anon_vma *anon_vma = src->anon_vma;
> > - struct file *file = src->vm_file;
> > - struct vm_area_struct *curr, *next, *res;
> > - struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > - struct vm_area_struct *anon_dup = NULL;
> > - struct vma_prepare vp;
> > - pgoff_t vma_pgoff;
> > - int err = 0;
> > - bool merge_prev = false;
> > - bool merge_next = false;
> > - bool vma_expanded = false;
> > - unsigned long vma_start = addr;
> > - unsigned long vma_end = end;
> > - pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> > - long adj_start = 0;
> > -
> > - /*
> > - * We later require that vma->vm_flags == vm_flags,
> > - * so this tests vma->vm_flags & VM_SPECIAL, too.
> > - */
> > - if (vm_flags & VM_SPECIAL)
> > - return NULL;
> > -
> > - /* Does the input range span an existing VMA? (cases 5 - 8) */
> > - curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
> > -
> > - if (!curr || /* cases 1 - 4 */
> > - end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> > - next = vma_lookup(mm, end);
> > - else
> > - next = NULL; /* case 5 */
> > -
> > - if (prev) {
> > - vma_start = prev->vm_start;
> > - vma_pgoff = prev->vm_pgoff;
> > -
> > - /* Can we merge the predecessor? */
> > - if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
> > - && can_vma_merge_after(prev, vm_flags, anon_vma, file,
> > - pgoff, vm_userfaultfd_ctx, anon_name)) {
> > - merge_prev = true;
> > - vma_prev(vmi);
> > - }
> > - }
> > -
> > - /* Can we merge the successor? */
> > - if (next && mpol_equal(policy, vma_policy(next)) &&
> > - can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
> > - vm_userfaultfd_ctx, anon_name)) {
> > - merge_next = true;
> > - }
> > -
> > - /* Verify some invariant that must be enforced by the caller. */
> > - VM_WARN_ON(prev && addr <= prev->vm_start);
> > - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
> > - VM_WARN_ON(addr >= end);
> > -
> > - if (!merge_prev && !merge_next)
> > - return NULL; /* Not mergeable. */
> > -
> > - if (merge_prev)
> > - vma_start_write(prev);
> > -
> > - res = vma = prev;
> > - remove = remove2 = adjust = NULL;
> > -
> > - /* Can we merge both the predecessor and the successor? */
> > - if (merge_prev && merge_next &&
> > - is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
> > - vma_start_write(next);
> > - remove = next; /* case 1 */
> > - vma_end = next->vm_end;
> > - err = dup_anon_vma(prev, next, &anon_dup);
> > - if (curr) { /* case 6 */
> > - vma_start_write(curr);
> > - remove = curr;
> > - remove2 = next;
> > - /*
> > - * Note that the dup_anon_vma below cannot overwrite err
> > - * since the first caller would do nothing unless next
> > - * has an anon_vma.
> > - */
> > - if (!next->anon_vma)
> > - err = dup_anon_vma(prev, curr, &anon_dup);
> > - }
> > - } else if (merge_prev) { /* case 2 */
> > - if (curr) {
> > - vma_start_write(curr);
> > - if (end == curr->vm_end) { /* case 7 */
> > - /*
> > - * can_vma_merge_after() assumed we would not be
> > - * removing prev vma, so it skipped the check
> > - * for vm_ops->close, but we are removing curr
> > - */
> > - if (curr->vm_ops && curr->vm_ops->close)
> > - err = -EINVAL;
> > - remove = curr;
> > - } else { /* case 5 */
> > - adjust = curr;
> > - adj_start = (end - curr->vm_start);
> > - }
> > - if (!err)
> > - err = dup_anon_vma(prev, curr, &anon_dup);
> > - }
> > - } else { /* merge_next */
> > - vma_start_write(next);
> > - res = next;
> > - if (prev && addr < prev->vm_end) { /* case 4 */
> > - vma_start_write(prev);
> > - vma_end = addr;
> > - adjust = next;
> > - adj_start = -(prev->vm_end - addr);
> > - err = dup_anon_vma(next, prev, &anon_dup);
> > - } else {
> > - /*
> > - * Note that cases 3 and 8 are the ONLY ones where prev
> > - * is permitted to be (but is not necessarily) NULL.
> > - */
> > - vma = next; /* case 3 */
> > - vma_start = addr;
> > - vma_end = next->vm_end;
> > - vma_pgoff = next->vm_pgoff - pglen;
> > - if (curr) { /* case 8 */
> > - vma_pgoff = curr->vm_pgoff;
> > - vma_start_write(curr);
> > - remove = curr;
> > - err = dup_anon_vma(next, curr, &anon_dup);
> > - }
> > - }
> > - }
> > -
> > - /* Error in anon_vma clone. */
> > - if (err)
> > - goto anon_vma_fail;
> > -
> > - if (vma_start < vma->vm_start || vma_end > vma->vm_end)
> > - vma_expanded = true;
> > -
> > - if (vma_expanded) {
> > - vma_iter_config(vmi, vma_start, vma_end);
> > - } else {
> > - vma_iter_config(vmi, adjust->vm_start + adj_start,
> > - adjust->vm_end);
> > - }
> > -
> > - if (vma_iter_prealloc(vmi, vma))
> > - goto prealloc_fail;
> > -
> > - init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> > - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
> > - vp.anon_vma != adjust->anon_vma);
> > -
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
> > - vma_set_range(vma, vma_start, vma_end, vma_pgoff);
> > -
> > - if (vma_expanded)
> > - vma_iter_store(vmi, vma);
> > -
> > - if (adj_start) {
> > - adjust->vm_start += adj_start;
> > - adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> > - if (adj_start < 0) {
> > - WARN_ON(vma_expanded);
> > - vma_iter_store(vmi, next);
> > - }
> > - }
> > -
> > - vma_complete(&vp, vmi, mm);
> > - khugepaged_enter_vma(res, vm_flags);
> > - return res;
> > -
> > -prealloc_fail:
> > - if (anon_dup)
> > - unlink_anon_vmas(anon_dup);
> > -
> > -anon_vma_fail:
> > - vma_iter_set(vmi, addr);
> > - vma_iter_load(vmi);
> > - return NULL;
> > -}
> > -
> > -/*
> > - * Rough compatibility check to quickly see if it's even worth looking
> > - * at sharing an anon_vma.
> > - *
> > - * They need to have the same vm_file, and the flags can only differ
> > - * in things that mprotect may change.
> > - *
> > - * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
> > - * we can merge the two vma's. For example, we refuse to merge a vma if
> > - * there is a vm_ops->close() function, because that indicates that the
> > - * driver is doing some kind of reference counting. But that doesn't
> > - * really matter for the anon_vma sharing case.
> > - */
> > -static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
> > -{
> > - return a->vm_end == b->vm_start &&
> > - mpol_equal(vma_policy(a), vma_policy(b)) &&
> > - a->vm_file == b->vm_file &&
> > - !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
> > - b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
> > -}
> > -
> > -/*
> > - * Do some basic sanity checking to see if we can re-use the anon_vma
> > - * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
> > - * the same as 'old', the other will be the new one that is trying
> > - * to share the anon_vma.
> > - *
> > - * NOTE! This runs with mmap_lock held for reading, so it is possible that
> > - * the anon_vma of 'old' is concurrently in the process of being set up
> > - * by another page fault trying to merge _that_. But that's ok: if it
> > - * is being set up, that automatically means that it will be a singleton
> > - * acceptable for merging, so we can do all of this optimistically. But
> > - * we do that READ_ONCE() to make sure that we never re-load the pointer.
> > - *
> > - * IOW: that the "list_is_singular()" test on the anon_vma_chain only
> > - * matters for the 'stable anon_vma' case (ie the thing we want to avoid
> > - * is to return an anon_vma that is "complex" due to having gone through
> > - * a fork).
> > - *
> > - * We also make sure that the two vma's are compatible (adjacent,
> > - * and with the same memory policies). That's all stable, even with just
> > - * a read lock on the mmap_lock.
> > - */
> > -static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
> > -{
> > - if (anon_vma_compatible(a, b)) {
> > - struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
> > -
> > - if (anon_vma && list_is_singular(&old->anon_vma_chain))
> > - return anon_vma;
> > - }
> > - return NULL;
> > -}
> > + goto out;
> >
> > -/*
> > - * find_mergeable_anon_vma is used by anon_vma_prepare, to check
> > - * neighbouring vmas for a suitable anon_vma, before it goes off
> > - * to allocate a new anon_vma. It checks because a repetitive
> > - * sequence of mprotects and faults may otherwise lead to distinct
> > - * anon_vmas being allocated, preventing vma merge in subsequent
> > - * mprotect.
> > - */
> > -struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
> > -{
> > - struct anon_vma *anon_vma = NULL;
> > - struct vm_area_struct *prev, *next;
> > - VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
> > -
> > - /* Try next first. */
> > - next = vma_iter_load(&vmi);
> > - if (next) {
> > - anon_vma = reusable_anon_vma(next, vma, next);
> > - if (anon_vma)
> > - return anon_vma;
> > - }
> > + mm->brk = brk;
> > + if (mm->def_flags & VM_LOCKED)
> > + populate = true;
> >
> > - prev = vma_prev(&vmi);
> > - VM_BUG_ON_VMA(prev != vma, vma);
> > - prev = vma_prev(&vmi);
> > - /* Try prev next. */
> > - if (prev)
> > - anon_vma = reusable_anon_vma(prev, prev, vma);
> > +success:
> > + mmap_write_unlock(mm);
> > +success_unlocked:
> > + userfaultfd_unmap_complete(mm, &uf);
> > + if (populate)
> > + mm_populate(oldbrk, newbrk - oldbrk);
> > + return brk;
> >
> > - /*
> > - * We might reach here with anon_vma == NULL if we can't find
> > - * any reusable anon_vma.
> > - * There's no absolute need to look only at touching neighbours:
> > - * we could search further afield for "compatible" anon_vmas.
> > - * But it would probably just be a waste of time searching,
> > - * or lead to too many vmas hanging off the same anon_vma.
> > - * We're trying to allow mprotect remerging later on,
> > - * not trying to minimize memory used for anon_vmas.
> > - */
> > - return anon_vma;
> > +out:
> > + mm->brk = origbrk;
> > + mmap_write_unlock(mm);
> > + return origbrk;
> > }
> >
> > /*
> > @@ -1519,85 +547,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
> > }
> > #endif /* __ARCH_WANT_SYS_OLD_MMAP */
> >
> > -static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
> > -{
> > - return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
> > -}
> > -
> > -static bool vma_is_shared_writable(struct vm_area_struct *vma)
> > -{
> > - return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
> > - (VM_WRITE | VM_SHARED);
> > -}
> > -
> > -static bool vma_fs_can_writeback(struct vm_area_struct *vma)
> > -{
> > - /* No managed pages to writeback. */
> > - if (vma->vm_flags & VM_PFNMAP)
> > - return false;
> > -
> > - return vma->vm_file && vma->vm_file->f_mapping &&
> > - mapping_can_writeback(vma->vm_file->f_mapping);
> > -}
> > -
> > -/*
> > - * Does this VMA require the underlying folios to have their dirty state
> > - * tracked?
> > - */
> > -bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
> > -{
> > - /* Only shared, writable VMAs require dirty tracking. */
> > - if (!vma_is_shared_writable(vma))
> > - return false;
> > -
> > - /* Does the filesystem need to be notified? */
> > - if (vm_ops_needs_writenotify(vma->vm_ops))
> > - return true;
> > -
> > - /*
> > - * Even if the filesystem doesn't indicate a need for writenotify, if it
> > - * can writeback, dirty tracking is still required.
> > - */
> > - return vma_fs_can_writeback(vma);
> > -}
> > -
> > -/*
> > - * Some shared mappings will want the pages marked read-only
> > - * to track write events. If so, we'll downgrade vm_page_prot
> > - * to the private version (using protection_map[] without the
> > - * VM_SHARED bit).
> > - */
> > -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
> > -{
> > - /* If it was private or non-writable, the write bit is already clear */
> > - if (!vma_is_shared_writable(vma))
> > - return false;
> > -
> > - /* The backer wishes to know when pages are first written to? */
> > - if (vm_ops_needs_writenotify(vma->vm_ops))
> > - return true;
> > -
> > - /* The open routine did something to the protections that pgprot_modify
> > - * won't preserve? */
> > - if (pgprot_val(vm_page_prot) !=
> > - pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
> > - return false;
> > -
> > - /*
> > - * Do we need to track softdirty? hugetlb does not support softdirty
> > - * tracking yet.
> > - */
> > - if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
> > - return true;
> > -
> > - /* Do we need write faults for uffd-wp tracking? */
> > - if (userfaultfd_wp(vma))
> > - return true;
> > -
> > - /* Can the mapping track the dirty pages? */
> > - return vma_fs_can_writeback(vma);
> > -}
> > -
> > /*
> > * We account for memory if it's a private writeable mapping,
> > * not hugepages and VM_NORESERVE wasn't set.
> > @@ -2238,566 +1187,129 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > anon_vma_interval_tree_post_update_vma(vma);
> > spin_unlock(&mm->page_table_lock);
> >
> > - perf_event_mmap(vma);
> > - }
> > - }
> > - }
> > - anon_vma_unlock_write(vma->anon_vma);
> > - vma_iter_free(&vmi);
> > - validate_mm(mm);
> > - return error;
> > -}
> > -
> > -/* enforced gap between the expanding stack and other mappings. */
> > -unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> > -
> > -static int __init cmdline_parse_stack_guard_gap(char *p)
> > -{
> > - unsigned long val;
> > - char *endptr;
> > -
> > - val = simple_strtoul(p, &endptr, 10);
> > - if (!*endptr)
> > - stack_guard_gap = val << PAGE_SHIFT;
> > -
> > - return 1;
> > -}
> > -__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
> > -
> > -#ifdef CONFIG_STACK_GROWSUP
> > -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
> > -{
> > - return expand_upwards(vma, address);
> > -}
> > -
> > -struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
> > -{
> > - struct vm_area_struct *vma, *prev;
> > -
> > - addr &= PAGE_MASK;
> > - vma = find_vma_prev(mm, addr, &prev);
> > - if (vma && (vma->vm_start <= addr))
> > - return vma;
> > - if (!prev)
> > - return NULL;
> > - if (expand_stack_locked(prev, addr))
> > - return NULL;
> > - if (prev->vm_flags & VM_LOCKED)
> > - populate_vma_page_range(prev, addr, prev->vm_end, NULL);
> > - return prev;
> > -}
> > -#else
> > -int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
> > -{
> > - return expand_downwards(vma, address);
> > -}
> > -
> > -struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
> > -{
> > - struct vm_area_struct *vma;
> > - unsigned long start;
> > -
> > - addr &= PAGE_MASK;
> > - vma = find_vma(mm, addr);
> > - if (!vma)
> > - return NULL;
> > - if (vma->vm_start <= addr)
> > - return vma;
> > - start = vma->vm_start;
> > - if (expand_stack_locked(vma, addr))
> > - return NULL;
> > - if (vma->vm_flags & VM_LOCKED)
> > - populate_vma_page_range(vma, addr, start, NULL);
> > - return vma;
> > -}
> > -#endif
> > -
> > -#if defined(CONFIG_STACK_GROWSUP)
> > -
> > -#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
> > -#define vma_expand_down(vma, addr) (-EFAULT)
> > -
> > -#else
> > -
> > -#define vma_expand_up(vma,addr) (-EFAULT)
> > -#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
> > -
> > -#endif
> > -
> > -/*
> > - * expand_stack(): legacy interface for page faulting. Don't use unless
> > - * you have to.
> > - *
> > - * This is called with the mm locked for reading, drops the lock, takes
> > - * the lock for writing, tries to look up a vma again, expands it if
> > - * necessary, and downgrades the lock to reading again.
> > - *
> > - * If no vma is found or it can't be expanded, it returns NULL and has
> > - * dropped the lock.
> > - */
> > -struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
> > -{
> > - struct vm_area_struct *vma, *prev;
> > -
> > - mmap_read_unlock(mm);
> > - if (mmap_write_lock_killable(mm))
> > - return NULL;
> > -
> > - vma = find_vma_prev(mm, addr, &prev);
> > - if (vma && vma->vm_start <= addr)
> > - goto success;
> > -
> > - if (prev && !vma_expand_up(prev, addr)) {
> > - vma = prev;
> > - goto success;
> > - }
> > -
> > - if (vma && !vma_expand_down(vma, addr))
> > - goto success;
> > -
> > - mmap_write_unlock(mm);
> > - return NULL;
> > -
> > -success:
> > - mmap_write_downgrade(mm);
> > - return vma;
> > -}
> > -
> > -/*
> > - * Ok - we have the memory areas we should free on a maple tree so release them,
> > - * and do the vma updates.
> > - *
> > - * Called with the mm semaphore held.
> > - */
> > -static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> > -{
> > - unsigned long nr_accounted = 0;
> > - struct vm_area_struct *vma;
> > -
> > - /* Update high watermark before we lower total_vm */
> > - update_hiwater_vm(mm);
> > - mas_for_each(mas, vma, ULONG_MAX) {
> > - long nrpages = vma_pages(vma);
> > -
> > - if (vma->vm_flags & VM_ACCOUNT)
> > - nr_accounted += nrpages;
> > - vm_stat_account(mm, vma->vm_flags, -nrpages);
> > - remove_vma(vma, false);
> > - }
> > - vm_unacct_memory(nr_accounted);
> > -}
> > -
> > -/*
> > - * Get rid of page table information in the indicated region.
> > - *
> > - * Called with the mm semaphore held.
> > - */
> > -static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > - struct vm_area_struct *vma, struct vm_area_struct *prev,
> > - struct vm_area_struct *next, unsigned long start,
> > - unsigned long end, unsigned long tree_end, bool mm_wr_locked)
> > -{
> > - struct mmu_gather tlb;
> > - unsigned long mt_start = mas->index;
> > -
> > - lru_add_drain();
> > - tlb_gather_mmu(&tlb, mm);
> > - update_hiwater_rss(mm);
> > - unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);
> > - mas_set(mas, mt_start);
> > - free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> > - next ? next->vm_start : USER_PGTABLES_CEILING,
> > - mm_wr_locked);
> > - tlb_finish_mmu(&tlb);
> > -}
> > -
> > -/*
> > - * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
> > - * has already been checked or doesn't make sense to fail.
> > - * VMA Iterator will point to the end VMA.
> > - */
> > -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long addr, int new_below)
> > -{
> > - struct vma_prepare vp;
> > - struct vm_area_struct *new;
> > - int err;
> > -
> > - WARN_ON(vma->vm_start >= addr);
> > - WARN_ON(vma->vm_end <= addr);
> > -
> > - if (vma->vm_ops && vma->vm_ops->may_split) {
> > - err = vma->vm_ops->may_split(vma, addr);
> > - if (err)
> > - return err;
> > - }
> > -
> > - new = vm_area_dup(vma);
> > - if (!new)
> > - return -ENOMEM;
> > -
> > - if (new_below) {
> > - new->vm_end = addr;
> > - } else {
> > - new->vm_start = addr;
> > - new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
> > - }
> > -
> > - err = -ENOMEM;
> > - vma_iter_config(vmi, new->vm_start, new->vm_end);
> > - if (vma_iter_prealloc(vmi, new))
> > - goto out_free_vma;
> > -
> > - err = vma_dup_policy(vma, new);
> > - if (err)
> > - goto out_free_vmi;
> > -
> > - err = anon_vma_clone(new, vma);
> > - if (err)
> > - goto out_free_mpol;
> > -
> > - if (new->vm_file)
> > - get_file(new->vm_file);
> > -
> > - if (new->vm_ops && new->vm_ops->open)
> > - new->vm_ops->open(new);
> > -
> > - vma_start_write(vma);
> > - vma_start_write(new);
> > -
> > - init_vma_prep(&vp, vma);
> > - vp.insert = new;
> > - vma_prepare(&vp);
> > - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
> > -
> > - if (new_below) {
> > - vma->vm_start = addr;
> > - vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
> > - } else {
> > - vma->vm_end = addr;
> > - }
> > -
> > - /* vma_complete stores the new vma */
> > - vma_complete(&vp, vmi, vma->vm_mm);
> > -
> > - /* Success. */
> > - if (new_below)
> > - vma_next(vmi);
> > - return 0;
> > -
> > -out_free_mpol:
> > - mpol_put(vma_policy(new));
> > -out_free_vmi:
> > - vma_iter_free(vmi);
> > -out_free_vma:
> > - vm_area_free(new);
> > - return err;
> > -}
> > -
> > -/*
> > - * Split a vma into two pieces at address 'addr', a new vma is allocated
> > - * either for the first part or the tail.
> > - */
> > -static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - unsigned long addr, int new_below)
> > -{
> > - if (vma->vm_mm->map_count >= sysctl_max_map_count)
> > - return -ENOMEM;
> > -
> > - return __split_vma(vmi, vma, addr, new_below);
> > -}
> > -
> > -/*
> > - * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
> > - * context and anonymous VMA name within the range [start, end).
> > - *
> > - * As a result, we might be able to merge the newly modified VMA range with an
> > - * adjacent VMA with identical properties.
> > - *
> > - * If no merge is possible and the range does not span the entirety of the VMA,
> > - * we then need to split the VMA to accommodate the change.
> > - *
> > - * The function returns either the merged VMA, the original VMA if a split was
> > - * required instead, or an error if the split failed.
> > - */
> > -struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > - struct vm_area_struct *prev,
> > - struct vm_area_struct *vma,
> > - unsigned long start, unsigned long end,
> > - unsigned long vm_flags,
> > - struct mempolicy *policy,
> > - struct vm_userfaultfd_ctx uffd_ctx,
> > - struct anon_vma_name *anon_name)
> > -{
> > - pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > - struct vm_area_struct *merged;
> > -
> > - merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
> > - pgoff, policy, uffd_ctx, anon_name);
> > - if (merged)
> > - return merged;
> > -
> > - if (vma->vm_start < start) {
> > - int err = split_vma(vmi, vma, start, 1);
> > -
> > - if (err)
> > - return ERR_PTR(err);
> > + perf_event_mmap(vma);
> > + }
> > + }
> > }
> > + anon_vma_unlock_write(vma->anon_vma);
> > + vma_iter_free(&vmi);
> > + validate_mm(mm);
> > + return error;
> > +}
> >
> > - if (vma->vm_end > end) {
> > - int err = split_vma(vmi, vma, end, 0);
> > +/* enforced gap between the expanding stack and other mappings. */
> > +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> >
> > - if (err)
> > - return ERR_PTR(err);
> > - }
> > +static int __init cmdline_parse_stack_guard_gap(char *p)
> > +{
> > + unsigned long val;
> > + char *endptr;
> >
> > - return vma;
> > + val = simple_strtoul(p, &endptr, 10);
> > + if (!*endptr)
> > + stack_guard_gap = val << PAGE_SHIFT;
> > +
> > + return 1;
> > }
> > +__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
> >
> > -/*
> > - * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > - * must ensure that [start, end) does not overlap any existing VMA.
> > - */
> > -static struct vm_area_struct
> > -*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > - struct vm_area_struct *vma, unsigned long start,
> > - unsigned long end, pgoff_t pgoff)
> > +#ifdef CONFIG_STACK_GROWSUP
> > +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
> > {
> > - return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> > - vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + return expand_upwards(vma, address);
> > }
> >
> > -/*
> > - * Expand vma by delta bytes, potentially merging with an immediately adjacent
> > - * VMA with identical properties.
> > - */
> > -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > - struct vm_area_struct *vma,
> > - unsigned long delta)
> > +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
> > {
> > - pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> > + struct vm_area_struct *vma, *prev;
> >
> > - /* vma is specified as prev, so case 1 or 2 will apply. */
> > - return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> > - vma->vm_flags, pgoff, vma_policy(vma),
> > - vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > + addr &= PAGE_MASK;
> > + vma = find_vma_prev(mm, addr, &prev);
> > + if (vma && (vma->vm_start <= addr))
> > + return vma;
> > + if (!prev)
> > + return NULL;
> > + if (expand_stack_locked(prev, addr))
> > + return NULL;
> > + if (prev->vm_flags & VM_LOCKED)
> > + populate_vma_page_range(prev, addr, prev->vm_end, NULL);
> > + return prev;
> > }
> > -
> > -/*
> > - * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> > - * @vmi: The vma iterator
> > - * @vma: The starting vm_area_struct
> > - * @mm: The mm_struct
> > - * @start: The aligned start address to munmap.
> > - * @end: The aligned end address to munmap.
> > - * @uf: The userfaultfd list_head
> > - * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
> > - * success.
> > - *
> > - * Return: 0 on success and drops the lock if so directed, error and leaves the
> > - * lock held otherwise.
> > - */
> > -static int
> > -do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > - struct mm_struct *mm, unsigned long start,
> > - unsigned long end, struct list_head *uf, bool unlock)
> > +#else
> > +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
> > {
> > - struct vm_area_struct *prev, *next = NULL;
> > - struct maple_tree mt_detach;
> > - int count = 0;
> > - int error = -ENOMEM;
> > - unsigned long locked_vm = 0;
> > - MA_STATE(mas_detach, &mt_detach, 0, 0);
> > - mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
> > - mt_on_stack(mt_detach);
> > -
> > - /*
> > - * If we need to split any vma, do it now to save pain later.
> > - *
> > - * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
> > - * unmapped vm_area_struct will remain in use: so lower split_vma
> > - * places tmp vma above, and higher split_vma places tmp vma below.
> > - */
> > -
> > - /* Does it split the first one? */
> > - if (start > vma->vm_start) {
> > -
> > - /*
> > - * Make sure that map_count on return from munmap() will
> > - * not exceed its limit; but let map_count go just above
> > - * its limit temporarily, to help free resources as expected.
> > - */
> > - if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > - goto map_count_exceeded;
> > -
> > - error = __split_vma(vmi, vma, start, 1);
> > - if (error)
> > - goto start_split_failed;
> > - }
> > -
> > - /*
> > - * Detach a range of VMAs from the mm. Using next as a temp variable as
> > - * it is always overwritten.
> > - */
> > - next = vma;
> > - do {
> > - /* Does it split the end? */
> > - if (next->vm_end > end) {
> > - error = __split_vma(vmi, next, end, 0);
> > - if (error)
> > - goto end_split_failed;
> > - }
> > - vma_start_write(next);
> > - mas_set(&mas_detach, count);
> > - error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
> > - if (error)
> > - goto munmap_gather_failed;
> > - vma_mark_detached(next, true);
> > - if (next->vm_flags & VM_LOCKED)
> > - locked_vm += vma_pages(next);
> > + return expand_downwards(vma, address);
> > +}
> >
> > - count++;
> > - if (unlikely(uf)) {
> > - /*
> > - * If userfaultfd_unmap_prep returns an error the vmas
> > - * will remain split, but userland will get a
> > - * highly unexpected error anyway. This is no
> > - * different than the case where the first of the two
> > - * __split_vma fails, but we don't undo the first
> > - * split, despite we could. This is unlikely enough
> > - * failure that it's not worth optimizing it for.
> > - */
> > - error = userfaultfd_unmap_prep(next, start, end, uf);
> > +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
> > +{
> > + struct vm_area_struct *vma;
> > + unsigned long start;
> >
> > - if (error)
> > - goto userfaultfd_error;
> > - }
> > -#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> > - BUG_ON(next->vm_start < start);
> > - BUG_ON(next->vm_start > end);
> > -#endif
> > - } for_each_vma_range(*vmi, next, end);
> > -
> > -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
> > - /* Make sure no VMAs are about to be lost. */
> > - {
> > - MA_STATE(test, &mt_detach, 0, 0);
> > - struct vm_area_struct *vma_mas, *vma_test;
> > - int test_count = 0;
> > -
> > - vma_iter_set(vmi, start);
> > - rcu_read_lock();
> > - vma_test = mas_find(&test, count - 1);
> > - for_each_vma_range(*vmi, vma_mas, end) {
> > - BUG_ON(vma_mas != vma_test);
> > - test_count++;
> > - vma_test = mas_next(&test, count - 1);
> > - }
> > - rcu_read_unlock();
> > - BUG_ON(count != test_count);
> > - }
> > + addr &= PAGE_MASK;
> > + vma = find_vma(mm, addr);
> > + if (!vma)
> > + return NULL;
> > + if (vma->vm_start <= addr)
> > + return vma;
> > + start = vma->vm_start;
> > + if (expand_stack_locked(vma, addr))
> > + return NULL;
> > + if (vma->vm_flags & VM_LOCKED)
> > + populate_vma_page_range(vma, addr, start, NULL);
> > + return vma;
> > +}
> > #endif
> >
> > - while (vma_iter_addr(vmi) > start)
> > - vma_iter_prev_range(vmi);
> > -
> > - error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
> > - if (error)
> > - goto clear_tree_failed;
> > -
> > - /* Point of no return */
> > - mm->locked_vm -= locked_vm;
> > - mm->map_count -= count;
> > - if (unlock)
> > - mmap_write_downgrade(mm);
> > +#if defined(CONFIG_STACK_GROWSUP)
> >
> > - prev = vma_iter_prev_range(vmi);
> > - next = vma_next(vmi);
> > - if (next)
> > - vma_iter_prev_range(vmi);
> > +#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
> > +#define vma_expand_down(vma, addr) (-EFAULT)
> >
> > - /*
> > - * We can free page tables without write-locking mmap_lock because VMAs
> > - * were isolated before we downgraded mmap_lock.
> > - */
> > - mas_set(&mas_detach, 1);
> > - unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
> > - !unlock);
> > - /* Statistics and freeing VMAs */
> > - mas_set(&mas_detach, 0);
> > - remove_mt(mm, &mas_detach);
> > - validate_mm(mm);
> > - if (unlock)
> > - mmap_read_unlock(mm);
> > +#else
> >
> > - __mt_destroy(&mt_detach);
> > - return 0;
> > +#define vma_expand_up(vma,addr) (-EFAULT)
> > +#define vma_expand_down(vma, addr) expand_downwards(vma, addr)
> >
> > -clear_tree_failed:
> > -userfaultfd_error:
> > -munmap_gather_failed:
> > -end_split_failed:
> > - mas_set(&mas_detach, 0);
> > - mas_for_each(&mas_detach, next, end)
> > - vma_mark_detached(next, false);
> > -
> > - __mt_destroy(&mt_detach);
> > -start_split_failed:
> > -map_count_exceeded:
> > - validate_mm(mm);
> > - return error;
> > -}
> > +#endif
> >
> > /*
> > - * do_vmi_munmap() - munmap a given range.
> > - * @vmi: The vma iterator
> > - * @mm: The mm_struct
> > - * @start: The start address to munmap
> > - * @len: The length of the range to munmap
> > - * @uf: The userfaultfd list_head
> > - * @unlock: set to true if the user wants to drop the mmap_lock on success
> > + * expand_stack(): legacy interface for page faulting. Don't use unless
> > + * you have to.
> > *
> > - * This function takes a @mas that is either pointing to the previous VMA or set
> > - * to MA_START and sets it up to remove the mapping(s). The @len will be
> > - * aligned and any arch_unmap work will be preformed.
> > + * This is called with the mm locked for reading, drops the lock, takes
> > + * the lock for writing, tries to look up a vma again, expands it if
> > + * necessary, and downgrades the lock to reading again.
> > *
> > - * Return: 0 on success and drops the lock if so directed, error and leaves the
> > - * lock held otherwise.
> > + * If no vma is found or it can't be expanded, it returns NULL and has
> > + * dropped the lock.
> > */
> > -int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > - unsigned long start, size_t len, struct list_head *uf,
> > - bool unlock)
> > +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
> > {
> > - unsigned long end;
> > - struct vm_area_struct *vma;
> > + struct vm_area_struct *vma, *prev;
> >
> > - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
> > - return -EINVAL;
> > + mmap_read_unlock(mm);
> > + if (mmap_write_lock_killable(mm))
> > + return NULL;
> >
> > - end = start + PAGE_ALIGN(len);
> > - if (end == start)
> > - return -EINVAL;
> > + vma = find_vma_prev(mm, addr, &prev);
> > + if (vma && vma->vm_start <= addr)
> > + goto success;
> >
> > - /*
> > - * Check if memory is sealed before arch_unmap.
> > - * Prevent unmapping a sealed VMA.
> > - * can_modify_mm assumes we have acquired the lock on MM.
> > - */
> > - if (unlikely(!can_modify_mm(mm, start, end)))
> > - return -EPERM;
> > + if (prev && !vma_expand_up(prev, addr)) {
> > + vma = prev;
> > + goto success;
> > + }
> >
> > - /* arch_unmap() might do unmaps itself. */
> > - arch_unmap(mm, start, end);
> > + if (vma && !vma_expand_down(vma, addr))
> > + goto success;
> >
> > - /* Find the first overlapping VMA */
> > - vma = vma_find(vmi, end);
> > - if (!vma) {
> > - if (unlock)
> > - mmap_write_unlock(mm);
> > - return 0;
> > - }
> > + mmap_write_unlock(mm);
> > + return NULL;
> >
> > - return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> > +success:
> > + mmap_write_downgrade(mm);
> > + return vma;
> > }
> >
> > /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
> > @@ -3460,92 +1972,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
> > return 0;
> > }
> >
> > -/*
> > - * Copy the vma structure to a new location in the same mm,
> > - * prior to moving page table entries, to effect an mremap move.
> > - */
> > -struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > - unsigned long addr, unsigned long len, pgoff_t pgoff,
> > - bool *need_rmap_locks)
> > -{
> > - struct vm_area_struct *vma = *vmap;
> > - unsigned long vma_start = vma->vm_start;
> > - struct mm_struct *mm = vma->vm_mm;
> > - struct vm_area_struct *new_vma, *prev;
> > - bool faulted_in_anon_vma = true;
> > - VMA_ITERATOR(vmi, mm, addr);
> > -
> > - /*
> > - * If anonymous vma has not yet been faulted, update new pgoff
> > - * to match new location, to increase its chance of merging.
> > - */
> > - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > - pgoff = addr >> PAGE_SHIFT;
> > - faulted_in_anon_vma = false;
> > - }
> > -
> > - new_vma = find_vma_prev(mm, addr, &prev);
> > - if (new_vma && new_vma->vm_start < addr + len)
> > - return NULL; /* should never get here */
> > -
> > - new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
> > - if (new_vma) {
> > - /*
> > - * Source vma may have been merged into new_vma
> > - */
> > - if (unlikely(vma_start >= new_vma->vm_start &&
> > - vma_start < new_vma->vm_end)) {
> > - /*
> > - * The only way we can get a vma_merge with
> > - * self during an mremap is if the vma hasn't
> > - * been faulted in yet and we were allowed to
> > - * reset the dst vma->vm_pgoff to the
> > - * destination address of the mremap to allow
> > - * the merge to happen. mremap must change the
> > - * vm_pgoff linearity between src and dst vmas
> > - * (in turn preventing a vma_merge) to be
> > - * safe. It is only safe to keep the vm_pgoff
> > - * linear if there are no pages mapped yet.
> > - */
> > - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
> > - *vmap = vma = new_vma;
> > - }
> > - *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
> > - } else {
> > - new_vma = vm_area_dup(vma);
> > - if (!new_vma)
> > - goto out;
> > - vma_set_range(new_vma, addr, addr + len, pgoff);
> > - if (vma_dup_policy(vma, new_vma))
> > - goto out_free_vma;
> > - if (anon_vma_clone(new_vma, vma))
> > - goto out_free_mempol;
> > - if (new_vma->vm_file)
> > - get_file(new_vma->vm_file);
> > - if (new_vma->vm_ops && new_vma->vm_ops->open)
> > - new_vma->vm_ops->open(new_vma);
> > - if (vma_link(mm, new_vma))
> > - goto out_vma_link;
> > - *need_rmap_locks = false;
> > - }
> > - return new_vma;
> > -
> > -out_vma_link:
> > - if (new_vma->vm_ops && new_vma->vm_ops->close)
> > - new_vma->vm_ops->close(new_vma);
> > -
> > - if (new_vma->vm_file)
> > - fput(new_vma->vm_file);
> > -
> > - unlink_anon_vmas(new_vma);
> > -out_free_mempol:
> > - mpol_put(vma_policy(new_vma));
> > -out_free_vma:
> > - vm_area_free(new_vma);
> > -out:
> > - return NULL;
> > -}
> > -
> > /*
> > * Return true if the calling process may expand its vm space by the passed
> > * number of pages
> > @@ -3743,203 +2169,6 @@ int install_special_mapping(struct mm_struct *mm,
> > return PTR_ERR_OR_ZERO(vma);
> > }
> >
> > -static DEFINE_MUTEX(mm_all_locks_mutex);
> > -
> > -static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
> > -{
> > - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
> > - /*
> > - * The LSB of head.next can't change from under us
> > - * because we hold the mm_all_locks_mutex.
> > - */
> > - down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
> > - /*
> > - * We can safely modify head.next after taking the
> > - * anon_vma->root->rwsem. If some other vma in this mm shares
> > - * the same anon_vma we won't take it again.
> > - *
> > - * No need of atomic instructions here, head.next
> > - * can't change from under us thanks to the
> > - * anon_vma->root->rwsem.
> > - */
> > - if (__test_and_set_bit(0, (unsigned long *)
> > - &anon_vma->root->rb_root.rb_root.rb_node))
> > - BUG();
> > - }
> > -}
> > -
> > -static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
> > -{
> > - if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
> > - /*
> > - * AS_MM_ALL_LOCKS can't change from under us because
> > - * we hold the mm_all_locks_mutex.
> > - *
> > - * Operations on ->flags have to be atomic because
> > - * even if AS_MM_ALL_LOCKS is stable thanks to the
> > - * mm_all_locks_mutex, there may be other cpus
> > - * changing other bitflags in parallel to us.
> > - */
> > - if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
> > - BUG();
> > - down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
> > - }
> > -}
> > -
> > -/*
> > - * This operation locks against the VM for all pte/vma/mm related
> > - * operations that could ever happen on a certain mm. This includes
> > - * vmtruncate, try_to_unmap, and all page faults.
> > - *
> > - * The caller must take the mmap_lock in write mode before calling
> > - * mm_take_all_locks(). The caller isn't allowed to release the
> > - * mmap_lock until mm_drop_all_locks() returns.
> > - *
> > - * mmap_lock in write mode is required in order to block all operations
> > - * that could modify pagetables and free pages without need of
> > - * altering the vma layout. It's also needed in write mode to avoid new
> > - * anon_vmas to be associated with existing vmas.
> > - *
> > - * A single task can't take more than one mm_take_all_locks() in a row
> > - * or it would deadlock.
> > - *
> > - * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
> > - * mapping->flags avoid to take the same lock twice, if more than one
> > - * vma in this mm is backed by the same anon_vma or address_space.
> > - *
> > - * We take locks in following order, accordingly to comment at beginning
> > - * of mm/rmap.c:
> > - * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
> > - * hugetlb mapping);
> > - * - all vmas marked locked
> > - * - all i_mmap_rwsem locks;
> > - * - all anon_vma->rwseml
> > - *
> > - * We can take all locks within these types randomly because the VM code
> > - * doesn't nest them and we protected from parallel mm_take_all_locks() by
> > - * mm_all_locks_mutex.
> > - *
> > - * mm_take_all_locks() and mm_drop_all_locks are expensive operations
> > - * that may have to take thousand of locks.
> > - *
> > - * mm_take_all_locks() can fail if it's interrupted by signals.
> > - */
> > -int mm_take_all_locks(struct mm_struct *mm)
> > -{
> > - struct vm_area_struct *vma;
> > - struct anon_vma_chain *avc;
> > - VMA_ITERATOR(vmi, mm, 0);
> > -
> > - mmap_assert_write_locked(mm);
> > -
> > - mutex_lock(&mm_all_locks_mutex);
> > -
> > - /*
> > - * vma_start_write() does not have a complement in mm_drop_all_locks()
> > - * because vma_start_write() is always asymmetrical; it marks a VMA as
> > - * being written to until mmap_write_unlock() or mmap_write_downgrade()
> > - * is reached.
> > - */
> > - for_each_vma(vmi, vma) {
> > - if (signal_pending(current))
> > - goto out_unlock;
> > - vma_start_write(vma);
> > - }
> > -
> > - vma_iter_init(&vmi, mm, 0);
> > - for_each_vma(vmi, vma) {
> > - if (signal_pending(current))
> > - goto out_unlock;
> > - if (vma->vm_file && vma->vm_file->f_mapping &&
> > - is_vm_hugetlb_page(vma))
> > - vm_lock_mapping(mm, vma->vm_file->f_mapping);
> > - }
> > -
> > - vma_iter_init(&vmi, mm, 0);
> > - for_each_vma(vmi, vma) {
> > - if (signal_pending(current))
> > - goto out_unlock;
> > - if (vma->vm_file && vma->vm_file->f_mapping &&
> > - !is_vm_hugetlb_page(vma))
> > - vm_lock_mapping(mm, vma->vm_file->f_mapping);
> > - }
> > -
> > - vma_iter_init(&vmi, mm, 0);
> > - for_each_vma(vmi, vma) {
> > - if (signal_pending(current))
> > - goto out_unlock;
> > - if (vma->anon_vma)
> > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > - vm_lock_anon_vma(mm, avc->anon_vma);
> > - }
> > -
> > - return 0;
> > -
> > -out_unlock:
> > - mm_drop_all_locks(mm);
> > - return -EINTR;
> > -}
> > -
> > -static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
> > -{
> > - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
> > - /*
> > - * The LSB of head.next can't change to 0 from under
> > - * us because we hold the mm_all_locks_mutex.
> > - *
> > - * We must however clear the bitflag before unlocking
> > - * the vma so the users using the anon_vma->rb_root will
> > - * never see our bitflag.
> > - *
> > - * No need of atomic instructions here, head.next
> > - * can't change from under us until we release the
> > - * anon_vma->root->rwsem.
> > - */
> > - if (!__test_and_clear_bit(0, (unsigned long *)
> > - &anon_vma->root->rb_root.rb_root.rb_node))
> > - BUG();
> > - anon_vma_unlock_write(anon_vma);
> > - }
> > -}
> > -
> > -static void vm_unlock_mapping(struct address_space *mapping)
> > -{
> > - if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
> > - /*
> > - * AS_MM_ALL_LOCKS can't change to 0 from under us
> > - * because we hold the mm_all_locks_mutex.
> > - */
> > - i_mmap_unlock_write(mapping);
> > - if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
> > - &mapping->flags))
> > - BUG();
> > - }
> > -}
> > -
> > -/*
> > - * The mmap_lock cannot be released by the caller until
> > - * mm_drop_all_locks() returns.
> > - */
> > -void mm_drop_all_locks(struct mm_struct *mm)
> > -{
> > - struct vm_area_struct *vma;
> > - struct anon_vma_chain *avc;
> > - VMA_ITERATOR(vmi, mm, 0);
> > -
> > - mmap_assert_write_locked(mm);
> > - BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
> > -
> > - for_each_vma(vmi, vma) {
> > - if (vma->anon_vma)
> > - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > - vm_unlock_anon_vma(avc->anon_vma);
> > - if (vma->vm_file && vma->vm_file->f_mapping)
> > - vm_unlock_mapping(vma->vm_file->f_mapping);
> > - }
> > -
> > - mutex_unlock(&mm_all_locks_mutex);
> > -}
> > -
> > /*
> > * vma_expand_bottom() - Expands the bottom of a VMA downwards. An error will
> > * arise if there is another VMA in the expanded range, or
> > diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> > index 8982e6139d07..fc18fe274505 100644
> > --- a/mm/mmu_notifier.c
> > +++ b/mm/mmu_notifier.c
> > @@ -19,6 +19,8 @@
> > #include <linux/sched/mm.h>
> > #include <linux/slab.h>
> >
> > +#include "vma.h"
> > +
> > /* global SRCU for all MMs */
> > DEFINE_STATIC_SRCU(srcu);
> >
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 222ab434da54..77951e2d0863 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -39,6 +39,7 @@
> > #include <asm/tlb.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> >
> > bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
> > pte_t pte)
> > diff --git a/mm/mremap.c b/mm/mremap.c
> > index e7ae140fc640..09ef3eb31fbf 100644
> > --- a/mm/mremap.c
> > +++ b/mm/mremap.c
> > @@ -31,6 +31,7 @@
> > #include <asm/pgalloc.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> >
> > static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
> > {
> > diff --git a/mm/mseal.c b/mm/mseal.c
> > index bf783bba8ed0..7bcceda42a1a 100644
> > --- a/mm/mseal.c
> > +++ b/mm/mseal.c
> > @@ -14,7 +14,9 @@
> > #include <linux/mmu_context.h>
> > #include <linux/syscalls.h>
> > #include <linux/sched.h>
> > +
> > #include "internal.h"
> > +#include "vma.h"
> >
> > static inline bool vma_is_sealed(struct vm_area_struct *vma)
> > {
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 8616308610b9..4dec7ab3638c 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -83,6 +83,7 @@
> > #include <trace/events/migrate.h>
> >
> > #include "internal.h"
> > +#include "vma.h"
> >
> > static struct kmem_cache *anon_vma_cachep;
> > static struct kmem_cache *anon_vma_chain_cachep;
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 950fe6b2f0f7..30be083788be 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -17,7 +17,9 @@
> > #include <linux/shmem_fs.h>
> > #include <asm/tlbflush.h>
> > #include <asm/tlb.h>
> > +
> > #include "internal.h"
> > +#include "vma.h"
> >
> > static __always_inline
> > bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
> > diff --git a/mm/vma.c b/mm/vma.c
> > new file mode 100644
> > index 000000000000..bf0546fe6eab
> > --- /dev/null
> > +++ b/mm/vma.c
> > @@ -0,0 +1,1766 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +
> > +/*
> > + * VMA-specific functions.
> > + */
> > +
> > +#include "vma_internal.h"
> > +#include "vma.h"
> > +
> > +/*
> > + * If the vma has a ->close operation then the driver probably needs to release
> > + * per-vma resources, so we don't attempt to merge those if the caller indicates
> > + * the current vma may be removed as part of the merge.
> > + */
> > +static inline bool is_mergeable_vma(struct vm_area_struct *vma,
> > + struct file *file, unsigned long vm_flags,
> > + struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > + struct anon_vma_name *anon_name, bool may_remove_vma)
> > +{
> > + /*
> > + * VM_SOFTDIRTY should not prevent VMA merging: if the flags
> > + * match apart from the soft-dirty bit, the caller should mark
> > + * the merged VMA as dirty. If the dirty bit were not excluded
> > + * from the comparison, we would increase pressure on the memory
> > + * system by forcing the kernel to generate new VMAs when an old
> > + * one could be extended instead.
> > + */
> > + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
> > + return false;
> > + if (vma->vm_file != file)
> > + return false;
> > + if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
> > + return false;
> > + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> > + return false;
> > + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> > + return false;
> > + return true;
> > +}
> > +
> > +static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
> > + struct anon_vma *anon_vma2, struct vm_area_struct *vma)
> > +{
> > + /*
> > + * The list_is_singular() test is to avoid merging VMAs cloned from
> > + * parents. This can reduce the scalability problems caused by the
> > + * anon_vma lock.
> > + */
> > + if ((!anon_vma1 || !anon_vma2) && (!vma ||
> > + list_is_singular(&vma->anon_vma_chain)))
> > + return true;
> > + return anon_vma1 == anon_vma2;
> > +}
> > +
> > +/*
> > + * init_multi_vma_prep() - Initializer for struct vma_prepare
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + * @next: The next vma if it is to be adjusted
> > + * @remove: The first vma to be removed
> > + * @remove2: The second vma to be removed
> > + */
> > +static void init_multi_vma_prep(struct vma_prepare *vp,
> > + struct vm_area_struct *vma,
> > + struct vm_area_struct *next,
> > + struct vm_area_struct *remove,
> > + struct vm_area_struct *remove2)
> > +{
> > + memset(vp, 0, sizeof(struct vma_prepare));
> > + vp->vma = vma;
> > + vp->anon_vma = vma->anon_vma;
> > + vp->remove = remove;
> > + vp->remove2 = remove2;
> > + vp->adj_next = next;
> > + if (!vp->anon_vma && next)
> > + vp->anon_vma = next->anon_vma;
> > +
> > + vp->file = vma->vm_file;
> > + if (vp->file)
> > + vp->mapping = vma->vm_file->f_mapping;
> > +
> > +}
> > +
> > +/*
> > + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
> > + * in front of (at a lower virtual address and file offset than) the vma.
> > + *
> > + * We cannot merge two vmas if they have differently assigned (non-NULL)
> > + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
> > + *
> > + * We don't check here for the merged mmap wrapping around the end of pagecache
> > + * indices (16TB on ia32) because do_mmap() does not permit mmap's which
> > + * wrap, nor mmaps which cover the final page at index -1UL.
> > + *
> > + * We assume the vma may be removed as part of the merge.
> > + */
> > +bool
> > +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > + struct anon_vma *anon_vma, struct file *file,
> > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > + struct anon_vma_name *anon_name)
> > +{
> > + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
> > + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > + if (vma->vm_pgoff == vm_pgoff)
> > + return true;
> > + }
> > + return false;
> > +}
> > +
> > +/*
> > + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
> > + * beyond (at a higher virtual address and file offset than) the vma.
> > + *
> > + * We cannot merge two vmas if they have differently assigned (non-NULL)
> > + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
> > + *
> > + * We assume that vma is not removed as part of the merge.
> > + */
> > +bool
> > +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > + struct anon_vma *anon_vma, struct file *file,
> > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > + struct anon_vma_name *anon_name)
> > +{
> > + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
> > + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> > + pgoff_t vm_pglen;
> > +
> > + vm_pglen = vma_pages(vma);
> > + if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> > + return true;
> > + }
> > + return false;
> > +}
> > +
> > +/*
> > + * Close a vm structure and free it.
> > + */
> > +void remove_vma(struct vm_area_struct *vma, bool unreachable)
> > +{
> > + might_sleep();
> > + if (vma->vm_ops && vma->vm_ops->close)
> > + vma->vm_ops->close(vma);
> > + if (vma->vm_file)
> > + fput(vma->vm_file);
> > + mpol_put(vma_policy(vma));
> > + if (unreachable)
> > + __vm_area_free(vma);
> > + else
> > + vm_area_free(vma);
> > +}
> > +
> > +/*
> > + * Get rid of page table information in the indicated region.
> > + *
> > + * Called with the mm semaphore held.
> > + */
> > +void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > + struct vm_area_struct *vma, struct vm_area_struct *prev,
> > + struct vm_area_struct *next, unsigned long start,
> > + unsigned long end, unsigned long tree_end, bool mm_wr_locked)
> > +{
> > + struct mmu_gather tlb;
> > + unsigned long mt_start = mas->index;
> > +
> > + lru_add_drain();
> > + tlb_gather_mmu(&tlb, mm);
> > + update_hiwater_rss(mm);
> > + unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);
> > + mas_set(mas, mt_start);
> > + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> > + next ? next->vm_start : USER_PGTABLES_CEILING,
> > + mm_wr_locked);
> > + tlb_finish_mmu(&tlb);
> > +}
> > +
> > +/*
> > + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
> > + * has already been checked or doesn't make sense to fail.
> > + * VMA Iterator will point to the end VMA.
> > + */
> > +static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + unsigned long addr, int new_below)
> > +{
> > + struct vma_prepare vp;
> > + struct vm_area_struct *new;
> > + int err;
> > +
> > + WARN_ON(vma->vm_start >= addr);
> > + WARN_ON(vma->vm_end <= addr);
> > +
> > + if (vma->vm_ops && vma->vm_ops->may_split) {
> > + err = vma->vm_ops->may_split(vma, addr);
> > + if (err)
> > + return err;
> > + }
> > +
> > + new = vm_area_dup(vma);
> > + if (!new)
> > + return -ENOMEM;
> > +
> > + if (new_below) {
> > + new->vm_end = addr;
> > + } else {
> > + new->vm_start = addr;
> > + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
> > + }
> > +
> > + err = -ENOMEM;
> > + vma_iter_config(vmi, new->vm_start, new->vm_end);
> > + if (vma_iter_prealloc(vmi, new))
> > + goto out_free_vma;
> > +
> > + err = vma_dup_policy(vma, new);
> > + if (err)
> > + goto out_free_vmi;
> > +
> > + err = anon_vma_clone(new, vma);
> > + if (err)
> > + goto out_free_mpol;
> > +
> > + if (new->vm_file)
> > + get_file(new->vm_file);
> > +
> > + if (new->vm_ops && new->vm_ops->open)
> > + new->vm_ops->open(new);
> > +
> > + vma_start_write(vma);
> > + vma_start_write(new);
> > +
> > + init_vma_prep(&vp, vma);
> > + vp.insert = new;
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
> > +
> > + if (new_below) {
> > + vma->vm_start = addr;
> > + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
> > + } else {
> > + vma->vm_end = addr;
> > + }
> > +
> > + /* vma_complete stores the new vma */
> > + vma_complete(&vp, vmi, vma->vm_mm);
> > +
> > + /* Success. */
> > + if (new_below)
> > + vma_next(vmi);
> > + return 0;
> > +
> > +out_free_mpol:
> > + mpol_put(vma_policy(new));
> > +out_free_vmi:
> > + vma_iter_free(vmi);
> > +out_free_vma:
> > + vm_area_free(new);
> > + return err;
> > +}
> > +
> > +/*
> > + * Split a vma into two pieces at address 'addr', a new vma is allocated
> > + * either for the first part or the tail.
> > + */
> > +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + unsigned long addr, int new_below)
> > +{
> > + if (vma->vm_mm->map_count >= sysctl_max_map_count)
> > + return -ENOMEM;
> > +
> > + return __split_vma(vmi, vma, addr, new_below);
> > +}
> > +
> > +/*
> > + * Ok - we have the memory areas we should free on a maple tree so release them,
> > + * and do the vma updates.
> > + *
> > + * Called with the mm semaphore held.
> > + */
> > +static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
> > +{
> > + unsigned long nr_accounted = 0;
> > + struct vm_area_struct *vma;
> > +
> > + /* Update high watermark before we lower total_vm */
> > + update_hiwater_vm(mm);
> > + mas_for_each(mas, vma, ULONG_MAX) {
> > + long nrpages = vma_pages(vma);
> > +
> > + if (vma->vm_flags & VM_ACCOUNT)
> > + nr_accounted += nrpages;
> > + vm_stat_account(mm, vma->vm_flags, -nrpages);
> > + remove_vma(vma, false);
> > + }
> > + vm_unacct_memory(nr_accounted);
> > +}
> > +
> > +/*
> > + * init_vma_prep() - Initializer wrapper for vma_prepare struct
> > + * @vp: The vma_prepare struct
> > + * @vma: The vma that will be altered once locked
> > + */
> > +void init_vma_prep(struct vma_prepare *vp,
> > + struct vm_area_struct *vma)
> > +{
> > + init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
> > +}
> > +
> > +/*
> > + * Requires inode->i_mapping->i_mmap_rwsem
> > + */
> > +static void __remove_shared_vm_struct(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_unmap_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_remove(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma has some anon_vma assigned, and is already inserted on that
> > + * anon_vma's interval trees.
> > + *
> > + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
> > + * vma must be removed from the anon_vma's interval trees using
> > + * anon_vma_interval_tree_pre_update_vma().
> > + *
> > + * After the update, the vma will be reinserted using
> > + * anon_vma_interval_tree_post_update_vma().
> > + *
> > + * The entire update must be protected by exclusive mmap_lock and by
> > + * the root anon_vma's mutex.
> > + */
> > +void
> > +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
> > +{
> > + struct anon_vma_chain *avc;
> > +
> > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
> > +}
> > +
> > +void
> > +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> > +{
> > + struct anon_vma_chain *avc;
> > +
> > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
> > +}
> > +
> > +static void __vma_link_file(struct vm_area_struct *vma,
> > + struct address_space *mapping)
> > +{
> > + if (vma_is_shared_maywrite(vma))
> > + mapping_allow_writable(mapping);
> > +
> > + flush_dcache_mmap_lock(mapping);
> > + vma_interval_tree_insert(vma, &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > +}
> > +
> > +/*
> > + * vma_prepare() - Helper function for handling locking VMAs prior to altering
> > + * @vp: The initialized vma_prepare struct
> > + */
> > +void vma_prepare(struct vma_prepare *vp)
> > +{
> > + if (vp->file) {
> > + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
> > +
> > + if (vp->adj_next)
> > + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
> > + vp->adj_next->vm_end);
> > +
> > + i_mmap_lock_write(vp->mapping);
> > + if (vp->insert && vp->insert->vm_file) {
> > + /*
> > + * Put into interval tree now, so instantiated pages
> > + * are visible to arm/parisc __flush_dcache_page
> > + * throughout; but we cannot insert into address
> > + * space until vma start or end is updated.
> > + */
> > + __vma_link_file(vp->insert,
> > + vp->insert->vm_file->f_mapping);
> > + }
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_lock_write(vp->anon_vma);
> > + anon_vma_interval_tree_pre_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_pre_update_vma(vp->adj_next);
> > + }
> > +
> > + if (vp->file) {
> > + flush_dcache_mmap_lock(vp->mapping);
> > + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
> > + if (vp->adj_next)
> > + vma_interval_tree_remove(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + }
> > +
> > +}
> > +
> > +/*
> > + * dup_anon_vma() - Helper function to duplicate anon_vma
> > + * @dst: The destination VMA
> > + * @src: The source VMA
> > + * @dup: Pointer to the destination VMA when successful.
> > + *
> > + * Returns: 0 on success.
> > + */
> > +static int dup_anon_vma(struct vm_area_struct *dst,
> > + struct vm_area_struct *src, struct vm_area_struct **dup)
> > +{
> > + /*
> > + * Easily overlooked: when mprotect shifts the boundary, make sure the
> > + * expanding vma has anon_vma set if the shrinking vma had, to cover any
> > + * anon pages imported.
> > + */
> > + if (src->anon_vma && !dst->anon_vma) {
> > + int ret;
> > +
> > + vma_assert_write_locked(dst);
> > + dst->anon_vma = src->anon_vma;
> > + ret = anon_vma_clone(dst, src);
> > + if (ret)
> > + return ret;
> > +
> > + *dup = dst;
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> > +void validate_mm(struct mm_struct *mm)
> > +{
> > + int bug = 0;
> > + int i = 0;
> > + struct vm_area_struct *vma;
> > + VMA_ITERATOR(vmi, mm, 0);
> > +
> > + mt_validate(&mm->mm_mt);
> > + for_each_vma(vmi, vma) {
> > +#ifdef CONFIG_DEBUG_VM_RB
> > + struct anon_vma *anon_vma = vma->anon_vma;
> > + struct anon_vma_chain *avc;
> > +#endif
> > + unsigned long vmi_start, vmi_end;
> > + bool warn = 0;
> > +
> > + vmi_start = vma_iter_addr(&vmi);
> > + vmi_end = vma_iter_end(&vmi);
> > + if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
> > + warn = 1;
> > +
> > + if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
> > + warn = 1;
> > +
> > + if (warn) {
> > + pr_emerg("issue in %s\n", current->comm);
> > + dump_stack();
> > + dump_vma(vma);
> > + pr_emerg("tree range: %px start %lx end %lx\n", vma,
> > + vmi_start, vmi_end - 1);
> > + vma_iter_dump_tree(&vmi);
> > + }
> > +
> > +#ifdef CONFIG_DEBUG_VM_RB
> > + if (anon_vma) {
> > + anon_vma_lock_read(anon_vma);
> > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > + anon_vma_interval_tree_verify(avc);
> > + anon_vma_unlock_read(anon_vma);
> > + }
> > +#endif
> > + i++;
> > + }
> > + if (i != mm->map_count) {
> > + pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
> > + bug = 1;
> > + }
> > + VM_BUG_ON_MM(bug, mm);
> > +}
> > +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
> > +
> > +/*
> > + * vma_expand - Expand an existing VMA
> > + *
> > + * @vmi: The vma iterator
> > + * @vma: The vma to expand
> > + * @start: The start of the vma
> > + * @end: The exclusive end of the vma
> > + * @pgoff: The page offset of vma
> > + * @next: The current or next vma.
> > + *
> > + * Expand @vma to @start and @end. Can expand off the start and end. Will
> > + * expand over @next if it's different from @vma and @end == @next->vm_end.
> > + * Checking if the @vma can expand and merge with @next needs to be handled by
> > + * the caller.
> > + *
> > + * Returns: 0 on success
> > + */
> > +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end, pgoff_t pgoff,
> > + struct vm_area_struct *next)
> > +{
> > + struct vm_area_struct *anon_dup = NULL;
> > + bool remove_next = false;
> > + struct vma_prepare vp;
> > +
> > + vma_start_write(vma);
> > + if (next && (vma != next) && (end == next->vm_end)) {
> > + int ret;
> > +
> > + remove_next = true;
> > + vma_start_write(next);
> > + ret = dup_anon_vma(vma, next, &anon_dup);
> > + if (ret)
> > + return ret;
> > + }
> > +
> > + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
> > + /* Not merging but overwriting any part of next is not handled. */
> > + VM_WARN_ON(next && !vp.remove &&
> > + next != vma && end > next->vm_start);
> > + /* Only handles expanding */
> > + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
> > +
> > + /* Note: vma iterator must be pointing to 'start' */
> > + vma_iter_config(vmi, start, end);
> > + if (vma_iter_prealloc(vmi, vma))
> > + goto nomem;
> > +
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vma, start, end, 0);
> > + vma_set_range(vma, start, end, pgoff);
> > + vma_iter_store(vmi, vma);
> > +
> > + vma_complete(&vp, vmi, vma->vm_mm);
> > + return 0;
> > +
> > +nomem:
> > + if (anon_dup)
> > + unlink_anon_vmas(anon_dup);
> > + return -ENOMEM;
> > +}
> > +
> > +/*
> > + * vma_shrink() - Reduce an existing VMA's memory area
> > + * @vmi: The vma iterator
> > + * @vma: The VMA to modify
> > + * @start: The new start
> > + * @end: The new end
> > + *
> > + * Returns: 0 on success, -ENOMEM otherwise
> > + */
> > +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end, pgoff_t pgoff)
> > +{
> > + struct vma_prepare vp;
> > +
> > + WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
> > +
> > + if (vma->vm_start < start)
> > + vma_iter_config(vmi, vma->vm_start, start);
> > + else
> > + vma_iter_config(vmi, end, vma->vm_end);
> > +
> > + if (vma_iter_prealloc(vmi, NULL))
> > + return -ENOMEM;
> > +
> > + vma_start_write(vma);
> > +
> > + init_vma_prep(&vp, vma);
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vma, start, end, 0);
> > +
> > + vma_iter_clear(vmi);
> > + vma_set_range(vma, start, end, pgoff);
> > + vma_complete(&vp, vmi, vma->vm_mm);
> > + return 0;
> > +}
> > +
> > +/*
> > + * vma_complete() - Helper function for handling the unlocking after altering VMAs,
> > + * or for inserting a VMA.
> > + *
> > + * @vp: The vma_prepare struct
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + */
> > +void vma_complete(struct vma_prepare *vp,
> > + struct vma_iterator *vmi, struct mm_struct *mm)
> > +{
> > + if (vp->file) {
> > + if (vp->adj_next)
> > + vma_interval_tree_insert(vp->adj_next,
> > + &vp->mapping->i_mmap);
> > + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
> > + flush_dcache_mmap_unlock(vp->mapping);
> > + }
> > +
> > + if (vp->remove && vp->file) {
> > + __remove_shared_vm_struct(vp->remove, vp->mapping);
> > + if (vp->remove2)
> > + __remove_shared_vm_struct(vp->remove2, vp->mapping);
> > + } else if (vp->insert) {
> > + /*
> > + * split_vma has split insert from vma, and needs
> > + * us to insert it before dropping the locks
> > + * (it may either follow vma or precede it).
> > + */
> > + vma_iter_store(vmi, vp->insert);
> > + mm->map_count++;
> > + }
> > +
> > + if (vp->anon_vma) {
> > + anon_vma_interval_tree_post_update_vma(vp->vma);
> > + if (vp->adj_next)
> > + anon_vma_interval_tree_post_update_vma(vp->adj_next);
> > + anon_vma_unlock_write(vp->anon_vma);
> > + }
> > +
> > + if (vp->file) {
> > + i_mmap_unlock_write(vp->mapping);
> > + uprobe_mmap(vp->vma);
> > +
> > + if (vp->adj_next)
> > + uprobe_mmap(vp->adj_next);
> > + }
> > +
> > + if (vp->remove) {
> > +again:
> > + vma_mark_detached(vp->remove, true);
> > + if (vp->file) {
> > + uprobe_munmap(vp->remove, vp->remove->vm_start,
> > + vp->remove->vm_end);
> > + fput(vp->file);
> > + }
> > + if (vp->remove->anon_vma)
> > + anon_vma_merge(vp->vma, vp->remove);
> > + mm->map_count--;
> > + mpol_put(vma_policy(vp->remove));
> > + if (!vp->remove2)
> > + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
> > + vm_area_free(vp->remove);
> > +
> > + /*
> > + * In mprotect's case 6 (see comments on vma_merge),
> > + * we are removing both mid and next vmas
> > + */
> > + if (vp->remove2) {
> > + vp->remove = vp->remove2;
> > + vp->remove2 = NULL;
> > + goto again;
> > + }
> > + }
> > + if (vp->insert && vp->file)
> > + uprobe_mmap(vp->insert);
> > + validate_mm(mm);
> > +}
> > +
> > +/*
> > + * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
> > + * @vmi: The vma iterator
> > + * @vma: The starting vm_area_struct
> > + * @mm: The mm_struct
> > + * @start: The aligned start address to munmap.
> > + * @end: The aligned end address to munmap.
> > + * @uf: The userfaultfd list_head
> > + * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
> > + * success.
> > + *
> > + * Return: 0 on success and drops the lock if so directed, error and leaves the
> > + * lock held otherwise.
> > + */
> > +int
> > +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + struct mm_struct *mm, unsigned long start,
> > + unsigned long end, struct list_head *uf, bool unlock)
> > +{
> > + struct vm_area_struct *prev, *next = NULL;
> > + struct maple_tree mt_detach;
> > + int count = 0;
> > + int error = -ENOMEM;
> > + unsigned long locked_vm = 0;
> > + MA_STATE(mas_detach, &mt_detach, 0, 0);
> > + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
> > + mt_on_stack(mt_detach);
> > +
> > + /*
> > + * If we need to split any vma, do it now to save pain later.
> > + *
> > + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
> > + * unmapped vm_area_struct will remain in use: so lower split_vma
> > + * places tmp vma above, and higher split_vma places tmp vma below.
> > + */
> > +
> > + /* Does it split the first one? */
> > + if (start > vma->vm_start) {
> > +
> > + /*
> > + * Make sure that map_count on return from munmap() will
> > + * not exceed its limit; but let map_count go just above
> > + * its limit temporarily, to help free resources as expected.
> > + */
> > + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
> > + goto map_count_exceeded;
> > +
> > + error = __split_vma(vmi, vma, start, 1);
> > + if (error)
> > + goto start_split_failed;
> > + }
> > +
> > + /*
> > + * Detach a range of VMAs from the mm. Using next as a temp variable as
> > + * it is always overwritten.
> > + */
> > + next = vma;
> > + do {
> > + /* Does it split the end? */
> > + if (next->vm_end > end) {
> > + error = __split_vma(vmi, next, end, 0);
> > + if (error)
> > + goto end_split_failed;
> > + }
> > + vma_start_write(next);
> > + mas_set(&mas_detach, count);
> > + error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
> > + if (error)
> > + goto munmap_gather_failed;
> > + vma_mark_detached(next, true);
> > + if (next->vm_flags & VM_LOCKED)
> > + locked_vm += vma_pages(next);
> > +
> > + count++;
> > + if (unlikely(uf)) {
> > + /*
> > + * If userfaultfd_unmap_prep returns an error the vmas
> > + * will remain split, but userland will get a
> > + * highly unexpected error anyway. This is no
> > + * different than the case where the first of the two
> > + * __split_vma fails, but we don't undo the first
> > + * split, even though we could. This failure is unlikely
> > + * enough that it's not worth optimizing for.
> > + */
> > + error = userfaultfd_unmap_prep(next, start, end, uf);
> > +
> > + if (error)
> > + goto userfaultfd_error;
> > + }
> > +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> > + BUG_ON(next->vm_start < start);
> > + BUG_ON(next->vm_start > end);
> > +#endif
> > + } for_each_vma_range(*vmi, next, end);
> > +
> > +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
> > + /* Make sure no VMAs are about to be lost. */
> > + {
> > + MA_STATE(test, &mt_detach, 0, 0);
> > + struct vm_area_struct *vma_mas, *vma_test;
> > + int test_count = 0;
> > +
> > + vma_iter_set(vmi, start);
> > + rcu_read_lock();
> > + vma_test = mas_find(&test, count - 1);
> > + for_each_vma_range(*vmi, vma_mas, end) {
> > + BUG_ON(vma_mas != vma_test);
> > + test_count++;
> > + vma_test = mas_next(&test, count - 1);
> > + }
> > + rcu_read_unlock();
> > + BUG_ON(count != test_count);
> > + }
> > +#endif
> > +
> > + while (vma_iter_addr(vmi) > start)
> > + vma_iter_prev_range(vmi);
> > +
> > + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
> > + if (error)
> > + goto clear_tree_failed;
> > +
> > + /* Point of no return */
> > + mm->locked_vm -= locked_vm;
> > + mm->map_count -= count;
> > + if (unlock)
> > + mmap_write_downgrade(mm);
> > +
> > + prev = vma_iter_prev_range(vmi);
> > + next = vma_next(vmi);
> > + if (next)
> > + vma_iter_prev_range(vmi);
> > +
> > + /*
> > + * We can free page tables without write-locking mmap_lock because VMAs
> > + * were isolated before we downgraded mmap_lock.
> > + */
> > + mas_set(&mas_detach, 1);
> > + unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
> > + !unlock);
> > + /* Statistics and freeing VMAs */
> > + mas_set(&mas_detach, 0);
> > + remove_mt(mm, &mas_detach);
> > + validate_mm(mm);
> > + if (unlock)
> > + mmap_read_unlock(mm);
> > +
> > + __mt_destroy(&mt_detach);
> > + return 0;
> > +
> > +clear_tree_failed:
> > +userfaultfd_error:
> > +munmap_gather_failed:
> > +end_split_failed:
> > + mas_set(&mas_detach, 0);
> > + mas_for_each(&mas_detach, next, end)
> > + vma_mark_detached(next, false);
> > +
> > + __mt_destroy(&mt_detach);
> > +start_split_failed:
> > +map_count_exceeded:
> > + validate_mm(mm);
> > + return error;
> > +}
> > +
> > +/*
> > + * do_vmi_munmap() - munmap a given range.
> > + * @vmi: The vma iterator
> > + * @mm: The mm_struct
> > + * @start: The start address to munmap
> > + * @len: The length of the range to munmap
> > + * @uf: The userfaultfd list_head
> > + * @unlock: set to true if the user wants to drop the mmap_lock on success
> > + *
> > + * This function takes a @vmi that is either pointing to the previous VMA or set
> > + * to MA_START and sets it up to remove the mapping(s). The @len will be
> > + * aligned and any arch_unmap work will be performed.
> > + *
> > + * Return: 0 on success and drops the lock if so directed, error and leaves the
> > + * lock held otherwise.
> > + */
> > +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > + unsigned long start, size_t len, struct list_head *uf,
> > + bool unlock)
> > +{
> > + unsigned long end;
> > + struct vm_area_struct *vma;
> > +
> > + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
> > + return -EINVAL;
> > +
> > + end = start + PAGE_ALIGN(len);
> > + if (end == start)
> > + return -EINVAL;
> > +
> > + /*
> > + * Check if memory is sealed before arch_unmap.
> > + * Prevent unmapping a sealed VMA.
> > + * can_modify_mm assumes we have acquired the lock on MM.
> > + */
> > + if (unlikely(!can_modify_mm(mm, start, end)))
> > + return -EPERM;
> > +
> > + /* arch_unmap() might do unmaps itself. */
> > + arch_unmap(mm, start, end);
> > +
> > + /* Find the first overlapping VMA */
> > + vma = vma_find(vmi, end);
> > + if (!vma) {
> > + if (unlock)
> > + mmap_write_unlock(mm);
> > + return 0;
> > + }
> > +
> > + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
> > +}
> > +
> > +/*
> > + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
> > + * figure out whether that can be merged with its predecessor or its
> > + * successor. Or both (it neatly fills a hole).
> > + *
> > + * In most cases - when called for mmap, brk or mremap - [addr,end) is
> > + * certain not to be mapped by the time vma_merge is called; but when
> > + * called for mprotect, it is certain to be already mapped (either at
> > + * an offset within prev, or at the start of next), and the flags of
> > + * this area are about to be changed to vm_flags - and the no-change
> > + * case has already been eliminated.
> > + *
> > + * The following mprotect cases have to be considered, where **** is
> > + * the area passed down from mprotect_fixup, never extending beyond one
> > + * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
> > + * at the same address as **** and is of the same or larger span, and
> > + * NNNN the next vma after ****:
> > + *
> > + * **** **** ****
> > + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
> > + * cannot merge might become might become
> > + * PPNNNNNNNNNN PPPPPPPPPPCC
> > + * mmap, brk or case 4 below case 5 below
> > + * mremap move:
> > + * **** ****
> > + * PPPP NNNN PPPPCCCCNNNN
> > + * might become might become
> > + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
> > + * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
> > + * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
> > + *
> > + * It is important for case 8 that the vma CCCC overlapping the
> > + * region **** is never going to be extended over NNNN. Instead NNNN must
> > + * be extended in region **** and CCCC must be removed. This way in
> > + * all cases where vma_merge succeeds, the moment vma_merge drops the
> > + * rmap_locks, the properties of the merged vma will be already
> > + * correct for the whole merged range. Some of those properties like
> > + * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
> > + * be correct for the whole merged range immediately after the
> > + * rmap_locks are released. Otherwise if NNNN would be removed and
> > + * CCCC would be extended over the NNNN range, remove_migration_ptes
> > + * or other rmap walkers (if working on addresses beyond the "end"
> > + * parameter) may establish ptes with the wrong permissions of CCCC
> > + * instead of the right permissions of NNNN.
> > + *
> > + * In the code below:
> > + * PPPP is represented by *prev
> > + * CCCC is represented by *curr or not represented at all (NULL)
> > + * NNNN is represented by *next or not represented at all (NULL)
> > + * **** is not represented - it will be merged and the vma containing the
> > + * area is returned, or the function will return NULL
> > + */
> > +static struct vm_area_struct
> > +*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > + struct vm_area_struct *src, unsigned long addr, unsigned long end,
> > + unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
> > + struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > + struct anon_vma_name *anon_name)
> > +{
> > + struct mm_struct *mm = src->vm_mm;
> > + struct anon_vma *anon_vma = src->anon_vma;
> > + struct file *file = src->vm_file;
> > + struct vm_area_struct *curr, *next, *res;
> > + struct vm_area_struct *vma, *adjust, *remove, *remove2;
> > + struct vm_area_struct *anon_dup = NULL;
> > + struct vma_prepare vp;
> > + pgoff_t vma_pgoff;
> > + int err = 0;
> > + bool merge_prev = false;
> > + bool merge_next = false;
> > + bool vma_expanded = false;
> > + unsigned long vma_start = addr;
> > + unsigned long vma_end = end;
> > + pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> > + long adj_start = 0;
> > +
> > + /*
> > + * We later require that vma->vm_flags == vm_flags,
> > + * so this tests vma->vm_flags & VM_SPECIAL, too.
> > + */
> > + if (vm_flags & VM_SPECIAL)
> > + return NULL;
> > +
> > + /* Does the input range span an existing VMA? (cases 5 - 8) */
> > + curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
> > +
> > + if (!curr || /* cases 1 - 4 */
> > + end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
> > + next = vma_lookup(mm, end);
> > + else
> > + next = NULL; /* case 5 */
> > +
> > + if (prev) {
> > + vma_start = prev->vm_start;
> > + vma_pgoff = prev->vm_pgoff;
> > +
> > + /* Can we merge the predecessor? */
> > + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
> > + && can_vma_merge_after(prev, vm_flags, anon_vma, file,
> > + pgoff, vm_userfaultfd_ctx, anon_name)) {
> > + merge_prev = true;
> > + vma_prev(vmi);
> > + }
> > + }
> > +
> > + /* Can we merge the successor? */
> > + if (next && mpol_equal(policy, vma_policy(next)) &&
> > + can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
> > + vm_userfaultfd_ctx, anon_name)) {
> > + merge_next = true;
> > + }
> > +
> > + /* Verify some invariants that must be enforced by the caller. */
> > + VM_WARN_ON(prev && addr <= prev->vm_start);
> > + VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
> > + VM_WARN_ON(addr >= end);
> > +
> > + if (!merge_prev && !merge_next)
> > + return NULL; /* Not mergeable. */
> > +
> > + if (merge_prev)
> > + vma_start_write(prev);
> > +
> > + res = vma = prev;
> > + remove = remove2 = adjust = NULL;
> > +
> > + /* Can we merge both the predecessor and the successor? */
> > + if (merge_prev && merge_next &&
> > + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
> > + vma_start_write(next);
> > + remove = next; /* case 1 */
> > + vma_end = next->vm_end;
> > + err = dup_anon_vma(prev, next, &anon_dup);
> > + if (curr) { /* case 6 */
> > + vma_start_write(curr);
> > + remove = curr;
> > + remove2 = next;
> > + /*
> > + * Note that the dup_anon_vma below cannot overwrite err
> > + * since the first call would do nothing unless next
> > + * has an anon_vma.
> > + */
> > + if (!next->anon_vma)
> > + err = dup_anon_vma(prev, curr, &anon_dup);
> > + }
> > + } else if (merge_prev) { /* case 2 */
> > + if (curr) {
> > + vma_start_write(curr);
> > + if (end == curr->vm_end) { /* case 7 */
> > + /*
> > + * can_vma_merge_after() assumed we would not be
> > + * removing prev vma, so it skipped the check
> > + * for vm_ops->close, but we are removing curr
> > + */
> > + if (curr->vm_ops && curr->vm_ops->close)
> > + err = -EINVAL;
> > + remove = curr;
> > + } else { /* case 5 */
> > + adjust = curr;
> > + adj_start = (end - curr->vm_start);
> > + }
> > + if (!err)
> > + err = dup_anon_vma(prev, curr, &anon_dup);
> > + }
> > + } else { /* merge_next */
> > + vma_start_write(next);
> > + res = next;
> > + if (prev && addr < prev->vm_end) { /* case 4 */
> > + vma_start_write(prev);
> > + vma_end = addr;
> > + adjust = next;
> > + adj_start = -(prev->vm_end - addr);
> > + err = dup_anon_vma(next, prev, &anon_dup);
> > + } else {
> > + /*
> > + * Note that cases 3 and 8 are the ONLY ones where prev
> > + * is permitted to be (but is not necessarily) NULL.
> > + */
> > + vma = next; /* case 3 */
> > + vma_start = addr;
> > + vma_end = next->vm_end;
> > + vma_pgoff = next->vm_pgoff - pglen;
> > + if (curr) { /* case 8 */
> > + vma_pgoff = curr->vm_pgoff;
> > + vma_start_write(curr);
> > + remove = curr;
> > + err = dup_anon_vma(next, curr, &anon_dup);
> > + }
> > + }
> > + }
> > +
> > + /* Error in anon_vma clone. */
> > + if (err)
> > + goto anon_vma_fail;
> > +
> > + if (vma_start < vma->vm_start || vma_end > vma->vm_end)
> > + vma_expanded = true;
> > +
> > + if (vma_expanded) {
> > + vma_iter_config(vmi, vma_start, vma_end);
> > + } else {
> > + vma_iter_config(vmi, adjust->vm_start + adj_start,
> > + adjust->vm_end);
> > + }
> > +
> > + if (vma_iter_prealloc(vmi, vma))
> > + goto prealloc_fail;
> > +
> > + init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
> > + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
> > + vp.anon_vma != adjust->anon_vma);
> > +
> > + vma_prepare(&vp);
> > + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
> > + vma_set_range(vma, vma_start, vma_end, vma_pgoff);
> > +
> > + if (vma_expanded)
> > + vma_iter_store(vmi, vma);
> > +
> > + if (adj_start) {
> > + adjust->vm_start += adj_start;
> > + adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
> > + if (adj_start < 0) {
> > + WARN_ON(vma_expanded);
> > + vma_iter_store(vmi, next);
> > + }
> > + }
> > +
> > + vma_complete(&vp, vmi, mm);
> > + khugepaged_enter_vma(res, vm_flags);
> > + return res;
> > +
> > +prealloc_fail:
> > + if (anon_dup)
> > + unlink_anon_vmas(anon_dup);
> > +
> > +anon_vma_fail:
> > + vma_iter_set(vmi, addr);
> > + vma_iter_load(vmi);
> > + return NULL;
> > +}
> > +
> > +/*
> > + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
> > + * context and anonymous VMA name within the range [start, end).
> > + *
> > + * As a result, we might be able to merge the newly modified VMA range with an
> > + * adjacent VMA with identical properties.
> > + *
> > + * If no merge is possible and the range does not span the entirety of the VMA,
> > + * we then need to split the VMA to accommodate the change.
> > + *
> > + * The function returns either the merged VMA, the original VMA if a split was
> > + * required instead, or an error if the split failed.
> > + */
> > +struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long vm_flags,
> > + struct mempolicy *policy,
> > + struct vm_userfaultfd_ctx uffd_ctx,
> > + struct anon_vma_name *anon_name)
> > +{
> > + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> > + struct vm_area_struct *merged;
> > +
> > + merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
> > + pgoff, policy, uffd_ctx, anon_name);
> > + if (merged)
> > + return merged;
> > +
> > + if (vma->vm_start < start) {
> > + int err = split_vma(vmi, vma, start, 1);
> > +
> > + if (err)
> > + return ERR_PTR(err);
> > + }
> > +
> > + if (vma->vm_end > end) {
> > + int err = split_vma(vmi, vma, end, 0);
> > +
> > + if (err)
> > + return ERR_PTR(err);
> > + }
> > +
> > + return vma;
> > +}
> > +
> > +/*
> > + * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
> > + * must ensure that [start, end) does not overlap any existing VMA.
> > + */
> > +struct vm_area_struct
> > +*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > + struct vm_area_struct *vma, unsigned long start,
> > + unsigned long end, pgoff_t pgoff)
> > +{
> > + return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
> > + vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > +}
> > +
> > +/*
> > + * Expand vma by delta bytes, potentially merging with an immediately adjacent
> > + * VMA with identical properties.
> > + */
> > +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > + struct vm_area_struct *vma,
> > + unsigned long delta)
> > +{
> > + pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
> > +
> > + /* vma is specified as prev, so case 1 or 2 will apply. */
> > + return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
> > + vma->vm_flags, pgoff, vma_policy(vma),
> > + vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > +}
> > +
> > +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > +{
> > + vb->count = 0;
> > +}
> > +
> > +static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
> > +{
> > + struct address_space *mapping;
> > + int i;
> > +
> > + mapping = vb->vmas[0]->vm_file->f_mapping;
> > + i_mmap_lock_write(mapping);
> > + for (i = 0; i < vb->count; i++) {
> > + VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
> > + __remove_shared_vm_struct(vb->vmas[i], mapping);
> > + }
> > + i_mmap_unlock_write(mapping);
> > +
> > + unlink_file_vma_batch_init(vb);
> > +}
> > +
> > +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
> > + struct vm_area_struct *vma)
> > +{
> > + if (vma->vm_file == NULL)
> > + return;
> > +
> > + if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
> > + vb->count == ARRAY_SIZE(vb->vmas))
> > + unlink_file_vma_batch_process(vb);
> > +
> > + vb->vmas[vb->count] = vma;
> > + vb->count++;
> > +}
> > +
> > +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
> > +{
> > + if (vb->count > 0)
> > + unlink_file_vma_batch_process(vb);
> > +}
> > +
> > +/*
> > + * Unlink a file-based vm structure from its interval tree, to hide
> > + * vma from rmap and vmtruncate before freeing its page tables.
> > + */
> > +void unlink_file_vma(struct vm_area_struct *vma)
> > +{
> > + struct file *file = vma->vm_file;
> > +
> > + if (file) {
> > + struct address_space *mapping = file->f_mapping;
> > +
> > + i_mmap_lock_write(mapping);
> > + __remove_shared_vm_struct(vma, mapping);
> > + i_mmap_unlock_write(mapping);
> > + }
> > +}
> > +
> > +void vma_link_file(struct vm_area_struct *vma)
> > +{
> > + struct file *file = vma->vm_file;
> > + struct address_space *mapping;
> > +
> > + if (file) {
> > + mapping = file->f_mapping;
> > + i_mmap_lock_write(mapping);
> > + __vma_link_file(vma, mapping);
> > + i_mmap_unlock_write(mapping);
> > + }
> > +}
> > +
> > +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
> > +{
> > + VMA_ITERATOR(vmi, mm, 0);
> > +
> > + vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
> > + if (vma_iter_prealloc(&vmi, vma))
> > + return -ENOMEM;
> > +
> > + vma_start_write(vma);
> > + vma_iter_store(&vmi, vma);
> > + vma_link_file(vma);
> > + mm->map_count++;
> > + validate_mm(mm);
> > + return 0;
> > +}
> > +
> > +/*
> > + * Copy the vma structure to a new location in the same mm,
> > + * prior to moving page table entries, to effect an mremap move.
> > + */
> > +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > + unsigned long addr, unsigned long len, pgoff_t pgoff,
> > + bool *need_rmap_locks)
> > +{
> > + struct vm_area_struct *vma = *vmap;
> > + unsigned long vma_start = vma->vm_start;
> > + struct mm_struct *mm = vma->vm_mm;
> > + struct vm_area_struct *new_vma, *prev;
> > + bool faulted_in_anon_vma = true;
> > + VMA_ITERATOR(vmi, mm, addr);
> > +
> > + /*
> > + * If anonymous vma has not yet been faulted, update new pgoff
> > + * to match new location, to increase its chance of merging.
> > + */
> > + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > + pgoff = addr >> PAGE_SHIFT;
> > + faulted_in_anon_vma = false;
> > + }
> > +
> > + new_vma = find_vma_prev(mm, addr, &prev);
> > + if (new_vma && new_vma->vm_start < addr + len)
> > + return NULL; /* should never get here */
> > +
> > + new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
> > + if (new_vma) {
> > + /*
> > + * Source vma may have been merged into new_vma
> > + */
> > + if (unlikely(vma_start >= new_vma->vm_start &&
> > + vma_start < new_vma->vm_end)) {
> > + /*
> > + * The only way we can get a vma_merge with
> > + * self during an mremap is if the vma hasn't
> > + * been faulted in yet and we were allowed to
> > + * reset the dst vma->vm_pgoff to the
> > + * destination address of the mremap to allow
> > + * the merge to happen. mremap must change the
> > + * vm_pgoff linearity between src and dst vmas
> > + * (in turn preventing a vma_merge) to be
> > + * safe. It is only safe to keep the vm_pgoff
> > + * linear if there are no pages mapped yet.
> > + */
> > + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
> > + *vmap = vma = new_vma;
> > + }
> > + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
> > + } else {
> > + new_vma = vm_area_dup(vma);
> > + if (!new_vma)
> > + goto out;
> > + vma_set_range(new_vma, addr, addr + len, pgoff);
> > + if (vma_dup_policy(vma, new_vma))
> > + goto out_free_vma;
> > + if (anon_vma_clone(new_vma, vma))
> > + goto out_free_mempol;
> > + if (new_vma->vm_file)
> > + get_file(new_vma->vm_file);
> > + if (new_vma->vm_ops && new_vma->vm_ops->open)
> > + new_vma->vm_ops->open(new_vma);
> > + if (vma_link(mm, new_vma))
> > + goto out_vma_link;
> > + *need_rmap_locks = false;
> > + }
> > + return new_vma;
> > +
> > +out_vma_link:
> > + if (new_vma->vm_ops && new_vma->vm_ops->close)
> > + new_vma->vm_ops->close(new_vma);
> > +
> > + if (new_vma->vm_file)
> > + fput(new_vma->vm_file);
> > +
> > + unlink_anon_vmas(new_vma);
> > +out_free_mempol:
> > + mpol_put(vma_policy(new_vma));
> > +out_free_vma:
> > + vm_area_free(new_vma);
> > +out:
> > + return NULL;
> > +}
> > +
> > +/*
> > + * Rough compatibility check to quickly see if it's even worth looking
> > + * at sharing an anon_vma.
> > + *
> > + * They need to have the same vm_file, and the flags can only differ
> > + * in things that mprotect may change.
> > + *
> > + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
> > + * we can merge the two vma's. For example, we refuse to merge a vma if
> > + * there is a vm_ops->close() function, because that indicates that the
> > + * driver is doing some kind of reference counting. But that doesn't
> > + * really matter for the anon_vma sharing case.
> > + */
> > +static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
> > +{
> > + return a->vm_end == b->vm_start &&
> > + mpol_equal(vma_policy(a), vma_policy(b)) &&
> > + a->vm_file == b->vm_file &&
> > + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
> > + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
> > +}
> > +
> > +/*
> > + * Do some basic sanity checking to see if we can re-use the anon_vma
> > + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
> > + * the same as 'old', the other will be the new one that is trying
> > + * to share the anon_vma.
> > + *
> > + * NOTE! This runs with mmap_lock held for reading, so it is possible that
> > + * the anon_vma of 'old' is concurrently in the process of being set up
> > + * by another page fault trying to merge _that_. But that's ok: if it
> > + * is being set up, that automatically means that it will be a singleton
> > + * acceptable for merging, so we can do all of this optimistically. But
> > + * we do that READ_ONCE() to make sure that we never re-load the pointer.
> > + *
> > + * IOW: that the "list_is_singular()" test on the anon_vma_chain only
> > + * matters for the 'stable anon_vma' case (ie the thing we want to avoid
> > + * is to return an anon_vma that is "complex" due to having gone through
> > + * a fork).
> > + *
> > + * We also make sure that the two vma's are compatible (adjacent,
> > + * and with the same memory policies). That's all stable, even with just
> > + * a read lock on the mmap_lock.
> > + */
> > +static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
> > + struct vm_area_struct *a,
> > + struct vm_area_struct *b)
> > +{
> > + if (anon_vma_compatible(a, b)) {
> > + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
> > +
> > + if (anon_vma && list_is_singular(&old->anon_vma_chain))
> > + return anon_vma;
> > + }
> > + return NULL;
> > +}
> > +
> > +/*
> > + * find_mergeable_anon_vma is used by anon_vma_prepare, to check
> > + * neighbouring vmas for a suitable anon_vma, before it goes off
> > + * to allocate a new anon_vma. It checks because a repetitive
> > + * sequence of mprotects and faults may otherwise lead to distinct
> > + * anon_vmas being allocated, preventing vma merge in subsequent
> > + * mprotect.
> > + */
> > +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
> > +{
> > + struct anon_vma *anon_vma = NULL;
> > + struct vm_area_struct *prev, *next;
> > + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
> > +
> > + /* Try next first. */
> > + next = vma_iter_load(&vmi);
> > + if (next) {
> > + anon_vma = reusable_anon_vma(next, vma, next);
> > + if (anon_vma)
> > + return anon_vma;
> > + }
> > +
> > + prev = vma_prev(&vmi);
> > + VM_BUG_ON_VMA(prev != vma, vma);
> > + prev = vma_prev(&vmi);
> > + /* Try prev next. */
> > + if (prev)
> > + anon_vma = reusable_anon_vma(prev, prev, vma);
> > +
> > + /*
> > + * We might reach here with anon_vma == NULL if we can't find
> > + * any reusable anon_vma.
> > + * There's no absolute need to look only at touching neighbours:
> > + * we could search further afield for "compatible" anon_vmas.
> > + * But it would probably just be a waste of time searching,
> > + * or lead to too many vmas hanging off the same anon_vma.
> > + * We're trying to allow mprotect remerging later on,
> > + * not trying to minimize memory used for anon_vmas.
> > + */
> > + return anon_vma;
> > +}
> > +
> > +static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
> > +{
> > + return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
> > +}
> > +
> > +static bool vma_is_shared_writable(struct vm_area_struct *vma)
> > +{
> > + return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
> > + (VM_WRITE | VM_SHARED);
> > +}
> > +
> > +static bool vma_fs_can_writeback(struct vm_area_struct *vma)
> > +{
> > + /* No managed pages to writeback. */
> > + if (vma->vm_flags & VM_PFNMAP)
> > + return false;
> > +
> > + return vma->vm_file && vma->vm_file->f_mapping &&
> > + mapping_can_writeback(vma->vm_file->f_mapping);
> > +}
> > +
> > +/*
> > + * Does this VMA require the underlying folios to have their dirty state
> > + * tracked?
> > + */
> > +bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
> > +{
> > + /* Only shared, writable VMAs require dirty tracking. */
> > + if (!vma_is_shared_writable(vma))
> > + return false;
> > +
> > + /* Does the filesystem need to be notified? */
> > + if (vm_ops_needs_writenotify(vma->vm_ops))
> > + return true;
> > +
> > + /*
> > + * Even if the filesystem doesn't indicate a need for writenotify, if it
> > + * can writeback, dirty tracking is still required.
> > + */
> > + return vma_fs_can_writeback(vma);
> > +}
> > +
> > +/*
> > + * Some shared mappings will want the pages marked read-only
> > + * to track write events. If so, we'll downgrade vm_page_prot
> > + * to the private version (using protection_map[] without the
> > + * VM_SHARED bit).
> > + */
> > +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
> > +{
> > + /* If it was private or non-writable, the write bit is already clear */
> > + if (!vma_is_shared_writable(vma))
> > + return false;
> > +
> > + /* The backer wishes to know when pages are first written to? */
> > + if (vm_ops_needs_writenotify(vma->vm_ops))
> > + return true;
> > +
> > + /* The open routine did something to the protections that pgprot_modify
> > + * won't preserve? */
> > + if (pgprot_val(vm_page_prot) !=
> > + pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
> > + return false;
> > +
> > + /*
> > + * Do we need to track softdirty? hugetlb does not support softdirty
> > + * tracking yet.
> > + */
> > + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
> > + return true;
> > +
> > + /* Do we need write faults for uffd-wp tracking? */
> > + if (userfaultfd_wp(vma))
> > + return true;
> > +
> > + /* Can the mapping track the dirty pages? */
> > + return vma_fs_can_writeback(vma);
> > +}
> > +
> > +unsigned long count_vma_pages_range(struct mm_struct *mm,
> > + unsigned long addr, unsigned long end)
> > +{
> > + VMA_ITERATOR(vmi, mm, addr);
> > + struct vm_area_struct *vma;
> > + unsigned long nr_pages = 0;
> > +
> > + for_each_vma_range(vmi, vma, end) {
> > + unsigned long vm_start = max(addr, vma->vm_start);
> > + unsigned long vm_end = min(end, vma->vm_end);
> > +
> > + nr_pages += PHYS_PFN(vm_end - vm_start);
> > + }
> > +
> > + return nr_pages;
> > +}
> > +
> > +static DEFINE_MUTEX(mm_all_locks_mutex);
> > +
> > +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
> > +{
> > + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
> > + /*
> > + * The LSB of the root anon_vma's rb_node pointer can't change
> > + * from under us because we hold the mm_all_locks_mutex.
> > + */
> > + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
> > + /*
> > + * We can safely modify the rb_node LSB after taking the
> > + * anon_vma->root->rwsem. If some other vma in this mm shares
> > + * the same anon_vma we won't take it again.
> > + *
> > + * No need for atomic instructions here: the rb_node pointer
> > + * can't change from under us thanks to the
> > + * anon_vma->root->rwsem.
> > + */
> > + if (__test_and_set_bit(0, (unsigned long *)
> > + &anon_vma->root->rb_root.rb_root.rb_node))
> > + BUG();
> > + }
> > +}
> > +
> > +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
> > +{
> > + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
> > + /*
> > + * AS_MM_ALL_LOCKS can't change from under us because
> > + * we hold the mm_all_locks_mutex.
> > + *
> > + * Operations on ->flags have to be atomic because
> > + * even if AS_MM_ALL_LOCKS is stable thanks to the
> > + * mm_all_locks_mutex, there may be other cpus
> > + * changing other bitflags in parallel to us.
> > + */
> > + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
> > + BUG();
> > + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
> > + }
> > +}
> > +
> > +/*
> > + * This operation locks against the VM for all pte/vma/mm related
> > + * operations that could ever happen on a certain mm. This includes
> > + * vmtruncate, try_to_unmap, and all page faults.
> > + *
> > + * The caller must take the mmap_lock in write mode before calling
> > + * mm_take_all_locks(). The caller isn't allowed to release the
> > + * mmap_lock until mm_drop_all_locks() returns.
> > + *
> > + * mmap_lock in write mode is required in order to block all operations
> > + * that could modify pagetables and free pages without need of
> > + * altering the vma layout. It's also needed in write mode to avoid new
> > + * anon_vmas to be associated with existing vmas.
> > + *
> > + * A single task can't take more than one mm_take_all_locks() in a row
> > + * or it would deadlock.
> > + *
> > + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
> > + * mapping->flags prevent taking the same lock twice, if more than one
> > + * vma in this mm is backed by the same anon_vma or address_space.
> > + *
> > + * We take locks in the following order, according to the comment at the
> > + * beginning of mm/rmap.c:
> > + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
> > + * hugetlb mapping);
> > + * - all vmas marked locked
> > + * - all i_mmap_rwsem locks;
> > + * - all anon_vma->rwsem
> > + *
> > + * We can take all locks within these types randomly because the VM code
> > + * doesn't nest them and we are protected from parallel mm_take_all_locks()
> > + * by mm_all_locks_mutex.
> > + *
> > + * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
> > + * that may have to take thousands of locks.
> > + *
> > + * mm_take_all_locks() can fail if it's interrupted by signals.
> > + */
> > +int mm_take_all_locks(struct mm_struct *mm)
> > +{
> > + struct vm_area_struct *vma;
> > + struct anon_vma_chain *avc;
> > + VMA_ITERATOR(vmi, mm, 0);
> > +
> > + mmap_assert_write_locked(mm);
> > +
> > + mutex_lock(&mm_all_locks_mutex);
> > +
> > + /*
> > + * vma_start_write() does not have a complement in mm_drop_all_locks()
> > + * because vma_start_write() is always asymmetrical; it marks a VMA as
> > + * being written to until mmap_write_unlock() or mmap_write_downgrade()
> > + * is reached.
> > + */
> > + for_each_vma(vmi, vma) {
> > + if (signal_pending(current))
> > + goto out_unlock;
> > + vma_start_write(vma);
> > + }
> > +
> > + vma_iter_init(&vmi, mm, 0);
> > + for_each_vma(vmi, vma) {
> > + if (signal_pending(current))
> > + goto out_unlock;
> > + if (vma->vm_file && vma->vm_file->f_mapping &&
> > + is_vm_hugetlb_page(vma))
> > + vm_lock_mapping(mm, vma->vm_file->f_mapping);
> > + }
> > +
> > + vma_iter_init(&vmi, mm, 0);
> > + for_each_vma(vmi, vma) {
> > + if (signal_pending(current))
> > + goto out_unlock;
> > + if (vma->vm_file && vma->vm_file->f_mapping &&
> > + !is_vm_hugetlb_page(vma))
> > + vm_lock_mapping(mm, vma->vm_file->f_mapping);
> > + }
> > +
> > + vma_iter_init(&vmi, mm, 0);
> > + for_each_vma(vmi, vma) {
> > + if (signal_pending(current))
> > + goto out_unlock;
> > + if (vma->anon_vma)
> > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > + vm_lock_anon_vma(mm, avc->anon_vma);
> > + }
> > +
> > + return 0;
> > +
> > +out_unlock:
> > + mm_drop_all_locks(mm);
> > + return -EINTR;
> > +}
> > +
> > +static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
> > +{
> > + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
> > + /*
> > + * The LSB of the root anon_vma's rb_node pointer can't change
> > + * to 0 from under us because we hold the mm_all_locks_mutex.
> > + *
> > + * We must however clear the bitflag before unlocking
> > + * the anon_vma so the users using the anon_vma->rb_root will
> > + * never see our bitflag.
> > + *
> > + * No need for atomic instructions here: the rb_node pointer
> > + * can't change from under us until we release the
> > + * anon_vma->root->rwsem.
> > + */
> > + if (!__test_and_clear_bit(0, (unsigned long *)
> > + &anon_vma->root->rb_root.rb_root.rb_node))
> > + BUG();
> > + anon_vma_unlock_write(anon_vma);
> > + }
> > +}
> > +
> > +static void vm_unlock_mapping(struct address_space *mapping)
> > +{
> > + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
> > + /*
> > + * AS_MM_ALL_LOCKS can't change to 0 from under us
> > + * because we hold the mm_all_locks_mutex.
> > + */
> > + i_mmap_unlock_write(mapping);
> > + if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
> > + &mapping->flags))
> > + BUG();
> > + }
> > +}
> > +
> > +/*
> > + * The mmap_lock cannot be released by the caller until
> > + * mm_drop_all_locks() returns.
> > + */
> > +void mm_drop_all_locks(struct mm_struct *mm)
> > +{
> > + struct vm_area_struct *vma;
> > + struct anon_vma_chain *avc;
> > + VMA_ITERATOR(vmi, mm, 0);
> > +
> > + mmap_assert_write_locked(mm);
> > + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
> > +
> > + for_each_vma(vmi, vma) {
> > + if (vma->anon_vma)
> > + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > + vm_unlock_anon_vma(avc->anon_vma);
> > + if (vma->vm_file && vma->vm_file->f_mapping)
> > + vm_unlock_mapping(vma->vm_file->f_mapping);
> > + }
> > +
> > + mutex_unlock(&mm_all_locks_mutex);
> > +}
> > diff --git a/mm/vma.h b/mm/vma.h
> > new file mode 100644
> > index 000000000000..15d82dbb7213
> > --- /dev/null
> > +++ b/mm/vma.h
> > @@ -0,0 +1,356 @@
> > +/* SPDX-License-Identifier: GPL-2.0-or-later */
> > +/*
> > + * vma.h
> > + *
> > + * Core VMA manipulation API implemented in vma.c.
> > + */
> > +#ifndef __MM_VMA_H
> > +#define __MM_VMA_H
> > +
> > +/*
> > + * VMA lock generalization
> > + */
> > +struct vma_prepare {
> > + struct vm_area_struct *vma;
> > + struct vm_area_struct *adj_next;
> > + struct file *file;
> > + struct address_space *mapping;
> > + struct anon_vma *anon_vma;
> > + struct vm_area_struct *insert;
> > + struct vm_area_struct *remove;
> > + struct vm_area_struct *remove2;
> > +};
> > +
> > +struct unlink_vma_file_batch {
> > + int count;
> > + struct vm_area_struct *vmas[8];
> > +};
> > +
> > +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> > +void validate_mm(struct mm_struct *mm);
> > +#else
> > +#define validate_mm(mm) do { } while (0)
> > +#endif
> > +
> > +/* Required for expand_downwards(). */
> > +void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
> > +
> > +/* Required for expand_downwards(). */
> > +void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
> > +
> > +/* Required for do_brk_flags(). */
> > +void vma_prepare(struct vma_prepare *vp);
> > +
> > +/* Required for do_brk_flags(). */
> > +void init_vma_prep(struct vma_prepare *vp,
> > + struct vm_area_struct *vma);
> > +
> > +/* Required for do_brk_flags(). */
> > +void vma_complete(struct vma_prepare *vp,
> > + struct vma_iterator *vmi, struct mm_struct *mm);
> > +
> > +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end, pgoff_t pgoff,
> > + struct vm_area_struct *next);
> > +
> > +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end, pgoff_t pgoff);
> > +
> > +int
> > +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > + struct mm_struct *mm, unsigned long start,
> > + unsigned long end, struct list_head *uf, bool unlock);
> > +
> > +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> > + unsigned long start, size_t len, struct list_head *uf,
> > + bool unlock);
> > +
> > +void remove_vma(struct vm_area_struct *vma, bool unreachable);
> > +
> > +void unmap_region(struct mm_struct *mm, struct ma_state *mas,
> > + struct vm_area_struct *vma, struct vm_area_struct *prev,
> > + struct vm_area_struct *next, unsigned long start,
> > + unsigned long end, unsigned long tree_end, bool mm_wr_locked);
> > +
> > +/* Required by mmap_region(). */
> > +bool
> > +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> > + struct anon_vma *anon_vma, struct file *file,
> > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > + struct anon_vma_name *anon_name);
> > +
> > +/* Required by mmap_region() and do_brk_flags(). */
> > +bool
> > +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> > + struct anon_vma *anon_vma, struct file *file,
> > + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> > + struct anon_vma_name *anon_name);
> > +
> > +struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long vm_flags,
> > + struct mempolicy *policy,
> > + struct vm_userfaultfd_ctx uffd_ctx,
> > + struct anon_vma_name *anon_name);
> > +
> > +/* We are about to modify the VMA's flags. */
> > +static inline struct vm_area_struct
> > +*vma_modify_flags(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags)
> > +{
> > + return vma_modify(vmi, prev, vma, start, end, new_flags,
> > + vma_policy(vma), vma->vm_userfaultfd_ctx,
> > + anon_vma_name(vma));
> > +}
> > +
> > +/* We are about to modify the VMA's flags and/or anon_name. */
> > +static inline struct vm_area_struct
> > +*vma_modify_flags_name(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start,
> > + unsigned long end,
> > + unsigned long new_flags,
> > + struct anon_vma_name *new_name)
> > +{
> > + return vma_modify(vmi, prev, vma, start, end, new_flags,
> > + vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
> > +}
> > +
> > +/* We are about to modify the VMA's memory policy. */
> > +static inline struct vm_area_struct
> > +*vma_modify_policy(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + struct mempolicy *new_pol)
> > +{
> > + return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
> > + new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> > +}
> > +
> > +/* We are about to modify the VMA's flags and/or uffd context. */
> > +static inline struct vm_area_struct
> > +*vma_modify_flags_uffd(struct vma_iterator *vmi,
> > + struct vm_area_struct *prev,
> > + struct vm_area_struct *vma,
> > + unsigned long start, unsigned long end,
> > + unsigned long new_flags,
> > + struct vm_userfaultfd_ctx new_ctx)
> > +{
> > + return vma_modify(vmi, prev, vma, start, end, new_flags,
> > + vma_policy(vma), new_ctx, anon_vma_name(vma));
> > +}
> > +
> > +struct vm_area_struct
> > +*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
> > + struct vm_area_struct *vma, unsigned long start,
> > + unsigned long end, pgoff_t pgoff);
> > +
> > +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
> > + struct vm_area_struct *vma,
> > + unsigned long delta);
> > +
> > +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
> > +
> > +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);
> > +
> > +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
> > + struct vm_area_struct *vma);
> > +
> > +void unlink_file_vma(struct vm_area_struct *vma);
> > +
> > +void vma_link_file(struct vm_area_struct *vma);
> > +
> > +int vma_link(struct mm_struct *mm, struct vm_area_struct *vma);
> > +
> > +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > + unsigned long addr, unsigned long len, pgoff_t pgoff,
> > + bool *need_rmap_locks);
> > +
> > +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma);
> > +
> > +bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
> > +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
> > +
> > +int mm_take_all_locks(struct mm_struct *mm);
> > +void mm_drop_all_locks(struct mm_struct *mm);
> > +unsigned long count_vma_pages_range(struct mm_struct *mm,
> > + unsigned long addr, unsigned long end);
> > +
> > +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
> > +{
> > + /*
> > + * We want to check manually if we can change individual PTEs writable
> > + * if we can't do that automatically for all PTEs in a mapping. For
> > + * private mappings, that's always the case when we have write
> > + * permissions as we properly have to handle COW.
> > + */
> > + if (vma->vm_flags & VM_SHARED)
> > + return vma_wants_writenotify(vma, vma->vm_page_prot);
> > + return !!(vma->vm_flags & VM_WRITE);
> > +}
> > +
> > +static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
> > +{
> > + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
> > +}
> > +
> > +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
> > + unsigned long min)
> > +{
> > + return mas_prev(&vmi->mas, min);
> > +}
> > +
> > +static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
> > + struct vm_area_struct *vma, gfp_t gfp)
> > +{
> > + if (vmi->mas.status != ma_start &&
> > + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> > + vma_iter_invalidate(vmi);
> > +
> > + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> > + mas_store_gfp(&vmi->mas, vma, gfp);
> > + if (unlikely(mas_is_err(&vmi->mas)))
> > + return -ENOMEM;
> > +
> > + return 0;
> > +}
> > +
> > +
> > +/*
> > + * These three helpers classify VMAs for virtual memory accounting.
> > + */
> > +
> > +/*
> > + * Executable code area - executable, not writable, not stack
> > + */
> > +static inline bool is_exec_mapping(vm_flags_t flags)
> > +{
> > + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
> > +}
> > +
> > +/*
> > + * Stack area (including shadow stacks)
> > + *
> > + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
> > + * do_mmap() forbids all other combinations.
> > + */
> > +static inline bool is_stack_mapping(vm_flags_t flags)
> > +{
> > + return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
> > +}
> > +
> > +/*
> > + * Data area - private, writable, not stack
> > + */
> > +static inline bool is_data_mapping(vm_flags_t flags)
> > +{
> > + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
> > +}
> > +
> > +
> > +static inline void vma_iter_config(struct vma_iterator *vmi,
> > + unsigned long index, unsigned long last)
> > +{
> > + __mas_set_range(&vmi->mas, index, last - 1);
> > +}
> > +
> > +static inline void vma_iter_reset(struct vma_iterator *vmi)
> > +{
> > + mas_reset(&vmi->mas);
> > +}
> > +
> > +static inline
> > +struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
> > +{
> > + return mas_prev_range(&vmi->mas, min);
> > +}
> > +
> > +static inline
> > +struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
> > +{
> > + return mas_next_range(&vmi->mas, max);
> > +}
> > +
> > +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
> > + unsigned long max, unsigned long size)
> > +{
> > + return mas_empty_area(&vmi->mas, min, max - 1, size);
> > +}
> > +
> > +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
> > + unsigned long max, unsigned long size)
> > +{
> > + return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
> > +}
> > +
> > +/*
> > + * VMA Iterator functions shared between nommu and mmap
> > + */
> > +static inline int vma_iter_prealloc(struct vma_iterator *vmi,
> > + struct vm_area_struct *vma)
> > +{
> > + return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
> > +}
> > +
> > +static inline void vma_iter_clear(struct vma_iterator *vmi)
> > +{
> > + mas_store_prealloc(&vmi->mas, NULL);
> > +}
> > +
> > +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
> > +{
> > + return mas_walk(&vmi->mas);
> > +}
> > +
> > +/* Store a VMA with preallocated memory */
> > +static inline void vma_iter_store(struct vma_iterator *vmi,
> > + struct vm_area_struct *vma)
> > +{
> > +
> > +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
> > + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
> > + vmi->mas.index > vma->vm_start)) {
> > + pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
> > + vmi->mas.index, vma->vm_start, vma->vm_start,
> > + vma->vm_end, vmi->mas.index, vmi->mas.last);
> > + }
> > + if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
> > + vmi->mas.last < vma->vm_start)) {
> > + pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
> > + vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
> > + vmi->mas.index, vmi->mas.last);
> > + }
> > +#endif
> > +
> > + if (vmi->mas.status != ma_start &&
> > + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
> > + vma_iter_invalidate(vmi);
> > +
> > + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
> > + mas_store_prealloc(&vmi->mas, vma);
> > +}
> > +
> > +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
> > +{
> > + return vmi->mas.index;
> > +}
> > +
> > +static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
> > +{
> > + return vmi->mas.last + 1;
> > +}
> > +
> > +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
> > + unsigned long count)
> > +{
> > + return mas_expected_entries(&vmi->mas, count);
> > +}
> > +
> > +#endif /* __MM_VMA_H */
> > diff --git a/mm/vma_internal.h b/mm/vma_internal.h
> > new file mode 100644
> > index 000000000000..51b2010f30c0
> > --- /dev/null
> > +++ b/mm/vma_internal.h
> > @@ -0,0 +1,143 @@
> > +/* SPDX-License-Identifier: GPL-2.0-or-later */
> > +/*
> > + * vma_internal.h
> > + *
> > + * Headers required by vma.c, which can be substituted accordingly when testing
> > + * VMA functionality.
> > + */
> > +
>
> This should probably have header guards for the testing side?
>
> > +/* For fundamental mm types and VMA_ITERATOR(),
> > + * tlb_gather_mmu(), tlb_finish_mmu().
> > + */
> > +#include <linux/mm_types.h>
> > +
> > +/* For mapping_can_writeback(). */
> > +#include <linux/backing-dev.h>
> > +
> > +/*
> > + * For test_bit(), test_and_set_bit(), __test_and_set_bit(),
> > + * test_and_clear_bit().
> > + */
> > +#include <linux/bitops.h>
> > +
> > +/* For WARN_ON(), WARN_ON_ONCE(), BUG_ON(). */
> > +#include <linux/bug.h>
> > +
> > +/* For ERR_PTR(). */
> > +#include <linux/err.h>
> > +
> > +/* For fput(). */
> > +#include <linux/file.h>
> > +
> > +/*
> > + * For get_file(), mapping_unmap_writable(), i_mmap_lock_write(),
> > + * i_mmap_unlock_write().
> > + */
> > +#include <linux/fs.h>
> > +
> > +/* For vma_adjust_trans_huge(). */
> > +#include <linux/huge_mm.h>
> > +
> > +/* For is_vm_hugetlb_page(). */
> > +#include <linux/hugetlb_inline.h>
> > +
> > +/* For might_sleep(). */
> > +#include <linux/kernel.h>
> > +
> > +/* For khugepaged_enter_vma(). */
> > +#include <linux/khugepaged.h>
> > +
> > +/* For maple tree operations. */
> > +#include <linux/maple_tree.h>
> > +
> > +/* For mpol_put(), vma_policy(), vma_dup_policy(), mpol_equal(). */
> > +#include <linux/mempolicy.h>
> > +
> > +/*
> > + * For VM flags, update_hiwater_rss(), __vm_area_free(), vm_area_free(),
> > + * vm_area_dup(), unmap_vmas(), vma_start_write(), vma_prev(), vma_next(),
> > + * vma_iter_free(), vm_stat_account(), vma_is_shared_maywrite(),
> > + * vma_interval_tree_remove(), anon_vma_interval_tree_remove(),
> > + * anon_vma_interval_tree_insert(), vma_assert_write_locked(), for_each_vma(),
> > + * vma_mark_detached(), for_each_vma_range(), vma_iter_set(),
> > + * vma_iter_prev_range(), vma_iter_clear_gfp(), PAGE_ALIGN(), vma_find(),
> > + * find_vma_intersection(), vma_lookup(), vma_is_anonymous(), find_vma_prev().
> > + */
> > +#include <linux/mm.h>
> > +
> > +/* For VM_WARN_ON(), VM_WARN_ON_ONCE_MM(), VM_BUG_ON_VMA(). */
> > +#include <linux/mmdebug.h>
> > +
> > +/* For list_is_singular(), list_for_each_entry(). */
> > +#include <linux/list.h>
> > +
> > +/* For anon_vma_name_eq(). */
> > +#include <linux/mm_inline.h>
> > +
> > +/* For vm_unacct_memory(), vma_pages(). */
> > +#include <linux/mman.h>
> > +
> > +/* For mmap_write_unlock(), mmap_write_downgrade(), mmap_assert_write_locked(). */
> > +#include <linux/mmap_lock.h>
> > +
> > +/* For DEFINE_MUTEX(), mutex_lock(), mutex_unlock(). */
> > +#include <linux/mutex.h>
> > +
> > +/* For AS_MM_ALL_LOCKS. */
> > +#include <linux/pagemap.h>
> > +
> > +/* For PHYS_PFN(). */
> > +#include <linux/pfn.h>
> > +
> > +/* For rcu_read_lock(), rcu_read_unlock(). */
> > +#include <linux/rcupdate.h>
> > +
> > +/*
> > + * For anon_vma_clone(), anon_vma_lock_write(), unlink_anon_vmas(),
> > + * anon_vma_unlock_write(), anon_vma_merge().
> > + */
> > +#include <linux/rmap.h>
> > +
> > +/* For down_write_nest_lock(). */
> > +#include <linux/rwsem.h>
> > +
> > +/* For signal_pending(). */
> > +#include <linux/sched/signal.h>
> > +
> > +/* For lru_add_drain(). */
> > +#include <linux/swap.h>
> > +
> > +/* For uprobe_mmap(), uprobe_munmap(). */
> > +#include <linux/uprobes.h>
> > +/*
> > + * For struct vm_userfaultfd_ctx, is_mergeable_vm_userfaultfd_ctx(),
> > + * userfaultfd_unmap_prep(), userfaultfd_wp().
> > + */
> > +#include <linux/userfaultfd_k.h>
> > +
> > +/* For BUG(). */
> > +#include <linux/bug.h>
> > +
> > +/* For flush_dcache_mmap_lock(), flush_dcache_mmap_unlock(). */
> > +#include <linux/cacheflush.h>
> > +
> > +/* For current. */
> > +#include <asm/current.h>
> > +
> > +/* For PAGE_SHIFT, etc. */
> > +#include <asm/page_types.h>
> > +
> > +/* For pgprot_val(). */
> > +#include <asm/pgtable_types.h>
> > +
> > +/* For struct mmu_gather. */
> > +#include <asm/tlb.h>
> > +
> > +/* For arch_unmap(). */
> > +#include <linux/mmu_context.h>
> > +
> > +/*
> > + * For free_pgtables(), vma_set_range(), can_modify_mm(),
> > + * vma_soft_dirty_enabled().
> > + */
> > +#include "internal.h"
> > --
> > 2.45.1
> >
Powered by blists - more mailing lists