Message-ID: <CAJuCfpGOLgMTTToug4kcGCw29dQJjWQ0OXU8Mpfoo_3NEPUZng@mail.gmail.com>
Date: Mon, 28 Apr 2025 16:31:07 -0700
From: Suren Baghdasaryan <surenb@...gle.com>
To: "Liam R. Howlett" <Liam.Howlett@...cle.com>, Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
Andrew Morton <akpm@...ux-foundation.org>, Vlastimil Babka <vbabka@...e.cz>, Jann Horn <jannh@...gle.com>,
Pedro Falcato <pfalcato@...e.de>, David Hildenbrand <david@...hat.com>, Kees Cook <kees@...nel.org>,
Alexander Viro <viro@...iv.linux.org.uk>, Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>,
Suren Baghdasaryan <surenb@...gle.com>, linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 3/4] mm: move dup_mmap() to mm
On Mon, Apr 28, 2025 at 12:13 PM Liam R. Howlett
<Liam.Howlett@...cle.com> wrote:
>
> * Lorenzo Stoakes <lorenzo.stoakes@...cle.com> [250428 11:28]:
> > This is a key step towards being able to abstract and isolate VMA
> > allocation and destruction logic.
> >
> > This function is the last one where vm_area_free() and vm_area_dup() are
> > directly referenced outside of mmap, so having this in mm allows us to
> > isolate these.
> >
> > We do the same for the nommu version, which is substantially simpler.
> >
> > We place the declaration for dup_mmap() in mm/internal.h and have
> > kernel/fork.c import this in order to prevent improper use of this
> > functionality elsewhere in the kernel.
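
For reference, a minimal sketch of the consumer side after this change,
simplified from kernel/fork.c's dup_mm() (error handling and most setup
elided; treat the exact ordering as illustrative, not authoritative):

	/* kernel/fork.c */
	#include "../mm/internal.h"	/* now provides dup_mmap() */

	static struct mm_struct *dup_mm(struct task_struct *tsk,
					struct mm_struct *oldmm)
	{
		struct mm_struct *mm;
		int err;

		mm = allocate_mm();
		if (!mm)
			return NULL;

		memcpy(mm, oldmm, sizeof(*mm));
		if (!mm_init(mm, tsk, mm->user_ns))
			return NULL;	/* mm_init() frees mm on failure */

		/* The implementation now lives in mm/mmap.c (or mm/nommu.c). */
		err = dup_mmap(mm, oldmm);
		if (err)
			goto free_pt;

		return mm;

	free_pt:
		/* don't put binfmt in mmput, we haven't got module yet */
		mm->binfmt = NULL;
		mmput(mm);
		return NULL;
	}
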
> >
> > While we're here, we remove the useless #ifdef CONFIG_MMU check around
> > mmap_read_lock_maybe_expand() in mmap.c, since mmap.c is compiled only
> > when CONFIG_MMU is set.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> > Suggested-by: Pedro Falcato <pfalcato@...e.de>
> > Reviewed-by: Pedro Falcato <pfalcato@...e.de>
>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>
Reviewed-by: Suren Baghdasaryan <surenb@...gle.com>
>
> > ---
> > kernel/fork.c | 189 ++------------------------------------------------
> > mm/internal.h | 2 +
> > mm/mmap.c | 181 +++++++++++++++++++++++++++++++++++++++++++++--
> > mm/nommu.c | 8 +++
> > 4 files changed, 189 insertions(+), 191 deletions(-)
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 168681fc4b25..ac9f9267a473 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -112,6 +112,9 @@
> > #include <asm/cacheflush.h>
> > #include <asm/tlbflush.h>
> >
> > +/* For dup_mmap(). */
> > +#include "../mm/internal.h"
> > +
> > #include <trace/events/sched.h>
> >
> > #define CREATE_TRACE_POINTS
> > @@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk)
> > }
> > EXPORT_SYMBOL(free_task);
> >
> > -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> > +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> > {
> > struct file *exe_file;
> >
> > @@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> > }
> >
> > #ifdef CONFIG_MMU
> > -static __latent_entropy int dup_mmap(struct mm_struct *mm,
> > - struct mm_struct *oldmm)
> > -{
> > - struct vm_area_struct *mpnt, *tmp;
> > - int retval;
> > - unsigned long charge = 0;
> > - LIST_HEAD(uf);
> > - VMA_ITERATOR(vmi, mm, 0);
> > -
> > - if (mmap_write_lock_killable(oldmm))
> > - return -EINTR;
> > - flush_cache_dup_mm(oldmm);
> > - uprobe_dup_mmap(oldmm, mm);
> > - /*
> > - * Not linked in yet - no deadlock potential:
> > - */
> > - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> > -
> > - /* No ordering required: file already has been exposed. */
> > - dup_mm_exe_file(mm, oldmm);
> > -
> > - mm->total_vm = oldmm->total_vm;
> > - mm->data_vm = oldmm->data_vm;
> > - mm->exec_vm = oldmm->exec_vm;
> > - mm->stack_vm = oldmm->stack_vm;
> > -
> > - /* Use __mt_dup() to efficiently build an identical maple tree. */
> > - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> > - if (unlikely(retval))
> > - goto out;
> > -
> > - mt_clear_in_rcu(vmi.mas.tree);
> > - for_each_vma(vmi, mpnt) {
> > - struct file *file;
> > -
> > - vma_start_write(mpnt);
> > - if (mpnt->vm_flags & VM_DONTCOPY) {
> > - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> > - mpnt->vm_end, GFP_KERNEL);
> > - if (retval)
> > - goto loop_out;
> > -
> > - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> > - continue;
> > - }
> > - charge = 0;
> > - /*
> > - * Don't duplicate many vmas if we've been oom-killed (for
> > - * example)
> > - */
> > - if (fatal_signal_pending(current)) {
> > - retval = -EINTR;
> > - goto loop_out;
> > - }
> > - if (mpnt->vm_flags & VM_ACCOUNT) {
> > - unsigned long len = vma_pages(mpnt);
> > -
> > - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> > - goto fail_nomem;
> > - charge = len;
> > - }
> > - tmp = vm_area_dup(mpnt);
> > - if (!tmp)
> > - goto fail_nomem;
> > -
> > - /* track_pfn_copy() will later take care of copying internal state. */
> > - if (unlikely(tmp->vm_flags & VM_PFNMAP))
> > - untrack_pfn_clear(tmp);
> > -
> > - retval = vma_dup_policy(mpnt, tmp);
> > - if (retval)
> > - goto fail_nomem_policy;
> > - tmp->vm_mm = mm;
> > - retval = dup_userfaultfd(tmp, &uf);
> > - if (retval)
> > - goto fail_nomem_anon_vma_fork;
> > - if (tmp->vm_flags & VM_WIPEONFORK) {
> > - /*
> > - * VM_WIPEONFORK gets a clean slate in the child.
> > - * Don't prepare anon_vma until fault since we don't
> > - * copy page for current vma.
> > - */
> > - tmp->anon_vma = NULL;
> > - } else if (anon_vma_fork(tmp, mpnt))
> > - goto fail_nomem_anon_vma_fork;
> > - vm_flags_clear(tmp, VM_LOCKED_MASK);
> > - /*
> > - * Copy/update hugetlb private vma information.
> > - */
> > - if (is_vm_hugetlb_page(tmp))
> > - hugetlb_dup_vma_private(tmp);
> > -
> > - /*
> > - * Link the vma into the MT. After using __mt_dup(), memory
> > - * allocation is not necessary here, so it cannot fail.
> > - */
> > - vma_iter_bulk_store(&vmi, tmp);
> > -
> > - mm->map_count++;
> > -
> > - if (tmp->vm_ops && tmp->vm_ops->open)
> > - tmp->vm_ops->open(tmp);
> > -
> > - file = tmp->vm_file;
> > - if (file) {
> > - struct address_space *mapping = file->f_mapping;
> > -
> > - get_file(file);
> > - i_mmap_lock_write(mapping);
> > - if (vma_is_shared_maywrite(tmp))
> > - mapping_allow_writable(mapping);
> > - flush_dcache_mmap_lock(mapping);
> > - /* insert tmp into the share list, just after mpnt */
> > - vma_interval_tree_insert_after(tmp, mpnt,
> > - &mapping->i_mmap);
> > - flush_dcache_mmap_unlock(mapping);
> > - i_mmap_unlock_write(mapping);
> > - }
> > -
> > - if (!(tmp->vm_flags & VM_WIPEONFORK))
> > - retval = copy_page_range(tmp, mpnt);
> > -
> > - if (retval) {
> > - mpnt = vma_next(&vmi);
> > - goto loop_out;
> > - }
> > - }
> > - /* a new mm has just been created */
> > - retval = arch_dup_mmap(oldmm, mm);
> > -loop_out:
> > - vma_iter_free(&vmi);
> > - if (!retval) {
> > - mt_set_in_rcu(vmi.mas.tree);
> > - ksm_fork(mm, oldmm);
> > - khugepaged_fork(mm, oldmm);
> > - } else {
> > -
> > - /*
> > - * The entire maple tree has already been duplicated. If the
> > - * mmap duplication fails, mark the failure point with
> > - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> > - * stop releasing VMAs that have not been duplicated after this
> > - * point.
> > - */
> > - if (mpnt) {
> > - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> > - mas_store(&vmi.mas, XA_ZERO_ENTRY);
> > - /* Avoid OOM iterating a broken tree */
> > - set_bit(MMF_OOM_SKIP, &mm->flags);
> > - }
> > - /*
> > - * The mm_struct is going to exit, but the locks will be dropped
> > - * first. Marking the mm_struct as unstable is advisable, as it
> > - * is not fully initialised.
> > - */
> > - set_bit(MMF_UNSTABLE, &mm->flags);
> > - }
> > -out:
> > - mmap_write_unlock(mm);
> > - flush_tlb_mm(oldmm);
> > - mmap_write_unlock(oldmm);
> > - if (!retval)
> > - dup_userfaultfd_complete(&uf);
> > - else
> > - dup_userfaultfd_fail(&uf);
> > - return retval;
> > -
> > -fail_nomem_anon_vma_fork:
> > - mpol_put(vma_policy(tmp));
> > -fail_nomem_policy:
> > - vm_area_free(tmp);
> > -fail_nomem:
> > - retval = -ENOMEM;
> > - vm_unacct_memory(charge);
> > - goto loop_out;
> > -}
> > -
> > static inline int mm_alloc_pgd(struct mm_struct *mm)
> > {
> > mm->pgd = pgd_alloc(mm);
> > @@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm_struct *mm)
> > pgd_free(mm, mm->pgd);
> > }
> > #else
> > -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> > -{
> > - mmap_write_lock(oldmm);
> > - dup_mm_exe_file(mm, oldmm);
> > - mmap_write_unlock(oldmm);
> > - return 0;
> > -}
> > #define mm_alloc_pgd(mm) (0)
> > #define mm_free_pgd(mm)
> > #endif /* CONFIG_MMU */
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 40464f755092..b3e011976f74 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -1631,5 +1631,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
> > }
> > #endif /* CONFIG_PT_RECLAIM */
> >
> > +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
> > +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
> >
> > #endif /* __MM_INTERNAL_H */
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 9e09eac0021c..5259df031e15 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1675,7 +1675,6 @@ static int __meminit init_reserve_notifier(void)
> > }
> > subsys_initcall(init_reserve_notifier);
> >
> > -#ifdef CONFIG_MMU
> > /*
> > * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> > * start of the VMA, the intent is to perform a write, and it is a
> > @@ -1719,10 +1718,180 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
> > mmap_write_downgrade(mm);
> > return true;
> > }
> > -#else
> > -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> > - unsigned long addr, bool write)
> > +
> > +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> > {
> > - return false;
> > + struct vm_area_struct *mpnt, *tmp;
> > + int retval;
> > + unsigned long charge = 0;
> > + LIST_HEAD(uf);
> > + VMA_ITERATOR(vmi, mm, 0);
> > +
> > + if (mmap_write_lock_killable(oldmm))
> > + return -EINTR;
> > + flush_cache_dup_mm(oldmm);
> > + uprobe_dup_mmap(oldmm, mm);
> > + /*
> > + * Not linked in yet - no deadlock potential:
> > + */
> > + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> > +
> > + /* No ordering required: file already has been exposed. */
> > + dup_mm_exe_file(mm, oldmm);
> > +
> > + mm->total_vm = oldmm->total_vm;
> > + mm->data_vm = oldmm->data_vm;
> > + mm->exec_vm = oldmm->exec_vm;
> > + mm->stack_vm = oldmm->stack_vm;
> > +
> > + /* Use __mt_dup() to efficiently build an identical maple tree. */
> > + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> > + if (unlikely(retval))
> > + goto out;
> > +
> > + mt_clear_in_rcu(vmi.mas.tree);
> > + for_each_vma(vmi, mpnt) {
> > + struct file *file;
> > +
> > + vma_start_write(mpnt);
> > + if (mpnt->vm_flags & VM_DONTCOPY) {
> > + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> > + mpnt->vm_end, GFP_KERNEL);
> > + if (retval)
> > + goto loop_out;
> > +
> > + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> > + continue;
> > + }
> > + charge = 0;
> > + /*
> > + * Don't duplicate many vmas if we've been oom-killed (for
> > + * example)
> > + */
> > + if (fatal_signal_pending(current)) {
> > + retval = -EINTR;
> > + goto loop_out;
> > + }
> > + if (mpnt->vm_flags & VM_ACCOUNT) {
> > + unsigned long len = vma_pages(mpnt);
> > +
> > + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> > + goto fail_nomem;
> > + charge = len;
> > + }
> > +
> > + tmp = vm_area_dup(mpnt);
> > + if (!tmp)
> > + goto fail_nomem;
> > +
> > + /* track_pfn_copy() will later take care of copying internal state. */
> > + if (unlikely(tmp->vm_flags & VM_PFNMAP))
> > + untrack_pfn_clear(tmp);
> > +
> > + retval = vma_dup_policy(mpnt, tmp);
> > + if (retval)
> > + goto fail_nomem_policy;
> > + tmp->vm_mm = mm;
> > + retval = dup_userfaultfd(tmp, &uf);
> > + if (retval)
> > + goto fail_nomem_anon_vma_fork;
> > + if (tmp->vm_flags & VM_WIPEONFORK) {
> > + /*
> > + * VM_WIPEONFORK gets a clean slate in the child.
> > + * Don't prepare anon_vma until fault since we don't
> > + * copy page for current vma.
> > + */
> > + tmp->anon_vma = NULL;
> > + } else if (anon_vma_fork(tmp, mpnt))
> > + goto fail_nomem_anon_vma_fork;
> > + vm_flags_clear(tmp, VM_LOCKED_MASK);
> > + /*
> > + * Copy/update hugetlb private vma information.
> > + */
> > + if (is_vm_hugetlb_page(tmp))
> > + hugetlb_dup_vma_private(tmp);
> > +
> > + /*
> > + * Link the vma into the MT. After using __mt_dup(), memory
> > + * allocation is not necessary here, so it cannot fail.
> > + */
> > + vma_iter_bulk_store(&vmi, tmp);
> > +
> > + mm->map_count++;
> > +
> > + if (tmp->vm_ops && tmp->vm_ops->open)
> > + tmp->vm_ops->open(tmp);
> > +
> > + file = tmp->vm_file;
> > + if (file) {
> > + struct address_space *mapping = file->f_mapping;
> > +
> > + get_file(file);
> > + i_mmap_lock_write(mapping);
> > + if (vma_is_shared_maywrite(tmp))
> > + mapping_allow_writable(mapping);
> > + flush_dcache_mmap_lock(mapping);
> > + /* insert tmp into the share list, just after mpnt */
> > + vma_interval_tree_insert_after(tmp, mpnt,
> > + &mapping->i_mmap);
> > + flush_dcache_mmap_unlock(mapping);
> > + i_mmap_unlock_write(mapping);
> > + }
> > +
> > + if (!(tmp->vm_flags & VM_WIPEONFORK))
> > + retval = copy_page_range(tmp, mpnt);
> > +
> > + if (retval) {
> > + mpnt = vma_next(&vmi);
> > + goto loop_out;
> > + }
> > + }
> > + /* a new mm has just been created */
> > + retval = arch_dup_mmap(oldmm, mm);
> > +loop_out:
> > + vma_iter_free(&vmi);
> > + if (!retval) {
> > + mt_set_in_rcu(vmi.mas.tree);
> > + ksm_fork(mm, oldmm);
> > + khugepaged_fork(mm, oldmm);
> > + } else {
> > +
> > + /*
> > + * The entire maple tree has already been duplicated. If the
> > + * mmap duplication fails, mark the failure point with
> > + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> > + * stop releasing VMAs that have not been duplicated after this
> > + * point.
> > + */
> > + if (mpnt) {
> > + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> > + mas_store(&vmi.mas, XA_ZERO_ENTRY);
> > + /* Avoid OOM iterating a broken tree */
> > + set_bit(MMF_OOM_SKIP, &mm->flags);
> > + }
> > + /*
> > + * The mm_struct is going to exit, but the locks will be dropped
> > + * first. Marking the mm_struct as unstable is advisable, as it
> > + * is not fully initialised.
> > + */
> > + set_bit(MMF_UNSTABLE, &mm->flags);
> > + }
> > +out:
> > + mmap_write_unlock(mm);
> > + flush_tlb_mm(oldmm);
> > + mmap_write_unlock(oldmm);
> > + if (!retval)
> > + dup_userfaultfd_complete(&uf);
> > + else
> > + dup_userfaultfd_fail(&uf);
> > + return retval;
> > +
> > +fail_nomem_anon_vma_fork:
> > + mpol_put(vma_policy(tmp));
> > +fail_nomem_policy:
> > + vm_area_free(tmp);
> > +fail_nomem:
> > + retval = -ENOMEM;
> > + vm_unacct_memory(charge);
> > + goto loop_out;
> > }
> > -#endif
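
As a pointer for readers: the XA_ZERO_ENTRY stored on the failure path
above is consumed during teardown. Roughly what the check in exit_mmap()
looks like (paraphrased from mm/mmap.c; exact details may differ):

	/* mm/mmap.c: exit_mmap(), just after taking the mmap read lock */
	vma = vma_next(&vmi);
	if (!vma || unlikely(xa_is_zero(vma))) {
		/* Can happen if dup_mmap() received an OOM */
		mmap_read_unlock(mm);
		mmap_write_lock(mm);
		goto destroy;	/* skip straight to __mt_destroy() */
	}

Everything duplicated before the marker is a real VMA that exit_mmap()
still needs to unmap; the VMA iteration later in that function likewise
stops once it hits the marker, which is why the marker must land exactly
at the failure point rather than at the start of the tree.
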
> > diff --git a/mm/nommu.c b/mm/nommu.c
> > index 2b4d304c6445..a142fc258d39 100644
> > --- a/mm/nommu.c
> > +++ b/mm/nommu.c
> > @@ -1874,3 +1874,11 @@ static int __meminit init_admin_reserve(void)
> > return 0;
> > }
> > subsys_initcall(init_admin_reserve);
> > +
> > +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> > +{
> > + mmap_write_lock(oldmm);
> > + dup_mm_exe_file(mm, oldmm);
> > + mmap_write_unlock(oldmm);
> > + return 0;
> > +}
> > --
> > 2.49.0
> >