Message-ID: <gd3dcjrc47stimsmpcln3s6tu7vrhmccu5mej3jmfhsbp32hg7@5ffby6k5rcfp>
Date: Mon, 28 Apr 2025 15:12:59 -0400
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Vlastimil Babka <vbabka@...e.cz>, Jann Horn <jannh@...gle.com>,
Pedro Falcato <pfalcato@...e.de>, David Hildenbrand <david@...hat.com>,
Kees Cook <kees@...nel.org>, Alexander Viro <viro@...iv.linux.org.uk>,
Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>,
Suren Baghdasaryan <surenb@...gle.com>, linux-mm@...ck.org,
linux-fsdevel@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 3/4] mm: move dup_mmap() to mm
* Lorenzo Stoakes <lorenzo.stoakes@...cle.com> [250428 11:28]:
> This is a key step towards being able to abstract and isolate VMA
> allocation and destruction logic.
>
> This function is the last one to reference vm_area_free() and
> vm_area_dup() directly outside of mmap, so moving it into mm allows us
> to isolate them.
>
> We do the same for the nommu version, which is substantially simpler.
>
> We place the declaration for dup_mmap() in mm/internal.h and have
> kernel/fork.c include it, in order to prevent improper use of this
> functionality elsewhere in the kernel.
>
> While we're here, we remove the useless #ifdef CONFIG_MMU check around
> mmap_read_lock_maybe_expand() in mmap.c, as mmap.c is compiled only if
> CONFIG_MMU is set.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> Suggested-by: Pedro Falcato <pfalcato@...e.de>
> Reviewed-by: Pedro Falcato <pfalcato@...e.de>
Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>
> ---
> kernel/fork.c | 189 ++------------------------------------------------
> mm/internal.h | 2 +
> mm/mmap.c | 181 +++++++++++++++++++++++++++++++++++++++++++++--
> mm/nommu.c | 8 +++
> 4 files changed, 189 insertions(+), 191 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 168681fc4b25..ac9f9267a473 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -112,6 +112,9 @@
> #include <asm/cacheflush.h>
> #include <asm/tlbflush.h>
>
> +/* For dup_mmap(). */
> +#include "../mm/internal.h"
> +
> #include <trace/events/sched.h>
>
> #define CREATE_TRACE_POINTS
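
For anyone following the call chain: after this move the only caller left
in kernel/fork.c is dup_mm(), which now picks up the declaration through
the include above. Abridged from kernel/fork.c for context (quoted from
memory, not part of this diff):

	static struct mm_struct *dup_mm(struct task_struct *tsk,
					struct mm_struct *oldmm)
	{
		struct mm_struct *mm;
		int err;

		mm = allocate_mm();
		if (!mm)
			goto fail_nomem;

		memcpy(mm, oldmm, sizeof(*mm));

		if (!mm_init(mm, tsk, mm->user_ns))
			goto fail_nomem;

		/* Resolved via mm/internal.h as of this patch. */
		err = dup_mmap(mm, oldmm);
		if (err)
			goto free_pt;
		...
	}
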
> @@ -589,7 +592,7 @@ void free_task(struct task_struct *tsk)
> }
> EXPORT_SYMBOL(free_task);
>
> -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> {
> struct file *exe_file;
>
> @@ -604,183 +607,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
> }
>
> #ifdef CONFIG_MMU
> -static __latent_entropy int dup_mmap(struct mm_struct *mm,
> - struct mm_struct *oldmm)
> -{
> - struct vm_area_struct *mpnt, *tmp;
> - int retval;
> - unsigned long charge = 0;
> - LIST_HEAD(uf);
> - VMA_ITERATOR(vmi, mm, 0);
> -
> - if (mmap_write_lock_killable(oldmm))
> - return -EINTR;
> - flush_cache_dup_mm(oldmm);
> - uprobe_dup_mmap(oldmm, mm);
> - /*
> - * Not linked in yet - no deadlock potential:
> - */
> - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> -
> - /* No ordering required: file already has been exposed. */
> - dup_mm_exe_file(mm, oldmm);
> -
> - mm->total_vm = oldmm->total_vm;
> - mm->data_vm = oldmm->data_vm;
> - mm->exec_vm = oldmm->exec_vm;
> - mm->stack_vm = oldmm->stack_vm;
> -
> - /* Use __mt_dup() to efficiently build an identical maple tree. */
> - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> - if (unlikely(retval))
> - goto out;
> -
> - mt_clear_in_rcu(vmi.mas.tree);
> - for_each_vma(vmi, mpnt) {
> - struct file *file;
> -
> - vma_start_write(mpnt);
> - if (mpnt->vm_flags & VM_DONTCOPY) {
> - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> - mpnt->vm_end, GFP_KERNEL);
> - if (retval)
> - goto loop_out;
> -
> - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> - continue;
> - }
> - charge = 0;
> - /*
> - * Don't duplicate many vmas if we've been oom-killed (for
> - * example)
> - */
> - if (fatal_signal_pending(current)) {
> - retval = -EINTR;
> - goto loop_out;
> - }
> - if (mpnt->vm_flags & VM_ACCOUNT) {
> - unsigned long len = vma_pages(mpnt);
> -
> - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> - goto fail_nomem;
> - charge = len;
> - }
> - tmp = vm_area_dup(mpnt);
> - if (!tmp)
> - goto fail_nomem;
> -
> - /* track_pfn_copy() will later take care of copying internal state. */
> - if (unlikely(tmp->vm_flags & VM_PFNMAP))
> - untrack_pfn_clear(tmp);
> -
> - retval = vma_dup_policy(mpnt, tmp);
> - if (retval)
> - goto fail_nomem_policy;
> - tmp->vm_mm = mm;
> - retval = dup_userfaultfd(tmp, &uf);
> - if (retval)
> - goto fail_nomem_anon_vma_fork;
> - if (tmp->vm_flags & VM_WIPEONFORK) {
> - /*
> - * VM_WIPEONFORK gets a clean slate in the child.
> - * Don't prepare anon_vma until fault since we don't
> - * copy page for current vma.
> - */
> - tmp->anon_vma = NULL;
> - } else if (anon_vma_fork(tmp, mpnt))
> - goto fail_nomem_anon_vma_fork;
> - vm_flags_clear(tmp, VM_LOCKED_MASK);
> - /*
> - * Copy/update hugetlb private vma information.
> - */
> - if (is_vm_hugetlb_page(tmp))
> - hugetlb_dup_vma_private(tmp);
> -
> - /*
> - * Link the vma into the MT. After using __mt_dup(), memory
> - * allocation is not necessary here, so it cannot fail.
> - */
> - vma_iter_bulk_store(&vmi, tmp);
> -
> - mm->map_count++;
> -
> - if (tmp->vm_ops && tmp->vm_ops->open)
> - tmp->vm_ops->open(tmp);
> -
> - file = tmp->vm_file;
> - if (file) {
> - struct address_space *mapping = file->f_mapping;
> -
> - get_file(file);
> - i_mmap_lock_write(mapping);
> - if (vma_is_shared_maywrite(tmp))
> - mapping_allow_writable(mapping);
> - flush_dcache_mmap_lock(mapping);
> - /* insert tmp into the share list, just after mpnt */
> - vma_interval_tree_insert_after(tmp, mpnt,
> - &mapping->i_mmap);
> - flush_dcache_mmap_unlock(mapping);
> - i_mmap_unlock_write(mapping);
> - }
> -
> - if (!(tmp->vm_flags & VM_WIPEONFORK))
> - retval = copy_page_range(tmp, mpnt);
> -
> - if (retval) {
> - mpnt = vma_next(&vmi);
> - goto loop_out;
> - }
> - }
> - /* a new mm has just been created */
> - retval = arch_dup_mmap(oldmm, mm);
> -loop_out:
> - vma_iter_free(&vmi);
> - if (!retval) {
> - mt_set_in_rcu(vmi.mas.tree);
> - ksm_fork(mm, oldmm);
> - khugepaged_fork(mm, oldmm);
> - } else {
> -
> - /*
> - * The entire maple tree has already been duplicated. If the
> - * mmap duplication fails, mark the failure point with
> - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> - * stop releasing VMAs that have not been duplicated after this
> - * point.
> - */
> - if (mpnt) {
> - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> - mas_store(&vmi.mas, XA_ZERO_ENTRY);
> - /* Avoid OOM iterating a broken tree */
> - set_bit(MMF_OOM_SKIP, &mm->flags);
> - }
> - /*
> - * The mm_struct is going to exit, but the locks will be dropped
> - * first. Set the mm_struct as unstable is advisable as it is
> - * not fully initialised.
> - */
> - set_bit(MMF_UNSTABLE, &mm->flags);
> - }
> -out:
> - mmap_write_unlock(mm);
> - flush_tlb_mm(oldmm);
> - mmap_write_unlock(oldmm);
> - if (!retval)
> - dup_userfaultfd_complete(&uf);
> - else
> - dup_userfaultfd_fail(&uf);
> - return retval;
> -
> -fail_nomem_anon_vma_fork:
> - mpol_put(vma_policy(tmp));
> -fail_nomem_policy:
> - vm_area_free(tmp);
> -fail_nomem:
> - retval = -ENOMEM;
> - vm_unacct_memory(charge);
> - goto loop_out;
> -}
> -
> static inline int mm_alloc_pgd(struct mm_struct *mm)
> {
> mm->pgd = pgd_alloc(mm);
> @@ -794,13 +620,6 @@ static inline void mm_free_pgd(struct mm_struct *mm)
> pgd_free(mm, mm->pgd);
> }
> #else
> -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> -{
> - mmap_write_lock(oldmm);
> - dup_mm_exe_file(mm, oldmm);
> - mmap_write_unlock(oldmm);
> - return 0;
> -}
> #define mm_alloc_pgd(mm) (0)
> #define mm_free_pgd(mm)
> #endif /* CONFIG_MMU */
> diff --git a/mm/internal.h b/mm/internal.h
> index 40464f755092..b3e011976f74 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1631,5 +1631,7 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
> }
> #endif /* CONFIG_PT_RECLAIM */
>
> +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
> +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
>
> #endif /* __MM_INTERNAL_H */
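
Keeping the prototype in mm/internal.h rather than in a public header
means any new caller outside mm/ fails to build. A hypothetical misuse
(illustrative only, not from this series):

	/* in some random driver, without access to mm/internal.h: */
	err = dup_mmap(mm, oldmm);
	/* error: implicit declaration of function 'dup_mmap'
	 * (kernel builds promote implicit declarations to hard errors)
	 */

which is exactly the point of the last paragraph of the commit message.
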
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 9e09eac0021c..5259df031e15 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1675,7 +1675,6 @@ static int __meminit init_reserve_notifier(void)
> }
> subsys_initcall(init_reserve_notifier);
>
> -#ifdef CONFIG_MMU
> /*
> * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> * start of the VMA, the intent is to perform a write, and it is a
> @@ -1719,10 +1718,180 @@ bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
> mmap_write_downgrade(mm);
> return true;
> }
> -#else
> -bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> - unsigned long addr, bool write)
> +
> +__latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> {
> - return false;
> + struct vm_area_struct *mpnt, *tmp;
> + int retval;
> + unsigned long charge = 0;
> + LIST_HEAD(uf);
> + VMA_ITERATOR(vmi, mm, 0);
> +
> + if (mmap_write_lock_killable(oldmm))
> + return -EINTR;
> + flush_cache_dup_mm(oldmm);
> + uprobe_dup_mmap(oldmm, mm);
> + /*
> + * Not linked in yet - no deadlock potential:
> + */
> + mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
> +
> + /* No ordering required: file already has been exposed. */
> + dup_mm_exe_file(mm, oldmm);
> +
> + mm->total_vm = oldmm->total_vm;
> + mm->data_vm = oldmm->data_vm;
> + mm->exec_vm = oldmm->exec_vm;
> + mm->stack_vm = oldmm->stack_vm;
> +
> + /* Use __mt_dup() to efficiently build an identical maple tree. */
> + retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
> + if (unlikely(retval))
> + goto out;
> +
> + mt_clear_in_rcu(vmi.mas.tree);
> + for_each_vma(vmi, mpnt) {
> + struct file *file;
> +
> + vma_start_write(mpnt);
> + if (mpnt->vm_flags & VM_DONTCOPY) {
> + retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
> + mpnt->vm_end, GFP_KERNEL);
> + if (retval)
> + goto loop_out;
> +
> + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
> + continue;
> + }
> + charge = 0;
> + /*
> + * Don't duplicate many vmas if we've been oom-killed (for
> + * example)
> + */
> + if (fatal_signal_pending(current)) {
> + retval = -EINTR;
> + goto loop_out;
> + }
> + if (mpnt->vm_flags & VM_ACCOUNT) {
> + unsigned long len = vma_pages(mpnt);
> +
> + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
> + goto fail_nomem;
> + charge = len;
> + }
> +
> + tmp = vm_area_dup(mpnt);
> + if (!tmp)
> + goto fail_nomem;
> +
> + /* track_pfn_copy() will later take care of copying internal state. */
> + if (unlikely(tmp->vm_flags & VM_PFNMAP))
> + untrack_pfn_clear(tmp);
> +
> + retval = vma_dup_policy(mpnt, tmp);
> + if (retval)
> + goto fail_nomem_policy;
> + tmp->vm_mm = mm;
> + retval = dup_userfaultfd(tmp, &uf);
> + if (retval)
> + goto fail_nomem_anon_vma_fork;
> + if (tmp->vm_flags & VM_WIPEONFORK) {
> + /*
> + * VM_WIPEONFORK gets a clean slate in the child.
> + * Don't prepare anon_vma until fault since we don't
> + * copy page for current vma.
> + */
> + tmp->anon_vma = NULL;
> + } else if (anon_vma_fork(tmp, mpnt))
> + goto fail_nomem_anon_vma_fork;
> + vm_flags_clear(tmp, VM_LOCKED_MASK);
> + /*
> + * Copy/update hugetlb private vma information.
> + */
> + if (is_vm_hugetlb_page(tmp))
> + hugetlb_dup_vma_private(tmp);
> +
> + /*
> + * Link the vma into the MT. After using __mt_dup(), memory
> + * allocation is not necessary here, so it cannot fail.
> + */
> + vma_iter_bulk_store(&vmi, tmp);
> +
> + mm->map_count++;
> +
> + if (tmp->vm_ops && tmp->vm_ops->open)
> + tmp->vm_ops->open(tmp);
> +
> + file = tmp->vm_file;
> + if (file) {
> + struct address_space *mapping = file->f_mapping;
> +
> + get_file(file);
> + i_mmap_lock_write(mapping);
> + if (vma_is_shared_maywrite(tmp))
> + mapping_allow_writable(mapping);
> + flush_dcache_mmap_lock(mapping);
> + /* insert tmp into the share list, just after mpnt */
> + vma_interval_tree_insert_after(tmp, mpnt,
> + &mapping->i_mmap);
> + flush_dcache_mmap_unlock(mapping);
> + i_mmap_unlock_write(mapping);
> + }
> +
> + if (!(tmp->vm_flags & VM_WIPEONFORK))
> + retval = copy_page_range(tmp, mpnt);
> +
> + if (retval) {
> + mpnt = vma_next(&vmi);
> + goto loop_out;
> + }
> + }
> + /* a new mm has just been created */
> + retval = arch_dup_mmap(oldmm, mm);
> +loop_out:
> + vma_iter_free(&vmi);
> + if (!retval) {
> + mt_set_in_rcu(vmi.mas.tree);
> + ksm_fork(mm, oldmm);
> + khugepaged_fork(mm, oldmm);
> + } else {
> +
> + /*
> + * The entire maple tree has already been duplicated. If the
> + * mmap duplication fails, mark the failure point with
> + * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> + * stop releasing VMAs that have not been duplicated after this
> + * point.
> + */
> + if (mpnt) {
> + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> + mas_store(&vmi.mas, XA_ZERO_ENTRY);
> + /* Avoid OOM iterating a broken tree */
> + set_bit(MMF_OOM_SKIP, &mm->flags);
> + }
> + /*
> + * The mm_struct is going to exit, but the locks will be dropped
> + * first. Set the mm_struct as unstable is advisable as it is
> + * not fully initialised.
> + */
> + set_bit(MMF_UNSTABLE, &mm->flags);
> + }
> +out:
> + mmap_write_unlock(mm);
> + flush_tlb_mm(oldmm);
> + mmap_write_unlock(oldmm);
> + if (!retval)
> + dup_userfaultfd_complete(&uf);
> + else
> + dup_userfaultfd_fail(&uf);
> + return retval;
> +
> +fail_nomem_anon_vma_fork:
> + mpol_put(vma_policy(tmp));
> +fail_nomem_policy:
> + vm_area_free(tmp);
> +fail_nomem:
> + retval = -ENOMEM;
> + vm_unacct_memory(charge);
> + goto loop_out;
> }
> -#endif
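
Comparing the block removed from fork.c against the block added here, the
only deltas I can spot are the dropped `static' (required now that the
function is declared in mm/internal.h) and a new blank line ahead of the
vm_area_dup() call:

	 	}
	+
	 	tmp = vm_area_dup(mpnt);

so this really is a straight code move.
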
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 2b4d304c6445..a142fc258d39 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -1874,3 +1874,11 @@ static int __meminit init_admin_reserve(void)
> return 0;
> }
> subsys_initcall(init_admin_reserve);
> +
> +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> +{
> + mmap_write_lock(oldmm);
> + dup_mm_exe_file(mm, oldmm);
> + mmap_write_unlock(oldmm);
> + return 0;
> +}
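
And with the !CONFIG_MMU stub living here, fork.c no longer needs a
CONFIG_MMU-conditional dup_mmap() at all: exactly one of mmap.o or
nommu.o is built into the kernel, selected in mm/Makefile, roughly
(existing code, abridged from memory):

	mmu-y			:= nommu.o
	mmu-$(CONFIG_MMU)	:= ... mmap.o ...

	obj-y			:= ... $(mmu-y)

which also backs the commit message's point that mmap.c is compiled only
when CONFIG_MMU is set.
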
> --
> 2.49.0
>