Message-ID: <r57ezh55nc7x2uzepv4xxacmt5ar5mdlb732vllfh5juxnavv4@x3e33a6m2gxt>
Date: Tue, 6 Jan 2026 14:02:48 -0500
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
        Suren Baghdasaryan <surenb@...gle.com>,
        Vlastimil Babka <vbabka@...e.cz>,
        Shakeel Butt <shakeel.butt@...ux.dev>,
        David Hildenbrand <david@...nel.org>, Rik van Riel <riel@...riel.com>,
        Harry Yoo <harry.yoo@...cle.com>, Jann Horn <jannh@...gle.com>,
        Mike Rapoport <rppt@...nel.org>, Michal Hocko <mhocko@...e.com>,
        Pedro Falcato <pfalcato@...e.de>, Chris Li <chriscli@...gle.com>,
        Barry Song <v-songbaohua@...o.com>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 7/8] mm/rmap: allocate anon_vma_chain objects unlocked
 when possible

* Lorenzo Stoakes <lorenzo.stoakes@...cle.com> [260106 10:04]:
> There is no reason to allocate the anon_vma_chain under the anon_vma write
> lock when cloning - we can in fact assign these to the destination VMA
> safely as we hold the exclusive mmap lock and therefore preclude anybody
> else accessing these fields.
> 
> We only need to take the anon_vma write lock when we link rbtree edges from
> the anon_vma to the newly established AVCs.
> 
> This also allows us to eliminate the weird GFP_NOWAIT, GFP_KERNEL dance
> introduced in commit dd34739c03f2 ("mm: avoid anon_vma_chain allocation
> under anon_vma lock"), further simplifying this logic.
> 
> This should reduce anon_vma lock contention and clarify exactly where the
> anon_vma lock is required.
> 
> We cannot adjust __anon_vma_prepare() in the same way, as it is only
> protected by the VMA read lock, so we have to perform the allocation here
> under the anon_vma write lock and page_table_lock (to protect against
> racing threads), and we wish to retain the lock ordering.
> 
> With this change we can simplify cleanup_partial_anon_vmas() even further -
> since we allocate AVCs without any lock taken and do not insert anything
> into the interval tree until after all allocations have succeeded, we can
> remove all the logic pertaining to this and simply free the AVCs.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
> Reviewed-by: Suren Baghdasaryan <surenb@...gle.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>

> ---
>  mm/rmap.c | 78 +++++++++++++++++++++++++------------------------------
>  1 file changed, 35 insertions(+), 43 deletions(-)
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 6ac42671bedd..8f4393546bce 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -147,14 +147,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
>  	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
>  }
>  
> -static void anon_vma_chain_link(struct vm_area_struct *vma,
> -				struct anon_vma_chain *avc,
> -				struct anon_vma *anon_vma)
> +static void anon_vma_chain_assign(struct vm_area_struct *vma,
> +				  struct anon_vma_chain *avc,
> +				  struct anon_vma *anon_vma)
>  {
>  	avc->vma = vma;
>  	avc->anon_vma = anon_vma;
>  	list_add(&avc->same_vma, &vma->anon_vma_chain);
> -	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
>  }
>  
>  /**
> @@ -211,7 +210,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
>  	spin_lock(&mm->page_table_lock);
>  	if (likely(!vma->anon_vma)) {
>  		vma->anon_vma = anon_vma;
> -		anon_vma_chain_link(vma, avc, anon_vma);
> +		anon_vma_chain_assign(vma, avc, anon_vma);
> +		anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
>  		anon_vma->num_active_vmas++;
>  		allocated = NULL;
>  		avc = NULL;
> @@ -292,21 +292,31 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
>  
>  	check_anon_vma_clone(dst, src);
>  
> -	/* All anon_vma's share the same root. */
> +	/*
> +	 * Allocate AVCs. We don't need an anon_vma lock for this as we
> +	 * are not updating the anon_vma rbtree nor are we changing
> +	 * anon_vma statistics.
> +	 *
> +	 * We hold the exclusive mmap write lock so there's no possibility of
> +	 * the unlinked AVC's being observed yet.
> +	 */
> +	list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
> +		avc = anon_vma_chain_alloc(GFP_KERNEL);
> +		if (!avc)
> +			goto enomem_failure;
> +
> +		anon_vma_chain_assign(dst, avc, pavc->anon_vma);
> +	}
> +
> +	/*
> +	 * Now link the anon_vma's back to the newly inserted AVCs.
> +	 * Note that all anon_vma's share the same root.
> +	 */
>  	anon_vma_lock_write(src->anon_vma);
> -	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
> -		struct anon_vma *anon_vma;
> -
> -		avc = anon_vma_chain_alloc(GFP_NOWAIT);
> -		if (unlikely(!avc)) {
> -			anon_vma_unlock_write(src->anon_vma);
> -			avc = anon_vma_chain_alloc(GFP_KERNEL);
> -			if (!avc)
> -				goto enomem_failure;
> -			anon_vma_lock_write(src->anon_vma);
> -		}
> -		anon_vma = pavc->anon_vma;
> -		anon_vma_chain_link(dst, avc, anon_vma);
> +	list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
> +		struct anon_vma *anon_vma = avc->anon_vma;
> +
> +		anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
>  
>  		/*
>  		 * Reuse existing anon_vma if it has no vma and only one
> @@ -322,7 +332,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
>  	}
>  	if (dst->anon_vma)
>  		dst->anon_vma->num_active_vmas++;
> -
>  	anon_vma_unlock_write(src->anon_vma);
>  	return 0;
>  
> @@ -384,8 +393,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
>  	get_anon_vma(anon_vma->root);
>  	/* Mark this anon_vma as the one where our new (COWed) pages go. */
>  	vma->anon_vma = anon_vma;
> +	anon_vma_chain_assign(vma, avc, anon_vma);
> +	/* Now let rmap see it. */
>  	anon_vma_lock_write(anon_vma);
> -	anon_vma_chain_link(vma, avc, anon_vma);
> +	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
>  	anon_vma->parent->num_children++;
>  	anon_vma_unlock_write(anon_vma);
>  
> @@ -402,34 +413,15 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
>   * In the unfortunate case of anon_vma_clone() failing to allocate memory we
>   * have to clean things up.
>   *
> - * On clone we hold the exclusive mmap write lock, so we can't race
> - * unlink_anon_vmas(). Since we're cloning, we know we can't have empty
> - * anon_vma's, since existing anon_vma's are what we're cloning from.
> - *
> - * So this function needs only traverse the anon_vma_chain and free each
> - * allocated anon_vma_chain.
> + * Since we allocate anon_vma_chain's before we insert them into the interval
> + * trees, we simply have to free up the AVC's and remove the entries from the
> + * VMA's anon_vma_chain.
>   */
>  static void cleanup_partial_anon_vmas(struct vm_area_struct *vma)
>  {
>  	struct anon_vma_chain *avc, *next;
> -	bool locked = false;
> -
> -	/*
> -	 * We exclude everybody else from being able to modify anon_vma's
> -	 * underneath us.
> -	 */
> -	mmap_assert_locked(vma->vm_mm);
>  
>  	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
> -		struct anon_vma *anon_vma = avc->anon_vma;
> -
> -		/* All anon_vma's share the same root. */
> -		if (!locked) {
> -			anon_vma_lock_write(anon_vma);
> -			locked = true;
> -		}
> -
> -		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
>  		list_del(&avc->same_vma);
>  		anon_vma_chain_free(avc);
>  	}
> -- 
> 2.52.0
> 
