Message-ID: <4ce4ec09b92664091e8935982d83dde3a4c7f898.1765970117.git.lorenzo.stoakes@oracle.com>
Date: Wed, 17 Dec 2025 12:27:09 +0000
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: Suren Baghdasaryan <surenb@...gle.com>,
"Liam R . Howlett" <Liam.Howlett@...cle.com>,
Vlastimil Babka <vbabka@...e.cz>,
Shakeel Butt <shakeel.butt@...ux.dev>,
David Hildenbrand <david@...nel.org>, Rik van Riel <riel@...riel.com>,
Harry Yoo <harry.yoo@...cle.com>, Jann Horn <jannh@...gle.com>,
Mike Rapoport <rppt@...nel.org>, Michal Hocko <mhocko@...e.com>,
Pedro Falcato <pfalcato@...e.de>, Chris Li <chriscli@...gle.com>,
Barry Song <v-songbaohua@...o.com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: [PATCH 7/8] mm/rmap: allocate anon_vma_chain objects unlocked when possible
There is no reason to allocate anon_vma_chain objects under the anon_vma
write lock when cloning - we can in fact assign these to the destination
VMA safely, as we hold the exclusive mmap lock and therefore preclude
anybody else from accessing these fields.
We need only take the anon_vma write lock when we link rbtree edges from
the anon_vma to the newly established AVCs.
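To illustrate, here is an abbreviated sketch of the resulting
anon_vma_clone() flow - error handling and the anon_vma reuse logic are
elided, see the diff below for the real thing:

	/*
	 * Phase 1: allocate and assign AVCs. No anon_vma lock is needed;
	 * the exclusive mmap lock prevents anybody observing them.
	 */
	list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
		avc = anon_vma_chain_alloc(GFP_KERNEL);
		if (!avc)
			goto enomem_failure;
		anon_vma_chain_assign(dst, avc, pavc->anon_vma);
	}

	/*
	 * Phase 2: take the anon_vma write lock only to insert the rbtree
	 * edges that make the new AVCs visible to rmap.
	 */
	anon_vma_lock_write(src->anon_vma);
	list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma)
		anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
	anon_vma_unlock_write(src->anon_vma);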
This also allows us to eliminate the weird GFP_NOWAIT, GFP_KERNEL dance
introduced in commit dd34739c03f2 ("mm: avoid anon_vma_chain allocation
under anon_vma lock"), further simplifying this logic.
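For reference, the removed dance allocated with GFP_NOWAIT while holding
the anon_vma write lock, and only on failure dropped the lock to retry
with GFP_KERNEL (abridged from the pre-patch code):

	avc = anon_vma_chain_alloc(GFP_NOWAIT);
	if (unlikely(!avc)) {
		anon_vma_unlock_write(src->anon_vma);
		avc = anon_vma_chain_alloc(GFP_KERNEL);	/* may sleep */
		if (!avc)
			goto enomem_failure;
		anon_vma_lock_write(src->anon_vma);
	}

Since all allocations now happen before the lock is taken, GFP_KERNEL can
be used unconditionally.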
This should reduce anon_vma lock contention, and makes clear exactly
where the anon_vma lock is required.
We cannot adjust __anon_vma_prepare() in the same way, as it is protected
only by the VMA read lock, so there we must perform the linking under the
anon_vma write lock and page_table_lock (to protect against racing
threads), and we wish to retain the existing lock ordering.
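Abridged, the ordering retained in __anon_vma_prepare() after this change
looks as follows (the AVC allocation was already unlocked; cleanup paths
are elided):

	avc = anon_vma_chain_alloc(GFP_KERNEL);	/* unlocked, may sleep */

	anon_vma_lock_write(anon_vma);
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_assign(vma, avc, anon_vma);
		anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
		anon_vma->num_active_vmas++;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);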
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
---
mm/rmap.c | 49 +++++++++++++++++++++++++++++--------------------
1 file changed, 29 insertions(+), 20 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 60134a566073..de9de6d71c23 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -146,14 +146,13 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
-static void anon_vma_chain_link(struct vm_area_struct *vma,
- struct anon_vma_chain *avc,
- struct anon_vma *anon_vma)
+static void anon_vma_chain_assign(struct vm_area_struct *vma,
+ struct anon_vma_chain *avc,
+ struct anon_vma *anon_vma)
{
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
- anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
@@ -210,7 +209,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
@@ -287,20 +287,28 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
check_anon_vma_clone(dst, src);
+ /*
+ * Allocate AVCs. We don't need an anon_vma lock for this as we
+ * are not updating the anon_vma rbtree nor are we changing
+ * anon_vma statistics.
+ *
+ * We hold the mmap write lock so there's no possibility of
+ * the unlinked AVCs being observed yet.
+ */
+ list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc)
+ goto enomem_failure;
+
+ anon_vma_chain_assign(dst, avc, pavc->anon_vma);
+ }
+
+ /* Now link the anon_vmas back to the newly inserted AVCs. */
anon_vma_lock_write(src->anon_vma);
- list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma;
-
- avc = anon_vma_chain_alloc(GFP_NOWAIT);
- if (unlikely(!avc)) {
- anon_vma_unlock_write(src->anon_vma);
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto enomem_failure;
- anon_vma_lock_write(src->anon_vma);
- }
- anon_vma = pavc->anon_vma;
- anon_vma_chain_link(dst, avc, anon_vma);
+ list_for_each_entry_reverse(avc, &dst->anon_vma_chain, same_vma) {
+ struct anon_vma *anon_vma = avc->anon_vma;
+
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
/*
* Reuse existing anon_vma if it has no vma and only one
@@ -316,7 +324,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
}
if (dst->anon_vma)
dst->anon_vma->num_active_vmas++;
-
anon_vma_unlock_write(src->anon_vma);
return 0;
@@ -385,8 +392,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
+ anon_vma_chain_assign(vma, avc, anon_vma);
+ /* Now let rmap see it. */
anon_vma_lock_write(anon_vma);
- anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
--
2.52.0