Message-ID: <20230109205336.3665937-42-surenb@google.com>
Date: Mon, 9 Jan 2023 12:53:36 -0800
From: Suren Baghdasaryan <surenb@...gle.com>
To: akpm@...ux-foundation.org
Cc: michel@...pinasse.org, jglisse@...gle.com, mhocko@...e.com,
vbabka@...e.cz, hannes@...xchg.org, mgorman@...hsingularity.net,
dave@...olabs.net, willy@...radead.org, liam.howlett@...cle.com,
peterz@...radead.org, ldufour@...ux.ibm.com,
laurent.dufour@...ibm.com, paulmck@...nel.org, luto@...nel.org,
songliubraving@...com, peterx@...hat.com, david@...hat.com,
dhowells@...hat.com, hughd@...gle.com, bigeasy@...utronix.de,
kent.overstreet@...ux.dev, punit.agrawal@...edance.com,
lstoakes@...il.com, peterjung1337@...il.com, rientjes@...gle.com,
axelrasmussen@...gle.com, joelaf@...gle.com, minchan@...gle.com,
jannh@...gle.com, shakeelb@...gle.com, tatashin@...gle.com,
edumazet@...gle.com, gthelen@...gle.com, gurua@...gle.com,
arjunroy@...gle.com, soheil@...gle.com, hughlynch@...gle.com,
leewalsh@...gle.com, posk@...gle.com, linux-mm@...ck.org,
linux-arm-kernel@...ts.infradead.org,
linuxppc-dev@...ts.ozlabs.org, x86@...nel.org,
linux-kernel@...r.kernel.org, kernel-team@...roid.com,
surenb@...gle.com
Subject: [PATCH 41/41] mm: replace rw_semaphore with atomic_t in vma_lock
rw_semaphore is a sizable structure of 40 bytes and consumes
considerable space in each vm_area_struct. However, vma_lock has two
important properties which allow rw_semaphore to be replaced with a
simpler structure:

1. Readers never wait. They try to take the vma_lock and fall back to
   mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock,
   because writers first take mmap_lock in write mode.

Because of these properties, full rw_semaphore functionality is not
needed and we can replace rw_semaphore with an atomic variable.

When a reader takes the read lock, it increments the atomic value
unless the value is negative. If that fails, read-locking is aborted
and mmap_lock is used instead.

When a writer takes the write lock, it sets the atomic value to -1 if
the current value is 0 (no readers). Since all writers take mmap_lock
in write mode first, there can be only one writer at a time. If there
are readers, the writer places itself on a wait queue using the new
mm_struct.vma_writer_wait waitqueue head. The last reader to release
the vma_lock will signal the writer to wake up.
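
For illustration, below is a minimal userspace model of this scheme,
using C11 <stdatomic.h> and a pthread condition variable in place of
the kernel's atomic_t and the vma_writer_wait waitqueue. It is a
sketch only, not the kernel code (the actual changes are in the diff
below); all names of the form model_* and vma_lock_model are made up
for this example. It also includes the overflow back-out described
near the end of this message.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct vma_lock_model {
	atomic_int count;            /* >0: readers, -1: writer, 0: unlocked */
	pthread_mutex_t wait_mutex;  /* protects the condvar below */
	pthread_cond_t writer_wait;  /* stand-in for mm_struct.vma_writer_wait */
};

#define VMA_LOCK_MODEL_INIT \
	{ 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

static void model_read_unlock(struct vma_lock_model *vl)
{
	/* The last reader out wakes a writer that may be waiting. */
	if (atomic_fetch_sub(&vl->count, 1) == 1) {
		pthread_mutex_lock(&vl->wait_mutex);
		pthread_cond_signal(&vl->writer_wait);
		pthread_mutex_unlock(&vl->wait_mutex);
	}
}

static bool model_read_trylock(struct vma_lock_model *vl)
{
	int old = atomic_load(&vl->count);

	/* Readers never wait: give up as soon as a writer holds the lock. */
	do {
		if (old < 0)
			return false;
	} while (!atomic_compare_exchange_weak(&vl->count, &old, old + 1));

	/*
	 * Overflow back-out: a huge number of concurrent readers could
	 * wrap the counter negative, so undo the increment and fail
	 * instead of reporting a locked result.
	 */
	if (atomic_load(&vl->count) < 0) {
		model_read_unlock(vl);
		return false;
	}
	return true;
}

static void model_write_lock(struct vma_lock_model *vl)
{
	int expected = 0;

	/*
	 * Only one writer can be here at a time because writers take
	 * mmap_lock in write mode first.  Flip 0 -> -1, or sleep until
	 * the last reader signals us, then retry.
	 */
	pthread_mutex_lock(&vl->wait_mutex);
	while (!atomic_compare_exchange_strong(&vl->count, &expected, -1)) {
		pthread_cond_wait(&vl->writer_wait, &vl->wait_mutex);
		expected = 0;
	}
	pthread_mutex_unlock(&vl->wait_mutex);
}

static void model_write_unlock(struct vma_lock_model *vl)
{
	atomic_store(&vl->count, 0);
}

int main(void)
{
	struct vma_lock_model vl = VMA_LOCK_MODEL_INIT;

	if (model_read_trylock(&vl))
		model_read_unlock(&vl);
	model_write_lock(&vl);
	model_write_unlock(&vl);
	return 0;
}
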
vm_lock_seq is also moved into vma_lock, and together with the
atomic_t the two fields pack nicely into 8 bytes, bringing the
overhead of vma_lock down from 44 to 16 bytes:

slabinfo before the changes:
 <name>          ... <objsize> <objperslab> <pagesperslab> : ...
 vm_area_struct  ...    152        53            2         : ...

slabinfo with vma_lock:
 <name>          ... <objsize> <objperslab> <pagesperslab> : ...
 rw_semaphore    ...      8       512            1         : ...
 vm_area_struct  ...    160        51            2         : ...

Assuming 40000 vm_area_structs, memory consumption would be:
baseline: 6040kB
vma_lock (vm_area_structs+vma_lock): 6280kB+316kB=6596kB
Total increase: 556kB
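
For reference, these numbers follow from the slab geometry shown
above, assuming 4kB pages and counting whole slabs:

  baseline: ceil(40000 / 53)  = 755 slabs * 2 pages * 4kB = 6040kB
  vma_lock: ceil(40000 / 51)  = 785 slabs * 2 pages * 4kB = 6280kB
            ceil(40000 / 512) =  79 slabs * 1 page  * 4kB =  316kB
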
atomic_t might overflow if there are many competing readers, therefore
vma_read_trylock() implements an overflow check; if an overflow
occurs, it restores the previous value and fails to take the lock.

Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
---
include/linux/mm.h | 37 +++++++++++++++++++++++++------------
include/linux/mm_types.h | 10 ++++++++--
kernel/fork.c | 6 +++---
mm/init-mm.c | 2 ++
4 files changed, 38 insertions(+), 17 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d40bf8a5e19e..294dd44b2198 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -627,12 +627,16 @@ static inline void vma_write_lock(struct vm_area_struct *vma)
* mm->mm_lock_seq can't be concurrently modified.
*/
mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
- if (vma->vm_lock_seq == mm_lock_seq)
+ if (vma->vm_lock->lock_seq == mm_lock_seq)
return;
- down_write(&vma->vm_lock->lock);
- vma->vm_lock_seq = mm_lock_seq;
- up_write(&vma->vm_lock->lock);
+ if (atomic_cmpxchg(&vma->vm_lock->count, 0, -1))
+ wait_event(vma->vm_mm->vma_writer_wait,
+ atomic_cmpxchg(&vma->vm_lock->count, 0, -1) == 0);
+ vma->vm_lock->lock_seq = mm_lock_seq;
+ /* Write barrier to ensure lock_seq change is visible before count */
+ smp_wmb();
+ atomic_set(&vma->vm_lock->count, 0);
}
/*
@@ -643,20 +647,28 @@ static inline void vma_write_lock(struct vm_area_struct *vma)
static inline bool vma_read_trylock(struct vm_area_struct *vma)
{
/* Check before locking. A race might cause false locked result. */
- if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ if (vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
return false;
- if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+ if (unlikely(!atomic_inc_unless_negative(&vma->vm_lock->count)))
return false;
+ /* If atomic_t overflows, restore and fail to lock. */
+ if (unlikely(atomic_read(&vma->vm_lock->count) < 0)) {
+ if (atomic_dec_and_test(&vma->vm_lock->count))
+ wake_up(&vma->vm_mm->vma_writer_wait);
+ return false;
+ }
+
/*
* Overflow might produce false locked result.
* False unlocked result is impossible because we modify and check
* vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
* modification invalidates all existing locks.
*/
- if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
- up_read(&vma->vm_lock->lock);
+ if (unlikely(vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+ if (atomic_dec_and_test(&vma->vm_lock->count))
+ wake_up(&vma->vm_mm->vma_writer_wait);
return false;
}
return true;
@@ -664,7 +676,8 @@ static inline bool vma_read_trylock(struct vm_area_struct *vma)
static inline void vma_read_unlock(struct vm_area_struct *vma)
{
- up_read(&vma->vm_lock->lock);
+ if (atomic_dec_and_test(&vma->vm_lock->count))
+ wake_up(&vma->vm_mm->vma_writer_wait);
}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -674,13 +687,13 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
* current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified.
*/
- VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+ VM_BUG_ON_VMA(vma->vm_lock->lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
}
static inline void vma_assert_no_reader(struct vm_area_struct *vma)
{
- VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock) &&
- vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq),
+ VM_BUG_ON_VMA(atomic_read(&vma->vm_lock->count) > 0 &&
+ vma->vm_lock->lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq),
vma);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index faa61b400f9b..a6050c38ca2e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -527,7 +527,13 @@ struct anon_vma_name {
};
struct vma_lock {
- struct rw_semaphore lock;
+ /*
+ * count > 0 ==> read-locked with 'count' number of readers
+ * count < 0 ==> write-locked
+ * count = 0 ==> unlocked
+ */
+ atomic_t count;
+ int lock_seq;
};
/*
@@ -566,7 +572,6 @@ struct vm_area_struct {
unsigned long vm_flags;
#ifdef CONFIG_PER_VMA_LOCK
- int vm_lock_seq;
struct vma_lock *vm_lock;
#endif
@@ -706,6 +711,7 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ struct wait_queue_head vma_writer_wait;
int mm_lock_seq;
struct {
struct list_head head;
diff --git a/kernel/fork.c b/kernel/fork.c
index 95db6a521cf1..b221ad182d98 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -461,9 +461,8 @@ static bool vma_init_lock(struct vm_area_struct *vma)
vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
if (!vma->vm_lock)
return false;
-
- init_rwsem(&vma->vm_lock->lock);
- vma->vm_lock_seq = -1;
+ atomic_set(&vma->vm_lock->count, 0);
+ vma->vm_lock->lock_seq = -1;
return true;
}
@@ -1229,6 +1228,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
#ifdef CONFIG_PER_VMA_LOCK
+ init_waitqueue_head(&mm->vma_writer_wait);
WRITE_ONCE(mm->mm_lock_seq, 0);
INIT_LIST_HEAD(&mm->vma_free_list.head);
spin_lock_init(&mm->vma_free_list.lock);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index b53d23c2d7a3..0088e31e5f7e 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -38,6 +38,8 @@ struct mm_struct init_mm = {
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
#ifdef CONFIG_PER_VMA_LOCK
+ .vma_writer_wait =
+ __WAIT_QUEUE_HEAD_INITIALIZER(init_mm.vma_writer_wait),
.mm_lock_seq = 0,
.vma_free_list.head = LIST_HEAD_INIT(init_mm.vma_free_list.head),
.vma_free_list.lock = __SPIN_LOCK_UNLOCKED(init_mm.vma_free_list.lock),
--
2.39.0