Message-ID: <20230721182350.845615-1-jannh@google.com>
Date: Fri, 21 Jul 2023 20:23:50 +0200
From: Jann Horn <jannh@...gle.com>
To: Andrew Morton <akpm@...ux-foundation.org>
Cc: Suren Baghdasaryan <surenb@...gle.com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org, Will Deacon <will@...nel.org>,
Peter Zijlstra <peterz@...radead.org>
Subject: [PATCH] mm: Fix memory ordering for mm_lock_seq and vm_lock_seq
mm->mm_lock_seq effectively functions as a read/write lock; therefore it
must be used with acquire/release semantics.
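Roughly speaking, the locking scheme looks like this (a simplified model,
ignoring vma->vm_lock->lock and memory ordering for a moment; the real code
is in include/linux/mm.h and include/linux/mmap_lock.h):

  /* write-lock a VMA (mmap_lock held in write mode): */
  vma->vm_lock_seq = mm->mm_lock_seq;

  /* unlock all write-locked VMAs when dropping mmap_lock: */
  mm->mm_lock_seq++;  /* now no VMA's vm_lock_seq matches anymore */

  /* read-lock a VMA (no mmap_lock held): */
  if (vma->vm_lock_seq == mm->mm_lock_seq)
          return false;  /* write-locked, fall back to taking mmap_lock */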
A specific example is the interaction between userfaultfd_register() and
lock_vma_under_rcu().
userfaultfd_register() does the following from the point where it changes
a VMA's flags to the point where concurrent readers are permitted again
(in a simple scenario where only a single private VMA is accessed and no
merging/splitting is involved):

userfaultfd_register
  userfaultfd_set_vm_flags
    vm_flags_reset
      vma_start_write
        down_write(&vma->vm_lock->lock)
        vma->vm_lock_seq = mm_lock_seq [marks VMA as busy]
        up_write(&vma->vm_lock->lock)
      vm_flags_init
        [sets VM_UFFD_* in __vm_flags]
  vma->vm_userfaultfd_ctx.ctx = ctx
  mmap_write_unlock
    vma_end_write_all
      WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1) [unlocks VMA]

There are no memory barriers in between the __vm_flags update and the
mm->mm_lock_seq update that unlocks the VMA, so the unlock can be reordered
up above the vm_flags_init() call, which means that from the perspective of
a concurrent reader, a VMA can be marked as a userfaultfd VMA while it is
not VMA-locked. That's bad; we definitely need a store-release for the
unlock operation.
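
To illustrate (a simplified sketch of the effective execution order, not
actual code), without the release the writer side may effectively run as:

  WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1)  [VMA now looks unlocked]
  <window in which lock_vma_under_rcu() can read-lock the VMA>
  [sets VM_UFFD_* in __vm_flags]
  vma->vm_userfaultfd_ctx.ctx = ctx
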
The non-atomic write to vma->vm_lock_seq in vma_start_write() is mostly
fine because all accesses to vma->vm_lock_seq that matter are always
protected by the VMA lock. There is, however, a racy read in
vma_start_read() that can tolerate false positives, so we should use
WRITE_ONCE() to keep things tidy and data-race-free (including for KCSAN).
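
Concretely, the racing pair of accesses on vma->vm_lock_seq is roughly
(simplified):

  vma_start_write():  vma->vm_lock_seq = mm_lock_seq;   /* plain write */
  vma_start_read():   vma->vm_lock_seq == ...           /* lockless early check */

Annotating both sides (WRITE_ONCE() in vma_start_write(), READ_ONCE() in the
early check in vma_start_read()) turns this into a documented benign race
instead of a KCSAN-reported data race.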
On the other side, lock_vma_under_rcu() works as follows in the relevant
region for locking and userfaultfd check:

lock_vma_under_rcu
  vma_start_read
    vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [early bailout]
    down_read_trylock(&vma->vm_lock->lock)
    vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [main check]
  userfaultfd_armed
    checks vma->vm_flags & __VM_UFFD_FLAGS

Here, the interesting aspect is how far down the mm->mm_lock_seq read
can be reordered - if this read is reordered down below the vma->vm_flags
access, this could cause lock_vma_under_rcu() to partly operate on
information that was read while the VMA was supposed to be locked.
To prevent this kind of downwards bleeding of the mm->mm_lock_seq read, we
need to read it with a load-acquire.
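
In other words (again a simplified sketch of the effective order, not actual
code), without the acquire the reader side could effectively run as:

  down_read_trylock(&vma->vm_lock->lock)
  checks vma->vm_flags & __VM_UFFD_FLAGS           [hoisted up]
  vma->vm_lock_seq == vma->vm_mm->mm_lock_seq      [main check done too late]

The sequence comparison can then pass even though the vm_flags value was
read before we had confirmed that the VMA is unlocked.
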
BACKPORT WARNING: One of the functions changed by this patch (which I've
written against Linus' tree) is vma_try_start_write(), but this function
no longer exists in mm/mm-everything. I don't know whether the merged
version of this patch will be ordered before or after the patch that
removes vma_try_start_write(). If you're backporting this patch to a
tree with vma_try_start_write(), make sure this patch changes that
function.
Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it")
Cc: stable@...r.kernel.org
Cc: Suren Baghdasaryan <surenb@...gle.com>
Signed-off-by: Jann Horn <jannh@...gle.com>
---
include/linux/mm.h | 21 ++++++++++++++++-----
include/linux/mm_types.h | 27 +++++++++++++++++++++++++++
include/linux/mmap_lock.h | 10 ++++++++--
3 files changed, 51 insertions(+), 7 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2dd73e4f3d8e..beba0bfd43da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -642,7 +642,7 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
static inline bool vma_start_read(struct vm_area_struct *vma)
{
/* Check before locking. A race might cause false locked result. */
- if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
return false;
if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
@@ -653,8 +653,13 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
* False unlocked result is impossible because we modify and check
* vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
* modification invalidates all existing locks.
+ *
+ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+ * racing with vma_end_write_all(), we only start reading from the VMA
+ * after it has been unlocked.
+ * This pairs with RELEASE semantics in vma_end_write_all().
*/
- if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+ if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
up_read(&vma->vm_lock->lock);
return false;
}
@@ -676,7 +681,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
* current task is holding mmap_write_lock, both vma->vm_lock_seq and
* mm->mm_lock_seq can't be concurrently modified.
*/
- *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+ *mm_lock_seq = vma->vm_mm->mm_lock_seq;
return (vma->vm_lock_seq == *mm_lock_seq);
}
@@ -688,7 +693,13 @@ static inline void vma_start_write(struct vm_area_struct *vma)
return;
down_write(&vma->vm_lock->lock);
- vma->vm_lock_seq = mm_lock_seq;
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
up_write(&vma->vm_lock->lock);
}
@@ -702,7 +713,7 @@ static inline bool vma_try_start_write(struct vm_area_struct *vma)
if (!down_write_trylock(&vma->vm_lock->lock))
return false;
- vma->vm_lock_seq = mm_lock_seq;
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
up_write(&vma->vm_lock->lock);
return true;
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index de10fc797c8e..aa2444966f5f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -514,6 +514,19 @@ struct vm_area_struct {
};
#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * Can only be written (using WRITE_ONCE()) while holding both:
+ * - mmap_lock (in write mode)
+ * - vm_lock->lock (in write mode)
+ * Can be read reliably while holding:
+ * - vm_lock->lock (in read or write mode)
+ * Can be read unreliably (for pessimistic bailout) while holding
+ * nothing (except RCU to keep the VMA struct allocated).
+ *
+ * This sequence counter is explicitly allowed to overflow; sequence
+ * counter reuse can only lead to occasional unnecessary use of the
+ * slowpath.
+ */
int vm_lock_seq;
struct vma_lock *vm_lock;
@@ -679,6 +692,20 @@ struct mm_struct {
* by mmlist_lock
*/
#ifdef CONFIG_PER_VMA_LOCK
+ /*
+ * This field has lock-like semantics; see also
+ * vma->vm_lock_seq.
+ * Incrementing the sequence number is equivalent to releasing
+ * locks on VMAs and requires RELEASE semantics; reading the
+ * sequence number is part of taking a read lock on a VMA and
+ * requires ACQUIRE semantics.
+ *
+ * Can be written (with RELEASE semantics) while holding
+ * mmap_lock in write mode.
+ * Can be read (with ACQUIRE semantics) without holding any
+ * locks on the MM (but you need to have a VMA locked to be able
+ * to do anything useful with the result).
+ */
int mm_lock_seq;
#endif
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index aab8f1b28d26..e05e167dbd16 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -76,8 +76,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
static inline void vma_end_write_all(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
- /* No races during update due to exclusive mmap_lock being held */
- WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+ /*
+ * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
+ * mmap_lock being held.
+ * We need RELEASE semantics here to ensure that preceding stores into
+ * the VMA take effect before we unlock it with this store.
+ * Pairs with ACQUIRE semantics in vma_start_read().
+ */
+ smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
}
#else
static inline void vma_end_write_all(struct mm_struct *mm) {}
base-commit: d192f5382581d972c4ae1b4d72e0b59b34cadeb9
--
2.41.0.487.g6d72f3e995-goog