[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEf4BzYi7jE39NSBeEk3_ytDhDGrvQEN67qWR=-TpzSiJfQGWw@mail.gmail.com>
Date: Sun, 27 Oct 2024 18:24:38 -0700
From: Andrii Nakryiko <andrii.nakryiko@...il.com>
To: Suren Baghdasaryan <surenb@...gle.com>
Cc: akpm@...ux-foundation.org, peterz@...radead.org, andrii@...nel.org,
jannh@...gle.com, Liam.Howlett@...cle.com, lorenzo.stoakes@...cle.com,
vbabka@...e.cz, mhocko@...nel.org, shakeel.butt@...ux.dev, hannes@...xchg.org,
david@...hat.com, willy@...radead.org, brauner@...nel.org, oleg@...hat.com,
arnd@...db.de, richard.weiyang@...il.com, zhangpeng.00@...edance.com,
linmiaohe@...wei.com, viro@...iv.linux.org.uk, hca@...ux.ibm.com,
linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/2] mm: convert mm_lock_seq to a proper seqcount
On Sun, Oct 27, 2024 at 5:57 PM Andrii Nakryiko
<andrii.nakryiko@...il.com> wrote:
>
> On Thu, Oct 24, 2024 at 1:52 PM Suren Baghdasaryan <surenb@...gle.com> wrote:
> >
> > Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock
> > variants to increment it, in-line with the usual seqcount usage pattern.
> > This lets us check whether the mmap_lock is write-locked by checking
> > mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be
> > used when implementing mmap_lock speculation functions.
> > As a result vm_lock_seq is also change to be unsigned to match the type
> > of mm_lock_seq.sequence.
> >
> > Suggested-by: Peter Zijlstra <peterz@...radead.org>
> > Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
> > ---
> > Applies over mm-unstable
> >
> > This conversion was discussed at [1] and these patches will likely be
> > incorporated into the next version od Andrii's patcset.
>
> I got a notification that this patch set was applied to mm-unstable by
> Andrew. But I was wondering if Andrew and Peter would agree to move
> the patches into tip's perf/core branch, given this is a dependency of
> my pending uprobe series ([0]) and, as far as I'm aware, there is no
> urgent need for this API in mm tree(s). If we route this through
> mm-unstable, we either need a stable tag from mm tree for Peter to
> merge into perf/core (a bit of a hassle for both Andrew and Peter, I'm
> sure), or I'd have to wait for 5-6 weeks until after next merge window
> closes, which would be a huge bummer, given I'd been at this for a
> while already with basically done patches, and would prefer to get my
> changes sooner.
>
> So, I'd very much prefer to just route these changes through
> perf/core, if mm folks don't oppose this? In fact, I'll go ahead and
> will send my patches with Suren's patches included with the assumption
> that we can reroute all this. Thanks for understanding!
>
> P.S. And yeah, Suren's patches apply cleanly to perf/core just as
> well, I checked.
Sent all of that as v4, see [2]
[2] https://lore.kernel.org/linux-trace-kernel/20241028010818.2487581-1-andrii@kernel.org/
>
> [0] https://lore.kernel.org/linux-trace-kernel/20241010205644.3831427-1-andrii@kernel.org/
>
> > The issue of the seqcount_t.sequence being an unsigned rather than
> > unsigned long will be addressed separately in collaoration with Jann Horn.
> >
> > [1] https://lore.kernel.org/all/20241010205644.3831427-2-andrii@kernel.org/
> >
> > include/linux/mm.h | 12 +++----
> > include/linux/mm_types.h | 7 ++--
> > include/linux/mmap_lock.h | 58 +++++++++++++++++++++-----------
> > kernel/fork.c | 5 +--
> > mm/init-mm.c | 2 +-
> > tools/testing/vma/vma.c | 4 +--
> > tools/testing/vma/vma_internal.h | 4 +--
> > 7 files changed, 56 insertions(+), 36 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 4ef8cf1043f1..77644118b200 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -698,7 +698,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > * we don't rely on for anything - the mm_lock_seq read against which we
> > * need ordering is below.
> > */
> > - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
> > return false;
> >
> > if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > @@ -715,7 +715,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> > * after it has been unlocked.
> > * This pairs with RELEASE semantics in vma_end_write_all().
> > */
> > - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> > + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> > up_read(&vma->vm_lock->lock);
> > return false;
> > }
> > @@ -730,7 +730,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
> > }
> >
> > /* WARNING! Can only be used if mmap_lock is expected to be write-locked */
> > -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
> > +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
> > {
> > mmap_assert_write_locked(vma->vm_mm);
> >
> > @@ -738,7 +738,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
> > * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> > * mm->mm_lock_seq can't be concurrently modified.
> > */
> > - *mm_lock_seq = vma->vm_mm->mm_lock_seq;
> > + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
> > return (vma->vm_lock_seq == *mm_lock_seq);
> > }
> >
> > @@ -749,7 +749,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
> > */
> > static inline void vma_start_write(struct vm_area_struct *vma)
> > {
> > - int mm_lock_seq;
> > + unsigned int mm_lock_seq;
> >
> > if (__is_vma_write_locked(vma, &mm_lock_seq))
> > return;
> > @@ -767,7 +767,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> >
> > static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> > {
> > - int mm_lock_seq;
> > + unsigned int mm_lock_seq;
> >
> > VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
> > }
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index ff8627acbaa7..80fef38d9d64 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -715,7 +715,7 @@ struct vm_area_struct {
> > * counter reuse can only lead to occasional unnecessary use of the
> > * slowpath.
> > */
> > - int vm_lock_seq;
> > + unsigned int vm_lock_seq;
> > /* Unstable RCU readers are allowed to read this. */
> > struct vma_lock *vm_lock;
> > #endif
> > @@ -887,6 +887,9 @@ struct mm_struct {
> > * Roughly speaking, incrementing the sequence number is
> > * equivalent to releasing locks on VMAs; reading the sequence
> > * number can be part of taking a read lock on a VMA.
> > + * Incremented every time mmap_lock is write-locked/unlocked.
> > + * Initialized to 0, therefore odd values indicate mmap_lock
> > + * is write-locked and even values that it's released.
> > *
> > * Can be modified under write mmap_lock using RELEASE
> > * semantics.
> > @@ -895,7 +898,7 @@ struct mm_struct {
> > * Can be read with ACQUIRE semantics if not holding write
> > * mmap_lock.
> > */
> > - int mm_lock_seq;
> > + seqcount_t mm_lock_seq;
> > #endif
> >
> >
> > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> > index de9dc20b01ba..6b3272686860 100644
> > --- a/include/linux/mmap_lock.h
> > +++ b/include/linux/mmap_lock.h
> > @@ -71,39 +71,38 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
> > }
> >
> > #ifdef CONFIG_PER_VMA_LOCK
> > -/*
> > - * Drop all currently-held per-VMA locks.
> > - * This is called from the mmap_lock implementation directly before releasing
> > - * a write-locked mmap_lock (or downgrading it to read-locked).
> > - * This should normally NOT be called manually from other places.
> > - * If you want to call this manually anyway, keep in mind that this will release
> > - * *all* VMA write locks, including ones from further up the stack.
> > - */
> > -static inline void vma_end_write_all(struct mm_struct *mm)
> > +static inline void mm_lock_seqcount_init(struct mm_struct *mm)
> > {
> > - mmap_assert_write_locked(mm);
> > - /*
> > - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
> > - * mmap_lock being held.
> > - * We need RELEASE semantics here to ensure that preceding stores into
> > - * the VMA take effect before we unlock it with this store.
> > - * Pairs with ACQUIRE semantics in vma_start_read().
> > - */
> > - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
> > + seqcount_init(&mm->mm_lock_seq);
> > +}
> > +
> > +static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
> > +{
> > + do_raw_write_seqcount_begin(&mm->mm_lock_seq);
> > +}
> > +
> > +static inline void mm_lock_seqcount_end(struct mm_struct *mm)
> > +{
> > + do_raw_write_seqcount_end(&mm->mm_lock_seq);
> > }
> > +
> > #else
> > -static inline void vma_end_write_all(struct mm_struct *mm) {}
> > +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
> > +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
> > +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
> > #endif
> >
> > static inline void mmap_init_lock(struct mm_struct *mm)
> > {
> > init_rwsem(&mm->mmap_lock);
> > + mm_lock_seqcount_init(mm);
> > }
> >
> > static inline void mmap_write_lock(struct mm_struct *mm)
> > {
> > __mmap_lock_trace_start_locking(mm, true);
> > down_write(&mm->mmap_lock);
> > + mm_lock_seqcount_begin(mm);
> > __mmap_lock_trace_acquire_returned(mm, true, true);
> > }
> >
> > @@ -111,6 +110,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
> > {
> > __mmap_lock_trace_start_locking(mm, true);
> > down_write_nested(&mm->mmap_lock, subclass);
> > + mm_lock_seqcount_begin(mm);
> > __mmap_lock_trace_acquire_returned(mm, true, true);
> > }
> >
> > @@ -120,10 +120,30 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
> >
> > __mmap_lock_trace_start_locking(mm, true);
> > ret = down_write_killable(&mm->mmap_lock);
> > + if (!ret)
> > + mm_lock_seqcount_begin(mm);
> > __mmap_lock_trace_acquire_returned(mm, true, ret == 0);
> > return ret;
> > }
> >
> > +/*
> > + * Drop all currently-held per-VMA locks.
> > + * This is called from the mmap_lock implementation directly before releasing
> > + * a write-locked mmap_lock (or downgrading it to read-locked).
> > + * This should normally NOT be called manually from other places.
> > + * If you want to call this manually anyway, keep in mind that this will release
> > + * *all* VMA write locks, including ones from further up the stack.
> > + */
> > +static inline void vma_end_write_all(struct mm_struct *mm)
> > +{
> > + mmap_assert_write_locked(mm);
> > + /*
> > + * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
> > + * mmap_lock being held.
> > + */
> > + mm_lock_seqcount_end(mm);
> > +}
> > +
> > static inline void mmap_write_unlock(struct mm_struct *mm)
> > {
> > __mmap_lock_trace_released(mm, true);
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index fd528fb5e305..0cae6fc651f0 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -447,7 +447,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma)
> > return false;
> >
> > init_rwsem(&vma->vm_lock->lock);
> > - vma->vm_lock_seq = -1;
> > + vma->vm_lock_seq = UINT_MAX;
> >
> > return true;
> > }
> > @@ -1260,9 +1260,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> > seqcount_init(&mm->write_protect_seq);
> > mmap_init_lock(mm);
> > INIT_LIST_HEAD(&mm->mmlist);
> > -#ifdef CONFIG_PER_VMA_LOCK
> > - mm->mm_lock_seq = 0;
> > -#endif
> > mm_pgtables_bytes_init(mm);
> > mm->map_count = 0;
> > mm->locked_vm = 0;
> > diff --git a/mm/init-mm.c b/mm/init-mm.c
> > index 24c809379274..6af3ad675930 100644
> > --- a/mm/init-mm.c
> > +++ b/mm/init-mm.c
> > @@ -40,7 +40,7 @@ struct mm_struct init_mm = {
> > .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
> > .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
> > #ifdef CONFIG_PER_VMA_LOCK
> > - .mm_lock_seq = 0,
> > + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
> > #endif
> > .user_ns = &init_user_ns,
> > .cpu_bitmap = CPU_BITS_NONE,
> > diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> > index 8fab5e13c7c3..9bcf1736bf18 100644
> > --- a/tools/testing/vma/vma.c
> > +++ b/tools/testing/vma/vma.c
> > @@ -89,7 +89,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> > * begun. Linking to the tree will have caused this to be incremented,
> > * which means we will get a false positive otherwise.
> > */
> > - vma->vm_lock_seq = -1;
> > + vma->vm_lock_seq = UINT_MAX;
> >
> > return vma;
> > }
> > @@ -214,7 +214,7 @@ static bool vma_write_started(struct vm_area_struct *vma)
> > int seq = vma->vm_lock_seq;
> >
> > /* We reset after each check. */
> > - vma->vm_lock_seq = -1;
> > + vma->vm_lock_seq = UINT_MAX;
> >
> > /* The vma_start_write() stub simply increments this value. */
> > return seq > -1;
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index e76ff579e1fd..1d9fc97b8e80 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -241,7 +241,7 @@ struct vm_area_struct {
> > * counter reuse can only lead to occasional unnecessary use of the
> > * slowpath.
> > */
> > - int vm_lock_seq;
> > + unsigned int vm_lock_seq;
> > struct vma_lock *vm_lock;
> > #endif
> >
> > @@ -416,7 +416,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma)
> > return false;
> >
> > init_rwsem(&vma->vm_lock->lock);
> > - vma->vm_lock_seq = -1;
> > + vma->vm_lock_seq = UINT_MAX;
> >
> > return true;
> > }
> >
> > base-commit: 9c111059234a949a4d3442a413ade19cc65ab927
> > --
> > 2.47.0.163.g1226f6d8fa-goog
> >
Powered by blists - more mailing lists