linux-kernel - Re: [RFC] Sleep waiting for an rwsem to be unlocked

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAJuCfpHpGyiXePsQNWuGwfYAC64jtrJmp5kZBeQwYwD1p+SkAg@mail.gmail.com>
Date: Mon, 15 Jan 2024 10:44:02 -0800
From: Suren Baghdasaryan <surenb@...gle.com>
To: Matthew Wilcox <willy@...radead.org>
Cc: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>, 
	Will Deacon <will@...nel.org>, Waiman Long <longman@...hat.com>, linux-kernel@...r.kernel.org, 
	"Liam R. Howlett" <liam.howlett@...cle.com>, "Paul E. McKenney" <paulmck@...nel.org>
Subject: Re: [RFC] Sleep waiting for an rwsem to be unlocked

On Tue, Jan 9, 2024 at 9:12 AM Matthew Wilcox <willy@...radead.org> wrote:
>
> The problem we're trying to solve is a lock-free walk of
> /proc/$pid/maps. If the process is modifying the VMAs at the same time
> the reader is walking them, it can see garbage.  For page faults, we
> handle this by taking the mmap_lock for read and retrying the page fault
> (excluding any further modifications).
>
> We don't want to take that approach for the maps file.  The monitoring
> task may have a significantly lower process priority, and so taking
> the mmap_lock for read can block it for a significant period of time.
> The obvious answer is to do some kind of backoff+sleep.  But we already
> have a wait queue, so why not use it?
>
> I haven't done the rwbase version; this is just a demonstration of what
> we could do.  It's also untested other than by compilation.  It might
> well be missing something.

I just posted an RFC for lock-less /proc/$pid/maps reading at [1]. The
rwsem_wait() function proposed by Matthew here would be useful in that
patchset to replace the mmap_read_lock/mmap_read_unlock sequence I have
to use to wait for the mmap_lock writer to finish.

[1] https://lore.kernel.org/all/20240115183837.205694-1-surenb@google.com/
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@...radead.org>
> ---
>  include/linux/rwsem.h  |   6 +++
>  kernel/locking/rwsem.c | 104 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 108 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index 4f1c18992f76..e7bf9dfc471a 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -250,6 +250,12 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
>   */
>  extern void downgrade_write(struct rw_semaphore *sem);
>
> +/*
> + * wait for current writer to be finished
> + */
> +void rwsem_wait(struct rw_semaphore *sem);
> +int __must_check rwsem_wait_killable(struct rw_semaphore *sem);
> +
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  /*
>   * nested locking. NOTE: rwsems are not allowed to recurse
> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index 2340b6d90ec6..7c8096c5586f 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -332,7 +332,8 @@ EXPORT_SYMBOL(__init_rwsem);
>
>  enum rwsem_waiter_type {
>         RWSEM_WAITING_FOR_WRITE,
> -       RWSEM_WAITING_FOR_READ
> +       RWSEM_WAITING_FOR_READ,
> +       RWSEM_WAITING_FOR_RELEASE,
>  };
>
>  struct rwsem_waiter {
> @@ -511,7 +512,8 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
>                 if (waiter->type == RWSEM_WAITING_FOR_WRITE)
>                         continue;
>
> -               woken++;
> +               if (waiter->type == RWSEM_WAITING_FOR_READ)
> +                       woken++;
>                 list_move_tail(&waiter->list, &wlist);
>
>                 /*
> @@ -1401,6 +1403,67 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
>         preempt_enable();
>  }
>
> +static inline int __wait_read_common(struct rw_semaphore *sem, int state)
> +{
> +       int ret = 0;
> +       long adjustment = 0;
> +       struct rwsem_waiter waiter;
> +       DEFINE_WAKE_Q(wake_q);
> +
> +       waiter.task = current;
> +       waiter.type = RWSEM_WAITING_FOR_RELEASE;
> +       waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
> +       waiter.handoff_set = false;
> +
> +       preempt_disable();
> +       raw_spin_lock_irq(&sem->wait_lock);
> +       if (list_empty(&sem->wait_list)) {
> +               if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
> +                       /* Provide lock ACQUIRE */
> +                       smp_acquire__after_ctrl_dep();
> +                       raw_spin_unlock_irq(&sem->wait_lock);
> +                       goto done;
> +               }
> +               adjustment = RWSEM_FLAG_WAITERS;
> +       }
> +       rwsem_add_waiter(sem, &waiter);
> +       if (adjustment) {
> +               long count = atomic_long_add_return(adjustment, &sem->count);
> +               rwsem_cond_wake_waiter(sem, count, &wake_q);
> +       }
> +       raw_spin_unlock_irq(&sem->wait_lock);
> +
> +       if (!wake_q_empty(&wake_q))
> +               wake_up_q(&wake_q);
> +
> +       for (;;) {
> +               set_current_state(state);
> +               if (!smp_load_acquire(&waiter.task)) {
> +                       /* Matches rwsem_mark_wake()'s smp_store_release(). */
> +                       break;
> +               }
> +               if (signal_pending_state(state, current)) {
> +                       raw_spin_lock_irq(&sem->wait_lock);
> +                       if (waiter.task)
> +                               goto out_nolock;
> +                       raw_spin_unlock_irq(&sem->wait_lock);
> +                       /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
> +                       break;
> +               }
> +               schedule_preempt_disabled();
> +       }
> +
> +       __set_current_state(TASK_RUNNING);
> +done:
> +       preempt_enable();
> +       return ret;
> +out_nolock:
> +       rwsem_del_wake_waiter(sem, &waiter, &wake_q);
> +       __set_current_state(TASK_RUNNING);
> +       ret = -EINTR;
> +       goto done;
> +}
> +
>  #else /* !CONFIG_PREEMPT_RT */
>
>  #define RT_MUTEX_BUILD_MUTEX
> @@ -1500,6 +1563,11 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
>         rwbase_write_downgrade(&sem->rwbase);
>  }
>
> +static inline int __wait_read_killable(struct rw_semaphore *sem)
> +{
> +       return rwbase_wait_lock(&sem->rwbase, TASK_KILLABLE);
> +}
> +
>  /* Debug stubs for the common API */
>  #define DEBUG_RWSEMS_WARN_ON(c, sem)
>
> @@ -1643,6 +1711,38 @@ void downgrade_write(struct rw_semaphore *sem)
>  }
>  EXPORT_SYMBOL(downgrade_write);
>
> +/**
> + * rwsem_wait_killable - Wait for current write lock holder to release lock
> + * @sem: The semaphore to wait on.
> + *
> + * This is equivalent to calling down_read(); up_read() but avoids the
> + * possibility that the thread will be preempted while holding the lock
> + * causing threads that want to take the lock for writes to block.  The
> + * intended use case is for lockless readers who notice an inconsistent
> + * state and want to wait for the current writer to finish.
> + */
> +int rwsem_wait_killable(struct rw_semaphore *sem)
> +{
> +       might_sleep();
> +
> +       rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
> +       rwsem_release(&sem->dep_map, _RET_IP_);
> +
> +       return __wait_read_common(sem, TASK_KILLABLE);
> +}
> +EXPORT_SYMBOL(rwsem_wait_killable);
> +
> +void rwsem_wait(struct rw_semaphore *sem)
> +{
> +       might_sleep();
> +
> +       rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
> +       rwsem_release(&sem->dep_map, _RET_IP_);
> +
> +       __wait_read_common(sem, TASK_UNINTERRUPTIBLE);
> +}
> +EXPORT_SYMBOL(rwsem_wait);
> +
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>
>  void down_read_nested(struct rw_semaphore *sem, int subclass)
> --
> 2.43.0
>