Message-ID: <7f7b277a-7019-4bf4-b100-0505c6ce9737@redhat.com>
Date: Wed, 16 Oct 2024 10:23:14 -0400
From: Waiman Long <llong@...hat.com>
To: lizhe.67@...edance.com, peterz@...radead.org, mingo@...hat.com,
will@...nel.org, boqun.feng@...il.com, akpm@...ux-foundation.org
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [RFC 1/2] rwsem: introduce upgrade_read interface
On 10/16/24 12:35 AM, lizhe.67@...edance.com wrote:
> From: Li Zhe <lizhe.67@...edance.com>
>
> Introduce a new rwsem interface, upgrade_read(), which upgrades an
> already-held read lock to a write lock. The interface waits for all
> other readers to exit before taking the write lock. In addition, it
> takes priority over any process already waiting for the write lock
> and over any subsequent threads trying to take the read lock.
>
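For context, the usage pattern implied by the above presumably looks
something like this (an illustrative sketch of mine, not part of the
patch; the lock name is made up):

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);	/* hypothetical lock */

static void example_upgrade(void)
{
	down_read(&example_sem);
	/* ... read-side work ... */
	if (upgrade_read(&example_sem) == 0) {
		/* upgraded: we now hold the lock as the writer */
		/* ... write-side work ... */
		up_write(&example_sem);
	} else {
		/* -EBUSY: a writer or another upgrader won; we remain a reader */
		up_read(&example_sem);
	}
}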
> Signed-off-by: Li Zhe <lizhe.67@...edance.com>
> ---
> include/linux/rwsem.h | 1 +
> kernel/locking/rwsem.c | 87 ++++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 85 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index c8b543d428b0..90183ab5ea79 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -249,6 +249,7 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
> * downgrade write lock to read lock
> */
> extern void downgrade_write(struct rw_semaphore *sem);
> +extern int upgrade_read(struct rw_semaphore *sem);
>
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> /*
> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index 2bbb6eca5144..0583e1be3dbf 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -37,6 +37,7 @@
> * meanings when set.
> * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
> * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
> + * - Bit 2: RWSEM_UPGRADING - a reader-to-writer upgrade is in progress
> *
> * When the rwsem is reader-owned and a spinning writer has timed out,
> * the nonspinnable bit will be set to disable optimistic spinning.
> @@ -62,7 +63,8 @@
> */
> #define RWSEM_READER_OWNED (1UL << 0)
> #define RWSEM_NONSPINNABLE (1UL << 1)
> -#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
> +#define RWSEM_UPGRADING (1UL << 2)
> +#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE | RWSEM_UPGRADING)
>
> #ifdef CONFIG_DEBUG_RWSEMS
> # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
> @@ -93,7 +95,8 @@
> * Bit 0 - writer locked bit
> * Bit 1 - waiters present bit
> * Bit 2 - lock handoff bit
> - * Bits 3-7 - reserved
> + * Bit 3 - upgrade read bit
> + * Bits 4-7 - reserved
> * Bits 8-30 - 23-bit reader count
> * Bit 31 - read fail bit
> *
> @@ -117,6 +120,7 @@
> #define RWSEM_WRITER_LOCKED (1UL << 0)
> #define RWSEM_FLAG_WAITERS (1UL << 1)
> #define RWSEM_FLAG_HANDOFF (1UL << 2)
> +#define RWSEM_FLAG_UPGRADE_READ (1UL << 3)
> #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
>
> #define RWSEM_READER_SHIFT 8
> @@ -143,6 +147,13 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
> atomic_long_set(&sem->owner, (long)current);
> }
>
> +static inline void rwsem_set_owner_upgrade(struct rw_semaphore *sem)
> +{
> + lockdep_assert_preemption_disabled();
> + atomic_long_set(&sem->owner, (long)current | RWSEM_UPGRADING |
> + RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
> +}
Because of a possible race between two competing upgraders, the
read-lock owner setting has to be atomic so that one cannot overwrite
the other.
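Something along these lines, for instance (a sketch only; the helper
name and the back-off policy are mine, not from the patch):

static inline bool rwsem_try_set_owner_upgrade(struct rw_semaphore *sem)
{
	long old = atomic_long_read(&sem->owner);
	long new = (long)current | RWSEM_UPGRADING |
		   RWSEM_READER_OWNED | RWSEM_NONSPINNABLE;

	/* lose the race if another upgrader has already claimed ownership */
	if (old & RWSEM_UPGRADING)
		return false;
	return atomic_long_try_cmpxchg(&sem->owner, &old, new);
}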
> +
> static inline void rwsem_clear_owner(struct rw_semaphore *sem)
> {
> lockdep_assert_preemption_disabled();
> @@ -201,7 +212,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
> */
> long count = atomic_long_read(&sem->count);
>
> - if (count & RWSEM_WRITER_MASK)
> + if ((count & RWSEM_WRITER_MASK) && !(count & RWSEM_FLAG_UPGRADE_READ))
> return false;
> return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
> }
> @@ -1336,6 +1347,8 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
> static inline void __up_read(struct rw_semaphore *sem)
> {
> long tmp;
> + unsigned long flags;
> + struct task_struct *owner;
>
> DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
> DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
> @@ -1349,6 +1362,9 @@ static inline void __up_read(struct rw_semaphore *sem)
> clear_nonspinnable(sem);
> rwsem_wake(sem);
> }
> + owner = rwsem_owner_flags(sem, &flags);
> + if (unlikely(!(tmp & RWSEM_READER_MASK) && (flags & RWSEM_UPGRADING)))
> + wake_up_process(owner);
> preempt_enable();
> }
>
> @@ -1641,6 +1657,71 @@ void downgrade_write(struct rw_semaphore *sem)
> }
> EXPORT_SYMBOL(downgrade_write);
>
> +static inline void rwsem_clear_upgrade_flag(struct rw_semaphore *sem)
> +{
> + atomic_long_andnot(RWSEM_FLAG_UPGRADE_READ, &sem->count);
> +}
> +
> +/*
> + * upgrade read lock to write lock
> + */
> +static inline int __upgrade_read(struct rw_semaphore *sem)
> +{
> + long tmp;
> +
> + preempt_disable();
> +
> + tmp = atomic_long_read(&sem->count);
> + do {
> + if (tmp & (RWSEM_WRITER_MASK | RWSEM_FLAG_UPGRADE_READ)) {
> + preempt_enable();
> + return -EBUSY;
> + }
> + } while (!atomic_long_try_cmpxchg(&sem->count, &tmp,
> + tmp + RWSEM_FLAG_UPGRADE_READ + RWSEM_WRITER_LOCKED - RWSEM_READER_BIAS));
> +
> + if ((tmp & RWSEM_READER_MASK) == RWSEM_READER_BIAS) {
> + /* fast path */
> + DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
> + rwsem_clear_upgrade_flag(sem);
> + rwsem_set_owner(sem);
> + preempt_enable();
> + return 0;
> + }
> + /* slow path */
> + raw_spin_lock_irq(&sem->wait_lock);
> + rwsem_set_owner_upgrade(sem);
> +
> + set_current_state(TASK_UNINTERRUPTIBLE);
> +
> + for (;;) {
> + if (!(atomic_long_read(&sem->count) & RWSEM_READER_MASK))
> + break;
> + raw_spin_unlock_irq(&sem->wait_lock);
> + schedule_preempt_disabled();
> + set_current_state(TASK_UNINTERRUPTIBLE);
> + raw_spin_lock_irq(&sem->wait_lock);
> + }
> +
> + rwsem_clear_upgrade_flag(sem);
> + rwsem_set_owner(sem);
> + __set_current_state(TASK_RUNNING);
> + raw_spin_unlock_irq(&sem->wait_lock);
> + preempt_enable();
> + return 0;
> +}
> +
> +/*
> + * upgrade read lock to write lock
> + *
> + * Return: 0 on success, error code on failure
> + */
> +int upgrade_read(struct rw_semaphore *sem)
> +{
> + return __upgrade_read(sem);
> +}
> +EXPORT_SYMBOL(upgrade_read);
This new interface should have an API similar to a trylock: true if
successful, false otherwise. I like the read_try_upgrade() name.
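That is, something like this (a sketch of the suggested convention
only, not actual code):

	/* true on success; on failure the caller still holds the read lock */
	extern bool read_try_upgrade(struct rw_semaphore *sem);

	/*
	 * if (read_try_upgrade(&sem)) -> now the writer, pair with up_write();
	 * else                        -> still a reader, up_read() as usual.
	 */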
Another alternative that I have been thinking about is a down_read()
variant that declares an intention to upgrade later. That would
ensure that only one active reader is allowed to upgrade. With it,
upgrade_read() would always succeed, possibly after some sleeping, as
long as the matching down_read() variant had been used.
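Roughly, with all names and exact semantics hypothetical:

	/* at most one task may hold the rwsem this way at any time */
	void down_read_upgradable(struct rw_semaphore *sem);

	/*
	 * Guaranteed to succeed for the down_read_upgradable() holder;
	 * may sleep until the remaining readers drain, after which the
	 * caller owns the lock as a writer and pairs with up_write().
	 */
	void upgrade_read(struct rw_semaphore *sem);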
Cheers,
Longman