lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7f7b277a-7019-4bf4-b100-0505c6ce9737@redhat.com>
Date: Wed, 16 Oct 2024 10:23:14 -0400
From: Waiman Long <llong@...hat.com>
To: lizhe.67@...edance.com, peterz@...radead.org, mingo@...hat.com,
 will@...nel.org, boqun.feng@...il.com, akpm@...ux-foundation.org
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [RFC 1/2] rwsem: introduce upgrade_read interface


On 10/16/24 12:35 AM, lizhe.67@...edance.com wrote:
> From: Li Zhe <lizhe.67@...edance.com>
>
> Introduce a new rwsem interface, upgrade_read(). We can call it to
> upgrade a read lock into a write lock after we have acquired the read
> lock. This interface will wait for all other readers to exit before
> obtaining the write lock. In addition, this interface has a higher
> priority than any process waiting for the write lock and any subsequent
> threads that want to obtain the read lock.
>
> Signed-off-by: Li Zhe <lizhe.67@...edance.com>
> ---
>   include/linux/rwsem.h  |  1 +
>   kernel/locking/rwsem.c | 87 ++++++++++++++++++++++++++++++++++++++++--
>   2 files changed, 85 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index c8b543d428b0..90183ab5ea79 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -249,6 +249,7 @@ DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
>    * downgrade write lock to read lock
>    */
>   extern void downgrade_write(struct rw_semaphore *sem);
> +extern int upgrade_read(struct rw_semaphore *sem);
>   
>   #ifdef CONFIG_DEBUG_LOCK_ALLOC
>   /*
> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index 2bbb6eca5144..0583e1be3dbf 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -37,6 +37,7 @@
>    * meanings when set.
>    *  - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
>    *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
> + *  - Bit 2: RWSEM_UPGRADING    - doing upgrade read process
>    *
>    * When the rwsem is reader-owned and a spinning writer has timed out,
>    * the nonspinnable bit will be set to disable optimistic spinning.
> @@ -62,7 +63,8 @@
>    */
>   #define RWSEM_READER_OWNED	(1UL << 0)
>   #define RWSEM_NONSPINNABLE	(1UL << 1)
> -#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
> +#define RWSEM_UPGRADING		(1UL << 2)
> +#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE | RWSEM_UPGRADING)
>   
>   #ifdef CONFIG_DEBUG_RWSEMS
>   # define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
> @@ -93,7 +95,8 @@
>    * Bit  0    - writer locked bit
>    * Bit  1    - waiters present bit
>    * Bit  2    - lock handoff bit
> - * Bits 3-7  - reserved
> + * Bit  3    - upgrade read bit
> + * Bits 4-7  - reserved
>    * Bits 8-30 - 23-bit reader count
>    * Bit  31   - read fail bit
>    *
> @@ -117,6 +120,7 @@
>   #define RWSEM_WRITER_LOCKED	(1UL << 0)
>   #define RWSEM_FLAG_WAITERS	(1UL << 1)
>   #define RWSEM_FLAG_HANDOFF	(1UL << 2)
> +#define RWSEM_FLAG_UPGRADE_READ	(1UL << 3)
>   #define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))
>   
>   #define RWSEM_READER_SHIFT	8
> @@ -143,6 +147,13 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem)
>   	atomic_long_set(&sem->owner, (long)current);
>   }
>   
> +static inline void rwsem_set_owner_upgrade(struct rw_semaphore *sem)
> +{
> +	lockdep_assert_preemption_disabled();
> +	atomic_long_set(&sem->owner, (long)current | RWSEM_UPGRADING |
> +			RWSEM_READER_OWNED | RWSEM_NONSPINNABLE);
> +}

Because of possible racing between two competing upgraders, the read-lock
owner setting has to be atomic to avoid one overwriting the other's update.


> +
>   static inline void rwsem_clear_owner(struct rw_semaphore *sem)
>   {
>   	lockdep_assert_preemption_disabled();
> @@ -201,7 +212,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
>   	 */
>   	long count = atomic_long_read(&sem->count);
>   
> -	if (count & RWSEM_WRITER_MASK)
> +	if ((count & RWSEM_WRITER_MASK) && !(count & RWSEM_FLAG_UPGRADE_READ))
>   		return false;
>   	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
>   }
> @@ -1336,6 +1347,8 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
>   static inline void __up_read(struct rw_semaphore *sem)
>   {
>   	long tmp;
> +	unsigned long flags;
> +	struct task_struct *owner;
>   
>   	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
>   	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
> @@ -1349,6 +1362,9 @@ static inline void __up_read(struct rw_semaphore *sem)
>   		clear_nonspinnable(sem);
>   		rwsem_wake(sem);
>   	}
> +	owner = rwsem_owner_flags(sem, &flags);
> +	if (unlikely(!(tmp & RWSEM_READER_MASK) && (flags & RWSEM_UPGRADING)))
> +		wake_up_process(owner);
>   	preempt_enable();
>   }
>   
> @@ -1641,6 +1657,71 @@ void downgrade_write(struct rw_semaphore *sem)
>   }
>   EXPORT_SYMBOL(downgrade_write);
>   
> +static inline void rwsem_clear_upgrade_flag(struct rw_semaphore *sem)
> +{
> +	atomic_long_andnot(RWSEM_FLAG_UPGRADE_READ, &sem->count);
> +}
> +
> +/*
> + * upgrade read lock to write lock
> + */
> +static inline int __upgrade_read(struct rw_semaphore *sem)
> +{
> +	long tmp;
> +
> +	preempt_disable();
> +
> +	tmp = atomic_long_read(&sem->count);
> +	do {
> +		if (tmp & (RWSEM_WRITER_MASK | RWSEM_FLAG_UPGRADE_READ)) {
> +			preempt_enable();
> +			return -EBUSY;
> +		}
> +	} while (!atomic_long_try_cmpxchg(&sem->count, &tmp,
> +		tmp + RWSEM_FLAG_UPGRADE_READ + RWSEM_WRITER_LOCKED - RWSEM_READER_BIAS));
> +
> +	if ((tmp & RWSEM_READER_MASK) == RWSEM_READER_BIAS) {
> +		/* fast path */
> +		DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
> +		rwsem_clear_upgrade_flag(sem);
> +		rwsem_set_owner(sem);
> +		preempt_enable();
> +		return 0;
> +	}
> +	/* slow path */
> +	raw_spin_lock_irq(&sem->wait_lock);
> +	rwsem_set_owner_upgrade(sem);
> +
> +	set_current_state(TASK_UNINTERRUPTIBLE);
> +
> +	for (;;) {
> +		if (!(atomic_long_read(&sem->count) & RWSEM_READER_MASK))
> +			break;
> +		raw_spin_unlock_irq(&sem->wait_lock);
> +		schedule_preempt_disabled();
> +		set_current_state(TASK_UNINTERRUPTIBLE);
> +		raw_spin_lock_irq(&sem->wait_lock);
> +	}
> +
> +	rwsem_clear_upgrade_flag(sem);
> +	rwsem_set_owner(sem);
> +	__set_current_state(TASK_RUNNING);
> +	raw_spin_unlock_irq(&sem->wait_lock);
> +	preempt_enable();
> +	return 0;
> +}
> +
> +/*
> + * upgrade read lock to write lock
> + *
> + * Return: 0 on success, error code on failure
> + */
> +int upgrade_read(struct rw_semaphore *sem)
> +{
> +	return __upgrade_read(sem);
> +}
> +EXPORT_SYMBOL(upgrade_read);

This new interface should have an API similar to a trylock: return true if 
successful, false otherwise. I like the read_try_upgrade() name.

Another alternative that I have been thinking about is a down_read() 
variant with intention to upgrade later. This will ensure that only one 
active reader is allowed to upgrade later. With this, upgrade_read() 
will always succeed, maybe with some sleeping, as long as the correct 
down_read() is used.

Cheers,
Longman


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ