Date:   Tue, 11 Jun 2019 15:27:15 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Waiman Long <longman@...hat.com>
Cc:     Ingo Molnar <mingo@...hat.com>, Will Deacon <will.deacon@....com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Borislav Petkov <bp@...en8.de>,
        "H. Peter Anvin" <hpa@...or.com>, linux-kernel@...r.kernel.org,
        x86@...nel.org, Davidlohr Bueso <dave@...olabs.net>,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Tim Chen <tim.c.chen@...ux.intel.com>,
        huang ying <huang.ying.caritas@...il.com>
Subject: Re: [PATCH v8 16/19] locking/rwsem: Guard against making count
 negative

On Tue, Jun 11, 2019 at 03:11:31PM +0200, Peter Zijlstra wrote:
> On Mon, May 20, 2019 at 04:59:15PM -0400, Waiman Long wrote:
> 
> > +static inline long rwsem_read_trylock(struct rw_semaphore *sem, long *cnt)
> > +{
> > +	long adjustment = -RWSEM_READER_BIAS;
> > +
> > +	*cnt = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
> 
> I'm thinking we'd actually want add_return_acquire() here.
> 
> > +	if (unlikely(*cnt < 0)) {
> > +		atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
> > +		adjustment = 0;
> > +	}
> > +	return adjustment;
> > +}
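
To spell out the fetch_add vs. add_return point: fetch_add returns the
count *before* our bias is added, so the one reader whose increment
actually tips the count negative would still see a clean value, while
add_return returns the count *after*, which is the value we want to test
when guarding against making the count negative. A throwaway userspace
model of that boundary case (GCC builtins standing in for the
atomic_long_*() ops, constants mirroring the 64-bit layout):

#include <stdio.h>

#define RWSEM_READER_BIAS	(1L << 8)

int main(void)
{
	/* count sits one READER_BIAS below the sign bit */
	long count = (long)((1UL << 63) - (1UL << 8));

	long old = __atomic_fetch_add(&count, RWSEM_READER_BIAS,
				      __ATOMIC_ACQUIRE);
	printf("fetch_add  saw %ld (non-negative, overflow missed)\n", old);

	count = (long)((1UL << 63) - (1UL << 8));	/* reset */
	long new = __atomic_add_fetch(&count, RWSEM_READER_BIAS,
				      __ATOMIC_ACQUIRE);
	printf("add_return saw %ld (negative, overflow caught)\n", new);
	return 0;
}
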
> 
> > @@ -1271,9 +1332,10 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
> >   */
> >  inline void __down_read(struct rw_semaphore *sem)
> >  {
> > +	long tmp, adjustment = rwsem_read_trylock(sem, &tmp);
> > +
> > +	if (unlikely(tmp & RWSEM_READ_FAILED_MASK)) {
> > +		rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE, adjustment);
> >  		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
> >  	} else {
> >  		rwsem_set_reader_owned(sem);
> > @@ -1282,9 +1344,11 @@ inline void __down_read(struct rw_semaphore *sem)
> >  
> >  static inline int __down_read_killable(struct rw_semaphore *sem)
> >  {
> > +	long tmp, adjustment = rwsem_read_trylock(sem, &tmp);
> > +
> > +	if (unlikely(tmp & RWSEM_READ_FAILED_MASK)) {
> > +		if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE,
> > +						    adjustment)))
> >  			return -EINTR;
> >  		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
> >  	} else {
> 
> I'm confused by the need for @tmp; isn't that returning the exact same
> state as !adjustment does?

Argh.. READ_FAILED_MASK isn't just the MSB, it also covers the writer,
waiters and handoff bits, so @tmp does carry state that !adjustment
doesn't. Bah, this is confusing.
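
Concretely, a throwaway userspace check (defines copied from the patch
below): the mask test trips on four different conditions, a plain sign
test on only one of them.

#include <stdio.h>

#define RWSEM_WRITER_LOCKED	(1UL << 0)
#define RWSEM_FLAG_WAITERS	(1UL << 1)
#define RWSEM_FLAG_HANDOFF	(1UL << 2)
#define RWSEM_FLAG_READFAIL	(1UL << 63)
#define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_LOCKED|RWSEM_FLAG_WAITERS|\
				 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)

int main(void)
{
	unsigned long samples[] = {
		RWSEM_WRITER_LOCKED,	/* writer holds the lock */
		RWSEM_FLAG_WAITERS,	/* waiters queued        */
		RWSEM_FLAG_HANDOFF,	/* handoff pending       */
		RWSEM_FLAG_READFAIL,	/* reader count overflow */
	};

	for (int i = 0; i < 4; i++)
		printf("count=%#018lx mask-fail=%d sign-fail=%d\n",
		       samples[i],
		       !!(samples[i] & RWSEM_READ_FAILED_MASK),
		       (long)samples[i] < 0);
	return 0;
}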

Maybe something like so?

--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -116,13 +116,28 @@
 #endif
 
 /*
- * The definition of the atomic counter in the semaphore:
+ * On 64-bit architectures, the bit definitions of the count are:
  *
- * Bit  0   - writer locked bit
- * Bit  1   - waiters present bit
- * Bit  2   - lock handoff bit
- * Bits 3-7 - reserved
- * Bits 8-X - 24-bit (32-bit) or 56-bit reader count
+ * Bit  0    - writer locked bit
+ * Bit  1    - waiters present bit
+ * Bit  2    - lock handoff bit
+ * Bits 3-7  - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit  63   - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit  0    - writer locked bit
+ * Bit  1    - waiters present bit
+ * Bit  2    - lock handoff bit
+ * Bits 3-7  - reserved
+ * Bits 8-30 - 23-bit reader count
+ * Bit  31   - read fail bit
+ *
+ * It is not likely that the most significant bit (read fail bit) will ever
+ * be set. This guard bit is still checked anyway in the down_read() fastpath
+ * just in case we need to use up more of the reader bits for other purposes
+ * in the future.
  *
  * atomic_long_fetch_add() is used to obtain reader lock, whereas
  * atomic_long_cmpxchg() will be used to obtain writer lock.
@@ -139,6 +154,7 @@
 #define RWSEM_WRITER_LOCKED	(1UL << 0)
 #define RWSEM_FLAG_WAITERS	(1UL << 1)
 #define RWSEM_FLAG_HANDOFF	(1UL << 2)
+#define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))
 
 #define RWSEM_READER_SHIFT	8
 #define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)
@@ -146,7 +162,7 @@
 #define RWSEM_WRITER_MASK	RWSEM_WRITER_LOCKED
 #define RWSEM_LOCK_MASK		(RWSEM_WRITER_MASK|RWSEM_READER_MASK)
 #define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
-				 RWSEM_FLAG_HANDOFF)
+				 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
 
 /*
  * All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -254,6 +270,14 @@ static inline void rwsem_set_nonspinnabl
 					  owner | RWSEM_NONSPINNABLE));
 }
 
+static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
+{
+	long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+
+	WARN_ON_ONCE(cnt < 0);
+	return !(cnt & RWSEM_READ_FAILED_MASK);
+}
+
 /*
  * Return just the real task structure pointer of the owner
  */
@@ -403,6 +427,12 @@ static void rwsem_mark_wake(struct rw_se
 	}
 
 	/*
+	 * No reader wakeup if there are too many of them already.
+	 */
+	if (unlikely(atomic_long_read(&sem->count) < 0))
+		return;
+
+	/*
 	 * Writers might steal the lock before we grant it to the next reader.
 	 * We prefer to do the first reader grant before counting readers
 	 * so we can bail out early if a writer stole the lock.
@@ -949,9 +979,9 @@ static struct rw_semaphore __sched *
 rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count, adjustment = -RWSEM_READER_BIAS;
-	bool wake = false;
 	struct rwsem_waiter waiter;
 	DEFINE_WAKE_Q(wake_q);
+	bool wake = false;
 
 	/*
 	 * Save the current read-owner of rwsem, if available, and the
@@ -1270,8 +1300,7 @@ static struct rw_semaphore *rwsem_downgr
  */
 inline void __down_read(struct rw_semaphore *sem)
 {
-	if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
-			&sem->count) & RWSEM_READ_FAILED_MASK)) {
+	if (!rwsem_read_trylock(sem)) {
 		rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
 		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	} else {
@@ -1281,9 +1310,8 @@ inline void __down_read(struct rw_semaph
 
 static inline int __down_read_killable(struct rw_semaphore *sem)
 {
-	if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS,
-			&sem->count) & RWSEM_READ_FAILED_MASK)) {
-		if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
+	if (!rwsem_read_trylock(sem)) {
+		if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
 			return -EINTR;
 		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	} else {
@@ -1359,6 +1387,7 @@ inline void __up_read(struct rw_semaphor
 	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
 	rwsem_clear_reader_owned(sem);
 	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
+	DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
 	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
 		      RWSEM_FLAG_WAITERS)) {
 		clear_wr_nonspinnable(sem);
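
And for anyone wanting to poke at the rewritten fastpath, a quick
userspace model (__atomic_add_fetch() standing in for
atomic_long_add_return_acquire(), constants mirroring the 64-bit layout
above):

#include <stdbool.h>
#include <stdio.h>

#define RWSEM_WRITER_LOCKED	(1L << 0)
#define RWSEM_READER_BIAS	(1L << 8)
#define RWSEM_READ_FAILED_MASK	((long)(0x7UL | (1UL << 63)))

static long count;	/* stand-in for sem->count */

/* mirrors rwsem_read_trylock(): true means the fastpath took the lock */
static bool read_trylock_model(void)
{
	long cnt = __atomic_add_fetch(&count, RWSEM_READER_BIAS,
				      __ATOMIC_ACQUIRE);
	return !(cnt & RWSEM_READ_FAILED_MASK);
}

int main(void)
{
	printf("uncontended : %s\n", read_trylock_model() ? "fast" : "slow");

	count = RWSEM_WRITER_LOCKED;	/* writer holds the lock */
	printf("writer held : %s\n", read_trylock_model() ? "fast" : "slow");

	/* reader count saturated: one bias below the fail bit */
	count = (long)((1UL << 63) - (1UL << 8));
	printf("overflowing : %s\n", read_trylock_model() ? "fast" : "slow");
	return 0;
}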
