Message-ID: <20160714193537.GQ30927@twins.programming.kicks-ass.net>
Date:	Thu, 14 Jul 2016 21:35:37 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Oleg Nesterov <oleg@...hat.com>
Cc:	Tejun Heo <tj@...nel.org>, John Stultz <john.stultz@...aro.org>,
	Ingo Molnar <mingo@...hat.com>,
	lkml <linux-kernel@...r.kernel.org>,
	Dmitry Shmidt <dimitrysh@...gle.com>,
	Rom Lemarchand <romlem@...gle.com>,
	Colin Cross <ccross@...gle.com>, Todd Kjos <tkjos@...gle.com>,
	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>
Subject: Re: Severe performance regression w/ 4.4+ on Android due to cgroup
 locking changes

On Thu, Jul 14, 2016 at 08:36:09PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 14, 2016 at 08:09:52PM +0200, Oleg Nesterov wrote:
> > On 07/14, Peter Zijlstra wrote:
> > >
> > > The below is a compile tested only first draft so far. I'll go give it
> > > some runtime next.
> > 
> > So I will wait for the new version, but at first glance this matches the
> > code I already reviewed in the past (at least, tried hard to review ;)
> > and it looks correct.
> > 
> > Just a couple of almost cosmetic nits, feel free to ignore.
> 
> Drat, I just mailed out the patches... I can do a second version later.

How about the following, on top of what I just sent?


--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -13,11 +13,10 @@ struct percpu_rw_semaphore {
 	unsigned int __percpu	*read_count;
 	struct rw_semaphore	rw_sem;
 	wait_queue_head_t	writer;
-	int			state;
+	int			readers_block;
 };
 
-extern void __percpu_down_read(struct percpu_rw_semaphore *);
-extern int  __percpu_down_read_trylock(struct percpu_rw_semaphore *);
+extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -37,7 +36,7 @@ static inline void percpu_down_read(stru
 	 */
 	__this_cpu_inc(*sem->read_count);
 	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
-		__percpu_down_read(sem); /* Unconditional memory barrier */
+		__percpu_down_read(sem, false); /* Unconditional memory barrier */
 	preempt_enable();
 	/*
 	 * The barrier() from preempt_enable() prevents the compiler from
@@ -55,7 +54,7 @@ static inline int percpu_down_read_trylo
 	 */
 	__this_cpu_inc(*sem->read_count);
 	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
-		ret = __percpu_down_read_trylock(sem);
+		ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
 	preempt_enable();
 	/*
 	 * The barrier() from preempt_enable() prevents the compiler from
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,8 +8,6 @@
 #include <linux/sched.h>
 #include <linux/errno.h>
 
-enum { readers_slow, readers_block };
-
 int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *rwsem_key)
 {
@@ -21,7 +19,7 @@ int __percpu_init_rwsem(struct percpu_rw
 	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 	init_waitqueue_head(&sem->writer);
-	sem->state = readers_slow;
+	sem->readers_block = 0;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -41,21 +39,21 @@ void percpu_free_rwsem(struct percpu_rw_
 }
 EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
-void __percpu_down_read(struct percpu_rw_semaphore *sem)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
 {
 	/*
 	 * Due to having preemption disabled the decrement happens on
 	 * the same CPU as the increment, avoiding the
 	 * increment-on-one-CPU-and-decrement-on-another problem.
 	 *
-	 * And yes, if the reader misses the writer's assignment of
-	 * readers_block to sem->state, then the writer is
-	 * guaranteed to see the reader's increment.  Conversely, any
-	 * readers that increment their sem->read_count after the
-	 * writer looks are guaranteed to see the readers_block value,
+	 * If the reader misses the writer's assignment of readers_block, then
+	 * the writer is guaranteed to see the reader's increment.
+	 *
+	 * Conversely, any readers that increment their sem->read_count after
+	 * the writer looks are guaranteed to see the readers_block value,
 	 * which in turn means that they are guaranteed to immediately
-	 * decrement their sem->read_count, so that it doesn't matter
-	 * that the writer missed them.
+	 * decrement their sem->read_count, so that it doesn't matter that the
+	 * writer missed them.
 	 */
 
 	smp_mb(); /* A matches D */
@@ -64,8 +62,8 @@ void __percpu_down_read(struct percpu_rw
 	 * If !readers_block the critical section starts here, matched by the
 	 * release in percpu_up_write().
 	 */
-	if (likely(smp_load_acquire(&sem->state) != readers_block))
-		return;
+	if (likely(!smp_load_acquire(&sem->readers_block)))
+		return 1;
 
 	/*
 	 * Per the above comment; we still have preemption disabled and
@@ -73,6 +71,9 @@ void __percpu_down_read(struct percpu_rw
 	 */
 	__percpu_up_read(sem);
 
+	if (try)
+		return 0;
+
 	/*
 	 * We either call schedule() in the wait, or we'll fall through
 	 * and reschedule on the preempt_enable() in percpu_down_read().
@@ -87,21 +88,10 @@ void __percpu_down_read(struct percpu_rw
 	__up_read(&sem->rw_sem);
 
 	preempt_disable();
+	return 1;
 }
 EXPORT_SYMBOL_GPL(__percpu_down_read);
 
-int __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
-{
-	smp_mb(); /* A matches D */
-
-	if (likely(smp_load_acquire(&sem->state) != readers_block))
-		return 1;
-
-	__percpu_up_read(sem);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(__percpu_down_read_trylock);
-
 void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	smp_mb(); /* B matches C */
@@ -150,23 +140,23 @@ static bool readers_active_check(struct
 
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
-	down_write(&sem->rw_sem);
-
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
+	down_write(&sem->rw_sem);
+
 	/*
 	 * Notify new readers to block; up until now, and thus throughout the
 	 * longish rcu_sync_enter() above, new readers could still come in.
 	 */
-	WRITE_ONCE(sem->state, readers_block);
+	WRITE_ONCE(sem->readers_block, 1);
 
 	smp_mb(); /* D matches A */
 
 	/*
-	 * If they don't see our writer of readers_block to sem->state,
-	 * then we are guaranteed to see their sem->read_count increment, and
-	 * therefore will wait for them.
+	 * If they don't see our write of readers_block, then we are
+	 * guaranteed to see their sem->read_count increment, and therefore
+	 * will wait for them.
 	 */
 
 	/* Wait for all now active readers to complete. */
@@ -186,7 +176,7 @@ void percpu_up_write(struct percpu_rw_se
 	 * Therefore we force it through the slow path which guarantees an
 	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	smp_store_release(&sem->state, readers_slow);
+	smp_store_release(&sem->readers_block, 0);
 
 	/*
 	 * Release the write lock, this will allow readers back in the game.
@@ -194,9 +184,9 @@ void percpu_up_write(struct percpu_rw_se
 	up_write(&sem->rw_sem);
 
 	/*
-	 * Once this completes (at least one RCU grace period hence) the reader
-	 * fast path will be available again. Safe to use outside the exclusive
-	 * write lock because its counting.
+	 * Once this completes (at least one RCU-sched grace period hence) the
+	 * reader fast path will be available again. Safe to use outside the
+	 * exclusive write lock because it's counting.
 	 */
 	rcu_sync_exit(&sem->rss);
 }
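For readers following the thread, here is a minimal, untested usage sketch of
the percpu_rw_semaphore API the patch above reworks. The cfg_* names and the
cfg_value data are invented purely for illustration; the point is only to show
where the per-cpu reader fast path and the writer's rcu_sync_enter() +
down_write() slow path sit in a caller:

	/*
	 * Illustrative only -- not part of the patch above. Frequent readers
	 * stay on the per-cpu fast path; the rare writer pays for the
	 * rcu_sync transition and the underlying rw_sem.
	 */
	#include <linux/errno.h>
	#include <linux/percpu-rwsem.h>

	static struct percpu_rw_semaphore cfg_rwsem;	/* hypothetical lock */
	static int cfg_value;				/* hypothetical shared data */

	static int cfg_setup(void)
	{
		return percpu_init_rwsem(&cfg_rwsem);
	}

	static void cfg_teardown(void)
	{
		percpu_free_rwsem(&cfg_rwsem);
	}

	static int cfg_read(void)
	{
		int v;

		percpu_down_read(&cfg_rwsem);	/* fast path: __this_cpu_inc() */
		v = cfg_value;
		percpu_up_read(&cfg_rwsem);

		return v;
	}

	static int cfg_read_try(int *v)
	{
		if (!percpu_down_read_trylock(&cfg_rwsem))
			return -EBUSY;	/* a writer has readers blocked */
		*v = cfg_value;
		percpu_up_read(&cfg_rwsem);

		return 0;
	}

	static void cfg_update(int v)
	{
		percpu_down_write(&cfg_rwsem);	/* rcu_sync_enter() + down_write() */
		cfg_value = v;
		percpu_up_write(&cfg_rwsem);
	}

With the patch applied, the slow path of percpu_down_read_trylock() simply
becomes __percpu_down_read(sem, true), which returns 0 instead of sleeping
when a writer has set sem->readers_block.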
