linux-kernel - Re: [PATCH -rfc 4/4] locking/rtmutex: Support spin on owner (osq)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1432922475.21393.4.camel@stgolabs.net>
Date:	Fri, 29 May 2015 11:01:15 -0700
From:	Davidlohr Bueso <dave@...olabs.net>
To:	Thomas Gleixner <tglx@...utronix.de>
Cc:	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Mike Galbraith <umgwanakikbuti@...il.com>,
	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH -rfc 4/4] locking/rtmutex: Support spin on owner (osq)

On Fri, 2015-05-29 at 08:19 -0700, Davidlohr Bueso wrote:
> Btw, I just realized I had sent a stale patch where the osq was not
> being initialized, fixed below. Thanks!

I am an idiot, patch contained some bogus changes in percpu rwsems...
*sigh* Here's v3, sorry for the noise.

8<---------------------------------------------------------------------
From: Davidlohr Bueso <dave@...olabs.net>
Subject: [PATCH v3 4/4] locking/rtmutex: Support spin on owner (osq)

Similar to what we have in other locks, particularly regular mutexes, the
idea is that as long as the owner is running, there is a fair chance it'll
release the lock soon, and thus a task trying to acquire the rtmutex will
better off spinning instead of blocking immediately after the fastpath.
Conditions to stop spinning and enter the slowpath are simple:

(1) Upon need_resched()
(2) Current lock owner blocks

Spinning tasks performance is further enhanced by queuing in cancelable
mcs (osq). Because rtmutexes track the lock owner atomically, we can
extend the fastpath to continue polling on the lock owner via
cmpxchg(lock->owner, NULL, current).

This is a conservative approach, such that upon entering the slowpath,
we force all lock spinners (including future ones) to serialize on the
wait_lock as mark_rt_mutex_waiters() is called, unconditionally.

CPU0					CPU1
optimistic_spin() (failed)
try_to_take_rt_mutex()
	mark_rt_mutex_waiters()
					optimistic_spin() (failed cmpxchg)
					spin_lock(&lock->wait_lock)

As such we check for RT_MUTEX_HAS_WAITERS bit0 (rt_mutex_has_waiters_fast()).
This allows priority boosting to take precedence over spinning, as otherwise
we could starve a higher priority queued-up task (ie: top waiter) if spinners
constantly steal the lock. Another alternative would be to stop spinning when
we should really wakeup a higher priority waiter, but of course we do not hold
the wait_lock, so that is racy.

Signed-off-by: Davidlohr Bueso <dbueso@...e.de>
---
 include/linux/rtmutex.h  |   4 ++
 kernel/Kconfig.locks     |   4 ++
 kernel/locking/rtmutex.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 1abba5c..b5e85ca 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -15,6 +15,7 @@
 #include <linux/linkage.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock_types.h>
+#include <linux/osq_lock.h>
 
 extern int max_lock_depth; /* for sysctl */
 
@@ -31,6 +32,9 @@ struct rt_mutex {
 	struct rb_root          waiters;
 	struct rb_node          *waiters_leftmost;
 	struct task_struct	*owner;
+#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER
+	struct optimistic_spin_queue osq; /* Spinner MCS lock */
+#endif
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 	int			save_state;
 	const char 		*name, *file;
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index ebdb004..8602871 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -227,6 +227,10 @@ config MUTEX_SPIN_ON_OWNER
 	def_bool y
 	depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
 
+config RT_MUTEX_SPIN_ON_OWNER
+	def_bool y
+	depends on SMP && RT_MUTEXES && !DEBUG_RT_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
 config RWSEM_SPIN_ON_OWNER
        def_bool y
        depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 81aad32..c868bd4 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -63,6 +63,20 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
 			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
 }
 
+/*
+ * Lockless alternative to rt_mutex_has_waiters() as we do not need the
+ * wait_lock to check if we are in, for instance, a transitional state
+ * after calling mark_rt_mutex_waiters().
+ */
+static inline bool rt_mutex_has_waiters_fast(struct rt_mutex *lock)
+{
+	unsigned long val = (unsigned long)lock->owner;
+
+	if (!val)
+		return false;
+	return val & RT_MUTEX_HAS_WAITERS;
+}
+
 static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	if (!rt_mutex_has_waiters(lock))
@@ -1152,6 +1166,121 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	}
 }
 
+#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER
+static noinline
+bool rt_mutex_spin_on_owner(struct rt_mutex *lock, struct task_struct *owner)
+{
+	bool ret = true;
+
+	rcu_read_lock();
+	while (rt_mutex_owner(lock) == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking lock->owner still matches owner. If that fails,
+		 * owner might point to freed memory. If it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		if (!owner->on_cpu || need_resched()) {
+			ret = false;
+			break;
+		}
+
+		cpu_relax_lowlatency();
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline bool rt_mutex_can_spin_on_owner(struct rt_mutex *lock)
+{
+	struct task_struct *owner;
+	/* default return to spin: if no owner, the lock is free */
+	int ret = true;
+
+	if (need_resched())
+		return 0;
+
+	rcu_read_lock();
+	owner = rt_mutex_owner(lock);
+	if (owner)
+		ret = owner->on_cpu;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static bool rt_mutex_optimistic_spin(struct rt_mutex *lock)
+{
+	bool taken = false;
+
+	preempt_disable();
+
+	if (!rt_mutex_can_spin_on_owner(lock))
+		goto done;
+	/*
+	 * In order to avoid a stampede of mutex spinners trying to
+	 * acquire the mutex all at once, the spinners need to take a
+	 * MCS (queued) lock first before spinning on the owner field.
+	 */
+	if (!osq_lock(&lock->osq))
+		goto done;
+
+	while (true) {
+		struct task_struct *owner;
+
+		/*
+		 * When another task is competing for the lock in the
+		 * slowpath (either transitional or not), avoid the
+		 * cmpxchg and immediately block. If the situation is
+		 * later fixed by clearing the waiters bit, future
+		 * tasks that atempt to take the rtmutex can spin.
+		 */
+		if (rt_mutex_has_waiters_fast(lock))
+			goto done_unlock;
+
+		/*
+		 * If there's an owner, wait for it to either
+		 * release the lock or go to sleep.
+		 */
+		owner = rt_mutex_owner(lock);
+		if (owner && !rt_mutex_spin_on_owner(lock, owner))
+			break;
+
+		/* Try to acquire the lock, if it is unlocked. */
+		if (rt_mutex_cmpxchg(lock, NULL, current)) {
+			taken = true;
+			goto done_unlock;
+		}
+
+		/*
+		 * The cpu_relax() call is a compiler barrier which forces
+		 * everything in this loop to be re-loaded. We don't need
+		 * memory barriers as we'll eventually observe the right
+		 * values at the cost of a few extra spins.
+		 */
+		cpu_relax_lowlatency();
+	}
+
+done_unlock:
+	osq_unlock(&lock->osq);
+done:
+	preempt_enable();
+	return taken;
+}
+
+#else
+static bool rt_mutex_optimistic_spin(struct rt_mutex *lock)
+{
+	return false;
+}
+#endif
+
 /*
  * Slow path lock function:
  */
@@ -1163,6 +1292,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	struct rt_mutex_waiter waiter;
 	int ret = 0;
 
+	if (rt_mutex_optimistic_spin(lock))
+		return ret;
+
 	debug_rt_mutex_init_waiter(&waiter);
 	RB_CLEAR_NODE(&waiter.pi_tree_entry);
 	RB_CLEAR_NODE(&waiter.tree_entry);
@@ -1500,7 +1632,9 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name)
 	raw_spin_lock_init(&lock->wait_lock);
 	lock->waiters = RB_ROOT;
 	lock->waiters_leftmost = NULL;
-
+#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER
+	osq_lock_init(&lock->osq);
+#endif
 	debug_rt_mutex_init(lock, name);
 }
 EXPORT_SYMBOL_GPL(__rt_mutex_init);
-- 
2.1.4



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/