linux-kernel - [PATCH 1/3] futex: avoid double wake up in PI futex wait / wake on -RT

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1428419030-20030-2-git-send-email-bigeasy@linutronix.de>
Date:	Tue,  7 Apr 2015 17:03:48 +0200
From:	Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To:	linux-kernel@...r.kernel.org
Cc:	Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Darren Hart <darren@...art.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	fredrik.markstrom@...driver.com,
	Davidlohr Bueso <davidlohr@...com>,
	Manfred Spraul <manfred@...orfullife.com>,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH 1/3] futex: avoid double wake up in PI futex wait / wake on -RT

From: Thomas Gleixner <tglx@...utronix.de>

The boosted priority is reverted after the unlock but before the
futex_hash_bucket (hb) has been accessed. The result is that we boost the
task, deboost the task, boost again for the hb lock, deboost again.
A sched trace of this scenario looks the following:

| med_prio-93  sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| med_prio-93  sched_switch: prev_comm=med_prio prev_pid=93 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=92 next_prio=9
|high_prio-92  sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9
|high_prio-92  sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=91 next_prio=9
| low_prio-91  sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| low_prio-91  sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120
| low_prio-91  sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9
|high_prio-92  sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9
|high_prio-92  sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=D ==> next_comm=low_prio next_pid=91 next_prio=9
| low_prio-91  sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| low_prio-91  sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120
| low_prio-91  sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9

We see four sched_pi_setprio() invocation but ideally two would be enough.
The patch tries to avoid the double wakeup by a wake up once the hb lock is
released. The same test case:

| med_prio-21  sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000
| med_prio-21  sched_switch: prev_comm=med_prio prev_pid=21 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=20 next_prio=9
|high_prio-20  sched_pi_setprio: comm=low_prio pid=19 oldprio=120 newprio=9
|high_prio-20  sched_switch: prev_comm=high_prio prev_pid=20 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=19 next_prio=9
| low_prio-19  sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000
| low_prio-19  sched_pi_setprio: comm=low_prio pid=19 oldprio=9 newprio=120
| low_prio-19  sched_switch: prev_comm=low_prio prev_pid=19 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=20 next_prio=9

only two sched_pi_setprio() invocations as one would expect and see
without -RT.

Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
 kernel/futex.c                  | 32 +++++++++++++++++++++++++++++---
 kernel/locking/rtmutex.c        | 39 ++++++++++++++++++++++++++++-----------
 kernel/locking/rtmutex_common.h |  4 ++++
 3 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..b38abe3573a8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1122,11 +1122,13 @@ static void wake_futex(struct futex_q *q)
 	put_task_struct(p);
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+			 struct futex_hash_bucket *hb)
 {
 	struct task_struct *new_owner;
 	struct futex_pi_state *pi_state = this->pi_state;
 	u32 uninitialized_var(curval), newval;
+	bool deboost;
 	int ret = 0;
 
 	if (!pi_state)
@@ -1178,7 +1180,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	raw_spin_unlock_irq(&new_owner->pi_lock);
 
 	raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex);
+
+	/*
+	 * We deboost after dropping hb->lock. That prevents a double
+	 * wakeup on RT.
+	 */
+	spin_unlock(&hb->lock);
+
+	if (deboost)
+		rt_mutex_adjust_prio(current);
 
 	return 0;
 }
@@ -2412,13 +2424,26 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	 */
 	match = futex_top_waiter(hb, &key);
 	if (match) {
-		ret = wake_futex_pi(uaddr, uval, match);
+		ret = wake_futex_pi(uaddr, uval, match, hb);
+
+		/*
+		 * In case of success wake_futex_pi dropped the hash
+		 * bucket lock.
+		 */
+		if (!ret)
+			goto out_putkey;
+
 		/*
 		 * The atomic access to the futex value generated a
 		 * pagefault, so retry the user-access and the wakeup:
 		 */
 		if (ret == -EFAULT)
 			goto pi_faulted;
+
+		/*
+		 * wake_futex_pi has detected invalid state. Tell user
+		 * space.
+		 */
 		goto out_unlock;
 	}
 
@@ -2439,6 +2464,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 
 out_unlock:
 	spin_unlock(&hb->lock);
+out_putkey:
 	put_futex_key(&key);
 	return ret;
 
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b73279367087..9d858106ba0c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -298,7 +298,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
  * of task. We do not use the spin_xx_mutex() variants here as we are
  * outside of the debug path.)
  */
-static void rt_mutex_adjust_prio(struct task_struct *task)
+void rt_mutex_adjust_prio(struct task_struct *task)
 {
 	unsigned long flags;
 
@@ -955,8 +955,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 /*
  * Wake up the next waiter on the lock.
  *
- * Remove the top waiter from the current tasks pi waiter list and
- * wake it up.
+ * Remove the top waiter from the current tasks pi waiter list take a
+ * reference and return a pointer to it.
  *
  * Called with lock->wait_lock held.
  */
@@ -1253,7 +1253,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 /*
  * Slow path to release a rt-mutex:
  */
-static void __sched
+static bool __sched
 rt_mutex_slowunlock(struct rt_mutex *lock)
 {
 	raw_spin_lock(&lock->wait_lock);
@@ -1296,7 +1296,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
 	while (!rt_mutex_has_waiters(lock)) {
 		/* Drops lock->wait_lock ! */
 		if (unlock_rt_mutex_safe(lock) == true)
-			return;
+			return false;
 		/* Relock the rtmutex and try again */
 		raw_spin_lock(&lock->wait_lock);
 	}
@@ -1309,8 +1309,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
 
 	raw_spin_unlock(&lock->wait_lock);
 
-	/* Undo pi boosting if necessary: */
-	rt_mutex_adjust_prio(current);
+	return true;
 }
 
 /*
@@ -1361,12 +1360,14 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
 
 static inline void
 rt_mutex_fastunlock(struct rt_mutex *lock,
-		    void (*slowfn)(struct rt_mutex *lock))
+		    bool (*slowfn)(struct rt_mutex *lock))
 {
-	if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
 		rt_mutex_deadlock_account_unlock(current);
-	else
-		slowfn(lock);
+	} else if (slowfn(lock)) {
+		/* Undo pi boosting if necessary: */
+		rt_mutex_adjust_prio(current);
+	}
 }
 
 /**
@@ -1461,6 +1462,22 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 
 /**
+ * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+ * @lock: the rt_mutex to be unlocked
+ *
+ * Returns: true/false indicating whether priority adjustment is
+ * required or not.
+ */
+bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+		rt_mutex_deadlock_account_unlock(current);
+		return false;
+	}
+	return rt_mutex_slowunlock(lock);
+}
+
+/**
  * rt_mutex_destroy - mark a mutex unusable
  * @lock: the mutex to be destroyed
  *
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 855212501407..1bcf43099b16 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -132,6 +132,10 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 				      struct rt_mutex_waiter *waiter);
 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
 
+extern bool rt_mutex_futex_unlock(struct rt_mutex *lock);
+
+extern void rt_mutex_adjust_prio(struct task_struct *task);
+
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
 #else
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/