Message-Id: <1428419030-20030-2-git-send-email-bigeasy@linutronix.de>
Date: Tue, 7 Apr 2015 17:03:48 +0200
From: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
To: linux-kernel@...r.kernel.org
Cc: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Darren Hart <darren@...art.com>,
Steven Rostedt <rostedt@...dmis.org>,
fredrik.markstrom@...driver.com,
Davidlohr Bueso <davidlohr@...com>,
Manfred Spraul <manfred@...orfullife.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH 1/3] futex: avoid double wake up in PI futex wait / wake on -RT
From: Thomas Gleixner <tglx@...utronix.de>
The boosted priority is reverted right after the rt_mutex unlock, but before the futex_hash_bucket lock (hb->lock) has been dropped. On -RT the woken waiter immediately blocks on hb->lock, which is still held by the just-deboosted task. The result is that we boost the task, deboost it, boost it again for the hb lock and deboost it once more.
A sched trace of this scenario looks like this:
| med_prio-93 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| med_prio-93 sched_switch: prev_comm=med_prio prev_pid=93 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=92 next_prio=9
|high_prio-92 sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9
|high_prio-92 sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=91 next_prio=9
| low_prio-91 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| low_prio-91 sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120
| low_prio-91 sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9
|high_prio-92 sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9
|high_prio-92 sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=D ==> next_comm=low_prio next_pid=91 next_prio=9
| low_prio-91 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000
| low_prio-91 sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120
| low_prio-91 sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9
We see four sched_pi_setprio() invocations, but ideally two would be enough.
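To illustrate, the ordering before this patch is roughly the following (a simplified sketch of the futex_unlock_pi() / wake_futex_pi() path, not the exact code):
|	spin_lock(&hb->lock);
|	...
|	rt_mutex_unlock(&pi_state->pi_mutex);
|	/* wakes the top waiter and immediately deboosts current */
|	/* on -RT the woken waiter now blocks on hb->lock and boosts us again */
|	spin_unlock(&hb->lock);
|	/* the waiter is woken a second time and current is deboosted again */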
The patch avoids the double wakeup by deferring the deboost until after the hb lock has been released. The same test case with the patch applied:
| med_prio-21 sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000
| med_prio-21 sched_switch: prev_comm=med_prio prev_pid=21 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=20 next_prio=9
|high_prio-20 sched_pi_setprio: comm=low_prio pid=19 oldprio=120 newprio=9
|high_prio-20 sched_switch: prev_comm=high_prio prev_pid=20 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=19 next_prio=9
| low_prio-19 sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000
| low_prio-19 sched_pi_setprio: comm=low_prio pid=19 oldprio=9 newprio=120
| low_prio-19 sched_switch: prev_comm=low_prio prev_pid=19 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=20 next_prio=9
Now there are only two sched_pi_setprio() invocations, as one would expect and as seen without -RT.
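With the patch the unlock ordering becomes roughly the following (again a simplified sketch, mirroring the diff below):
|	spin_lock(&hb->lock);
|	...
|	deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex);
|	/* wakes the top waiter but does not deboost yet */
|	spin_unlock(&hb->lock);
|	if (deboost)
|		rt_mutex_adjust_prio(current);
|	/* deboost only after hb->lock has been dropped */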
Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@...utronix.de>
---
kernel/futex.c | 32 +++++++++++++++++++++++++++++---
kernel/locking/rtmutex.c | 39 ++++++++++++++++++++++++++++-----------
kernel/locking/rtmutex_common.h | 4 ++++
3 files changed, 61 insertions(+), 14 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..b38abe3573a8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1122,11 +1122,13 @@ static void wake_futex(struct futex_q *q)
put_task_struct(p);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+ struct futex_hash_bucket *hb)
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ bool deboost;
int ret = 0;
if (!pi_state)
@@ -1178,7 +1180,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
raw_spin_unlock_irq(&new_owner->pi_lock);
raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
+
+ deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex);
+
+ /*
+ * We deboost after dropping hb->lock. That prevents a double
+ * wakeup on RT.
+ */
+ spin_unlock(&hb->lock);
+
+ if (deboost)
+ rt_mutex_adjust_prio(current);
return 0;
}
@@ -2412,13 +2424,26 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
*/
match = futex_top_waiter(hb, &key);
if (match) {
- ret = wake_futex_pi(uaddr, uval, match);
+ ret = wake_futex_pi(uaddr, uval, match, hb);
+
+ /*
+ * In case of success wake_futex_pi dropped the hash
+ * bucket lock.
+ */
+ if (!ret)
+ goto out_putkey;
+
/*
* The atomic access to the futex value generated a
* pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
+
+ /*
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
goto out_unlock;
}
@@ -2439,6 +2464,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
out_unlock:
spin_unlock(&hb->lock);
+out_putkey:
put_futex_key(&key);
return ret;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b73279367087..9d858106ba0c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -298,7 +298,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
* of task. We do not use the spin_xx_mutex() variants here as we are
* outside of the debug path.)
*/
-static void rt_mutex_adjust_prio(struct task_struct *task)
+void rt_mutex_adjust_prio(struct task_struct *task)
{
unsigned long flags;
@@ -955,8 +955,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/*
* Wake up the next waiter on the lock.
*
- * Remove the top waiter from the current tasks pi waiter list and
- * wake it up.
+ * Remove the top waiter from the current tasks pi waiter list, take a
+ * reference and return a pointer to it.
*
* Called with lock->wait_lock held.
*/
@@ -1253,7 +1253,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
/*
* Slow path to release a rt-mutex:
*/
-static void __sched
+static bool __sched
rt_mutex_slowunlock(struct rt_mutex *lock)
{
raw_spin_lock(&lock->wait_lock);
@@ -1296,7 +1296,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
if (unlock_rt_mutex_safe(lock) == true)
- return;
+ return false;
/* Relock the rtmutex and try again */
raw_spin_lock(&lock->wait_lock);
}
@@ -1309,8 +1309,7 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
raw_spin_unlock(&lock->wait_lock);
- /* Undo pi boosting if necessary: */
- rt_mutex_adjust_prio(current);
+ return true;
}
/*
@@ -1361,12 +1360,14 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
- void (*slowfn)(struct rt_mutex *lock))
+ bool (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
- else
- slowfn(lock);
+ } else if (slowfn(lock)) {
+ /* Undo pi boosting if necessary: */
+ rt_mutex_adjust_prio(current);
+ }
}
/**
@@ -1461,6 +1462,22 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
+ * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+ * @lock: the rt_mutex to be unlocked
+ *
+ * Returns: true/false indicating whether priority adjustment is
+ * required or not.
+ */
+bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ rt_mutex_deadlock_account_unlock(current);
+ return false;
+ }
+ return rt_mutex_slowunlock(lock);
+}
+
+/**
* rt_mutex_destroy - mark a mutex unusable
* @lock: the mutex to be destroyed
*
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 855212501407..1bcf43099b16 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -132,6 +132,10 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+extern bool rt_mutex_futex_unlock(struct rt_mutex *lock);
+
+extern void rt_mutex_adjust_prio(struct task_struct *task);
+
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
#else
--
2.1.4