Use the brand spanking new wake_list to delay the futex wakeups until after we've released the hash bucket locks. This prevents the newly woken tasks from immediately getting stuck on the hb lock. This is especially painful on -rt, where the hb lock is preemptible. Cc: Thomas Gleixner Cc: Darren Hart Signed-off-by: Peter Zijlstra --- kernel/futex.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) Index: linux-2.6/kernel/futex.c =================================================================== --- linux-2.6.orig/kernel/futex.c +++ linux-2.6/kernel/futex.c @@ -823,7 +823,7 @@ static void __unqueue_futex(struct futex * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ -static void wake_futex(struct futex_q *q) +static void wake_futex(struct wake_list_head *wake_list, struct futex_q *q) { struct task_struct *p = q->task; @@ -834,7 +834,7 @@ static void wake_futex(struct futex_q *q * struct. Prevent this by holding a reference on p across the * wake up. 
*/ - get_task_struct(p); + wake_list_add(wake_list, p); __unqueue_futex(q); /* @@ -845,9 +845,6 @@ static void wake_futex(struct futex_q *q */ smp_wmb(); q->lock_ptr = NULL; - - wake_up_state(p, TASK_NORMAL); - put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -964,6 +961,7 @@ futex_wake(u32 __user *uaddr, unsigned i struct futex_q *this, *next; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + WAKE_LIST(wake_list); int ret; if (!bitset) @@ -988,7 +986,7 @@ futex_wake(u32 __user *uaddr, unsigned i if (!(this->bitset & bitset)) continue; - wake_futex(this); + wake_futex(&wake_list, this); if (++ret >= nr_wake) break; } @@ -996,6 +994,8 @@ futex_wake(u32 __user *uaddr, unsigned i spin_unlock(&hb->lock); put_futex_key(&key); + + wake_up_list(&wake_list, TASK_NORMAL); out: return ret; } @@ -1012,6 +1012,7 @@ futex_wake_op(u32 __user *uaddr1, unsign struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; + WAKE_LIST(wake_list); int ret, op_ret; retry: @@ -1062,7 +1063,7 @@ futex_wake_op(u32 __user *uaddr1, unsign plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { - wake_futex(this); + wake_futex(&wake_list, this); if (++ret >= nr_wake) break; } @@ -1074,7 +1075,7 @@ futex_wake_op(u32 __user *uaddr1, unsign op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { - wake_futex(this); + wake_futex(&wake_list, this); if (++op_ret >= nr_wake2) break; } @@ -1087,6 +1088,8 @@ futex_wake_op(u32 __user *uaddr1, unsign put_futex_key(&key2); out_put_key1: put_futex_key(&key1); + + wake_up_list(&wake_list, TASK_NORMAL); out: return ret; } @@ -1239,6 +1242,7 @@ static int futex_requeue(u32 __user *uad struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; + WAKE_LIST(wake_list); u32 curval2; if (requeue_pi) { @@ -1384,7 +1388,7 @@ static int futex_requeue(u32 
__user *uad * woken by futex_unlock_pi(). */ if (++task_count <= nr_wake && !requeue_pi) { - wake_futex(this); + wake_futex(&wake_list, this); continue; } @@ -1437,6 +1441,7 @@ static int futex_requeue(u32 __user *uad put_futex_key(&key2); out_put_key1: put_futex_key(&key1); + wake_up_list(&wake_list, TASK_NORMAL); out: if (pi_state != NULL) free_pi_state(pi_state); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/