When the plist was added to futexes it brought with it the overhead of
sorting the waiters by priority. If the waiters can end up mis-ordered,
that overhead buys us nothing, from my perspective. Since we don't
re-order a task's waiter entry when its priority is changed after it has
gone to sleep, the list can become mis-ordered and tasks aren't woken in
priority order. This patch corrects that, so tasks are always woken in
priority order.

Signed-off-by: Daniel Walker

---
 include/linux/sched.h |    2 ++
 kernel/futex.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched.c        |    1 +
 3 files changed, 48 insertions(+)

Index: linux-2.6.25/include/linux/sched.h
===================================================================
--- linux-2.6.25.orig/include/linux/sched.h
+++ linux-2.6.25/include/linux/sched.h
@@ -1664,6 +1664,8 @@ static inline int rt_mutex_getprio(struc
 # define rt_mutex_adjust_pi(p)		do { } while (0)
 #endif
 
+extern void futex_adjust_waiters(struct task_struct *p);
+
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
 extern int task_nice(const struct task_struct *p);
Index: linux-2.6.25/kernel/futex.c
===================================================================
--- linux-2.6.25.orig/kernel/futex.c
+++ linux-2.6.25/kernel/futex.c
@@ -326,6 +326,42 @@ static int get_futex_value_locked(u32 *d
 	return ret ? -EFAULT : 0;
 }
 
+void futex_adjust_waiters(struct task_struct *p)
+{
+	struct futex_hash_bucket *hb;
+	union futex_key key;
+	struct futex_q *q;
+	int prio;
+
+	spin_lock(&p->pi_lock);
+	if (p->blocked_on && p->blocked_on->lock_type == FUTEX_WAITER) {
+		q = p->blocked_on->futex_blocked_on;
+retry:
+		key = q->key;
+
+		hb = hash_futex(&key);
+		spin_lock(&hb->lock);
+
+		/*
+		 * Check if the waiter got requeued to a new page, if so retry.
+		 */
+		if (unlikely(!match_futex(&q->key, &key))) {
+			spin_unlock(&hb->lock);
+			goto retry;
+		}
+
+		if (likely(!plist_node_empty(&q->list))) {
+			plist_del(&q->list, &hb->chain);
+			prio = min(p->normal_prio, MAX_RT_PRIO);
+			plist_node_init(&q->list, prio);
+			plist_add(&q->list, &hb->chain);
+		}
+		spin_unlock(&hb->lock);
+	}
+	spin_unlock(&p->pi_lock);
+}
+
+
 /*
  * Fault handling.
  * if fshared is non NULL, current->mm->mmap_sem is already held
@@ -1155,6 +1191,8 @@ static int futex_wait(u32 __user *uaddr,
 	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
+	struct lock_waiter_state blocked_on =
+		{ .lock_type = FUTEX_WAITER, { .futex_blocked_on = &q} };
 	u32 uval;
 	int ret;
 	struct hrtimer_sleeper t;
@@ -1215,6 +1253,10 @@ static int futex_wait(u32 __user *uaddr,
 	if (uval != val)
 		goto out_unlock_release_sem;
 
+	spin_lock(&current->pi_lock);
+	current->blocked_on = &blocked_on;
+	spin_unlock(&current->pi_lock);
+
 	/* Only actually queue if *uaddr contained val. */
 	queue_me(&q, hb);
 
@@ -1272,6 +1314,9 @@ static int futex_wait(u32 __user *uaddr,
 	}
 	__set_current_state(TASK_RUNNING);
 
+	spin_lock(&current->pi_lock);
+	current->blocked_on = NULL;
+	spin_unlock(&current->pi_lock);
 	/*
	 * NOTE: we don't remove ourselves from the waitqueue because
	 * we are the only user of it.
Index: linux-2.6.25/kernel/sched.c
===================================================================
--- linux-2.6.25.orig/kernel/sched.c
+++ linux-2.6.25/kernel/sched.c
@@ -5209,6 +5209,7 @@ recheck:
 	spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	rt_mutex_adjust_pi(p);
+	futex_adjust_waiters(p);
 
 	return 0;
 }
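
For anyone who wants to see the mis-ordering from user space, below is a
rough test program (not part of the patch, just a sketch of how I would
exercise it): two SCHED_FIFO threads block on the same futex word at
priorities 10 and 20, the lower-priority waiter is raised to 30 after it
is already asleep, and a single FUTEX_WAKE should then wake it first on a
kernel with this change. The thread names and priorities are made up for
the example; it needs root (or CAP_SYS_NICE) for the real-time priorities
and builds with "gcc -o futex-wake-order futex-wake-order.c -lpthread".

/* futex-wake-order.c -- illustrative only, names and priorities made up. */
#define _GNU_SOURCE
#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/futex.h>
#include <sys/syscall.h>

static volatile int futex_word;		/* 0 = keep waiting, 1 = released */

static long sys_futex(volatile int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

struct waiter_arg {
	const char *name;
	int prio;			/* SCHED_FIFO priority to wait at */
};

static void *waiter(void *arg)
{
	struct waiter_arg *w = arg;
	struct sched_param sp = { .sched_priority = w->prio };

	/* Become SCHED_FIFO *before* blocking, so we are queued at this prio. */
	pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);

	while (futex_word == 0) {
		if (sys_futex(&futex_word, FUTEX_WAIT, 0) == -1 && errno != EAGAIN)
			break;
	}
	printf("%s woke up\n", w->name);
	return NULL;
}

int main(void)
{
	struct waiter_arg wa = { "A (queued at prio 10, later raised to 30)", 10 };
	struct waiter_arg wb = { "B (queued at prio 20)", 20 };
	struct sched_param sp = { .sched_priority = 30 };
	pthread_t a, b;

	pthread_create(&a, NULL, waiter, &wa);
	pthread_create(&b, NULL, waiter, &wb);

	sleep(1);	/* crude: give both waiters time to reach FUTEX_WAIT */

	/* Raise A above B *after* A is already asleep on the futex. */
	pthread_setschedparam(a, SCHED_FIFO, &sp);

	/*
	 * Wake exactly one waiter.  With the plist re-ordering, A should
	 * print first; the second wake just releases the other thread.
	 */
	futex_word = 1;
	sys_futex(&futex_word, FUTEX_WAKE, 1);
	sleep(1);
	sys_futex(&futex_word, FUTEX_WAKE, INT_MAX);

	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

On a kernel without the re-ordering, the single FUTEX_WAKE should instead
pick B, since each plist entry keeps the priority it had when it was
queued in futex_wait().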