With the introduction of SCHED_DEADLINE the whole notion that priority is a single number is gone, therefore the @prio argument to rt_mutex_setprio() doesn't make sense anymore. So rework the code to pass a pi_task instead. Note this also fixes a problem with pi_top_task caching; previously we would not set the pointer (call rt_mutex_update_top_task) if the priority didn't change, this could lead to a stale pointer. As for the XXX, I think its fine to use pi_task->prio, because if it differs from waiter->prio, a PI chain update is immenent. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched/rt.h | 24 +++------- kernel/locking/rtmutex.c | 112 ++++++++++++----------------------------------- kernel/sched/core.c | 66 ++++++++++++++++++++++----- 3 files changed, 91 insertions(+), 111 deletions(-) --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -18,28 +18,20 @@ static inline int rt_task(struct task_st } #ifdef CONFIG_RT_MUTEXES -extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio); -extern void rt_mutex_update_top_task(struct task_struct *p); -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); +/* + * Must hold either p->pi_lock or task_rq(p)->lock. + */ +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) +{ + return p->pi_top_task; +} +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return tsk->pi_blocked_on != NULL; } #else -static inline int rt_mutex_getprio(struct task_struct *p) -{ - return p->normal_prio; -} - -static inline int rt_mutex_get_effective_prio(struct task_struct *task, - int newprio) -{ - return newprio; -} - static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -322,67 +322,16 @@ rt_mutex_dequeue_pi(struct task_struct * RB_CLEAR_NODE(&waiter->pi_tree_entry); } -/* - * Must hold both p->pi_lock and task_rq(p)->lock. - */ -void rt_mutex_update_top_task(struct task_struct *p) -{ - if (!task_has_pi_waiters(p)) { - p->pi_top_task = NULL; - return; - } - - p->pi_top_task = task_top_pi_waiter(p)->task; -} - -/* - * Calculate task priority from the waiter tree priority - * - * Return task->normal_prio when the waiter tree is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->prio, - task->normal_prio); -} - -/* - * Must hold either p->pi_lock or task_rq(p)->lock. - */ -struct task_struct *rt_mutex_get_top_task(struct task_struct *task) -{ - return task->pi_top_task; -} - -/* - * Called by sched_setscheduler() to get the priority which will be - * effective after the change. - */ -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) +static void rt_mutex_adjust_prio(struct task_struct *p) { - struct task_struct *top_task = rt_mutex_get_top_task(task); + struct task_struct *pi_task = NULL; - if (!top_task) - return newprio; + lockdep_assert_held(&p->pi_lock); - return min(top_task->prio, newprio); -} + if (task_has_pi_waiters(p)) + pi_task = task_top_pi_waiter(p)->task; -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio || dl_prio(prio)) - rt_mutex_setprio(task, prio); + rt_mutex_setprio(p, pi_task); } /* @@ -742,7 +691,7 @@ static int rt_mutex_adjust_prio_chain(st */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else if (prerequeue_top_waiter == waiter) { /* @@ -758,7 +707,7 @@ static int rt_mutex_adjust_prio_chain(st rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else { /* * Nothing changed. No need to do any priority @@ -966,7 +915,7 @@ static int task_blocks_on_rt_mutex(struc return -EDEADLK; raw_spin_lock(&task->pi_lock); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; @@ -988,7 +937,7 @@ static int task_blocks_on_rt_mutex(struc rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1040,13 +989,14 @@ static void mark_wakeup_next_waiter(stru waiter = rt_mutex_top_waiter(lock); /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. + * Remove it from current->pi_waiters and deboost. + * + * We must in fact deboost here in order to ensure we call + * rt_mutex_setprio() to update p->pi_top_task before the + * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); - __rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(current); /* * As we are waking up the top waiter, and the waiter stays @@ -1058,9 +1008,19 @@ static void mark_wakeup_next_waiter(stru */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock(¤t->pi_lock); - + /* + * We deboosted before waking the top waiter task such that we don't + * run two tasks with the 'same' priority (and ensure the + * p->pi_top_task pointer points to a blocked task). This however can + * lead to priority inversion if we would get preempted after the + * deboost but before waking our donor task, hence the preempt_disable() + * before unlock. + * + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); } /* @@ -1095,7 +1055,7 @@ static void remove_waiter(struct rt_mute if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); @@ -1134,8 +1094,7 @@ void rt_mutex_adjust_pi(struct task_stru raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || (waiter->prio == task->prio && - !dl_prio(task->prio))) { + if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -1389,17 +1348,6 @@ static bool __sched rt_mutex_slowunlock( * Queue the next waiter for wakeup once we release the wait_lock. */ mark_wakeup_next_waiter(wake_q, lock); - - /* - * We should deboost before waking the top waiter task such that - * we don't run two tasks with the 'same' priority. This however - * can lead to prio-inversion if we would get preempted after - * the deboost but before waking our high-prio task, hence the - * preempt_disable before unlock. Pairs with preempt_enable() in - * rt_mutex_postunlock(); - */ - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; /* call rt_mutex_postunlock() */ --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3674,10 +3674,25 @@ EXPORT_SYMBOL(default_wake_function); #ifdef CONFIG_RT_MUTEXES +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + /* * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) + * @p: task to boost + * @pi_task: donor task * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). @@ -3685,18 +3700,42 @@ EXPORT_SYMBOL(default_wake_function); * Used by the rt_mutex code to implement priority inheritance * logic. Call site only calls if the priority of the task changed. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int oldprio, queued, running, queue_flag = + int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; - BUG_ON(prio > MAX_PRIO); + /* XXX used to be waiter->prio, not waiter->task->prio */ + prio = __rt_effective_prio(pi_task, p->normal_prio); + + /* + * If nothing changed; bail early. + */ + if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) + return; rq = __task_rq_lock(p, &rf); update_rq_clock(rq); + /* + * Set under pi_lock && rq->lock, such that the value can be used under + * either lock. + * + * Note that there is loads of tricky to make this pointer cache work + * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to + * ensure a task is de-boosted (pi_task is set to NULL) before the + * task is allowed to run again (and can exit). This ensures the pointer + * points to a blocked task -- which guaratees the task is present. + */ + p->pi_top_task = pi_task; + + /* + * For FIFO/RR we only need to set prio, if that matches we're done. + */ + if (prio == p->prio && !dl_prio(prio)) + goto out_unlock; /* * Idle task boosting is a nono in general. There is one @@ -3716,9 +3755,7 @@ void rt_mutex_setprio(struct task_struct goto out_unlock; } - rt_mutex_update_top_task(p); - - trace_sched_pi_setprio(p, prio); + trace_sched_pi_setprio(p, prio); /* broken */ oldprio = p->prio; if (oldprio == prio) @@ -3742,7 +3779,6 @@ void rt_mutex_setprio(struct task_struct * running task */ if (dl_prio(prio)) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; @@ -3780,6 +3816,11 @@ void rt_mutex_setprio(struct task_struct balance_callback(rq); preempt_enable(); } +#else +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} #endif void set_user_nice(struct task_struct *p, long nice) @@ -4026,10 +4067,9 @@ static void __setscheduler(struct rq *rq * Keep a potential priority boosting if called from * sched_setscheduler(). */ + p->prio = normal_prio(p); if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); - else - p->prio = normal_prio(p); + p->prio = rt_effective_prio(p, p->prio); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -4316,7 +4356,7 @@ static int __sched_setscheduler(struct t * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + new_effective_prio = rt_effective_prio(p, newprio); if (new_effective_prio == oldprio) queue_flags &= ~DEQUEUE_MOVE; }