[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <57037974.1020002@redhat.com>
Date: Tue, 5 Apr 2016 16:38:12 +0800
From: Xunlei Pang <xpang@...hat.com>
To: Peter Zijlstra <peterz@...radead.org>, xlpang@...hat.com
Cc: linux-kernel@...r.kernel.org, Juri Lelli <juri.lelli@....com>,
Ingo Molnar <mingo@...hat.com>,
Steven Rostedt <rostedt@...dmis.org>,
Thomas Gleixner <tglx@...utronix.de>
Subject: Re: [PATCH] sched/deadline/rtmutex: Fix a PI crash for deadline tasks
On 2016/04/02 at 05:51, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 09:34:24PM +0800, Xunlei Pang wrote:
>
>>>> I checked the code, currently only deadline accesses the
>>>> pi_waiters/pi_waiters_leftmost
>>>> without pi_lock held via rt_mutex_get_top_task(), other cases all have
>>>> pi_lock held.
>> Any better ideas is welcome.
> Something like the below _might_ work; but its late and I haven't looked
> at the PI code in a while. This basically caches a pointer to the top
> waiter task in the running task_struct, under pi_lock and rq->lock, and
> therefore we can use it with only rq->lock held.
>
> Since the task is blocked, and cannot unblock without taking itself from
> the block chain -- which would cause rt_mutex_setprio() to set another
> top waiter task, the lifetime rules should be good.
In rt_mutex_slowunlock(), we release pi_lock and and wait_lock first, then
wake up the top waiter, then call rt_mutex_adjust_prio(), so there is a small
window without any lock or irq disabled between the top waiter wake up
and rt_mutex_adjust_prio(), which can cause problems.
For example, before calling rt_mutex_adjust_prio() to adjust the cached pointer,
if current is preempted and the waken top waiter exited, after that, the task is
back, and it may enter enqueue_task_dl() before entering rt_mutex_adjust_prio(),
where the cached pointer is updated, so it will access a stale cached pointer.
Regards,
Xunlei
> Having this top waiter pointer around might also be useful to further
> implement bandwidth inheritance or such, but I've not thought about that
> too much.
>
> Lots of code deleted because we move the entire task->prio mapping muck
> into the scheduler, because we now pass a pi_task, not a prio.
>
> ---
> include/linux/sched.h | 1 +
> include/linux/sched/rt.h | 20 +-------------------
> kernel/locking/rtmutex.c | 45 +++++----------------------------------------
> kernel/sched/core.c | 34 +++++++++++++++++++++++-----------
> kernel/sched/deadline.c | 2 +-
> 5 files changed, 31 insertions(+), 71 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 52c4847b05e2..30169f38cb24 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1615,6 +1615,7 @@ struct task_struct {
>
> /* Protection of the PI data structures: */
> raw_spinlock_t pi_lock;
> + struct task_struct *pi_task;
>
> struct wake_q_node wake_q;
>
> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
> index a30b172df6e1..0a8f071b16e3 100644
> --- a/include/linux/sched/rt.h
> +++ b/include/linux/sched/rt.h
> @@ -16,31 +16,13 @@ static inline int rt_task(struct task_struct *p)
> }
>
> #ifdef CONFIG_RT_MUTEXES
> -extern int rt_mutex_getprio(struct task_struct *p);
> -extern void rt_mutex_setprio(struct task_struct *p, int prio);
> -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
> -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
> +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
> extern void rt_mutex_adjust_pi(struct task_struct *p);
> static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
> {
> return tsk->pi_blocked_on != NULL;
> }
> #else
> -static inline int rt_mutex_getprio(struct task_struct *p)
> -{
> - return p->normal_prio;
> -}
> -
> -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
> - int newprio)
> -{
> - return newprio;
> -}
> -
> -static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> - return NULL;
> -}
> # define rt_mutex_adjust_pi(p) do { } while (0)
> static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
> {
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index 3e746607abe5..13b6b5922d3c 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
> }
>
> /*
> - * Calculate task priority from the waiter tree priority
> - *
> - * Return task->normal_prio when the waiter tree is empty or when
> - * the waiter is not allowed to do priority boosting
> - */
> -int rt_mutex_getprio(struct task_struct *task)
> -{
> - if (likely(!task_has_pi_waiters(task)))
> - return task->normal_prio;
> -
> - return min(task_top_pi_waiter(task)->prio,
> - task->normal_prio);
> -}
> -
> -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
> -{
> - if (likely(!task_has_pi_waiters(task)))
> - return NULL;
> -
> - return task_top_pi_waiter(task)->task;
> -}
> -
> -/*
> - * Called by sched_setscheduler() to get the priority which will be
> - * effective after the change.
> - */
> -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
> -{
> - if (!task_has_pi_waiters(task))
> - return newprio;
> -
> - if (task_top_pi_waiter(task)->task->prio <= newprio)
> - return task_top_pi_waiter(task)->task->prio;
> - return newprio;
> -}
> -
> -/*
> * Adjust the priority of a task, after its pi_waiters got modified.
> *
> * This can be both boosting and unboosting. task->pi_lock must be held.
> */
> static void __rt_mutex_adjust_prio(struct task_struct *task)
> {
> - int prio = rt_mutex_getprio(task);
> + struct task_struct *pi_task = task;
> +
> + if (unlikely(task_has_pi_waiters(task)))
> + pi_task = task_top_pi_waiter(task)->task;
>
> - if (task->prio != prio || dl_prio(prio))
> - rt_mutex_setprio(task, prio);
> + rt_mutex_setprio(task, pi_task);
> }
>
> /*
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8b489fcac37b..7d7e3a0eaeb0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3392,7 +3392,7 @@ EXPORT_SYMBOL(default_wake_function);
> /*
> * rt_mutex_setprio - set the current priority of a task
> * @p: task
> - * @prio: prio value (kernel-internal form)
> + * @pi_task: top waiter, donating state
> *
> * This function changes the 'effective' priority of a task. It does
> * not touch ->normal_prio like __setscheduler().
> @@ -3400,13 +3400,20 @@ EXPORT_SYMBOL(default_wake_function);
> * Used by the rt_mutex code to implement priority inheritance
> * logic. Call site only calls if the priority of the task changed.
> */
> -void rt_mutex_setprio(struct task_struct *p, int prio)
> +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
> {
> - int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
> - struct rq *rq;
> + int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
> const struct sched_class *prev_class;
> + struct rq *rq;
>
> - BUG_ON(prio > MAX_PRIO);
> + /*
> + * For FIFO/RR we simply donate prio; for DL things are
> + * more interesting.
> + */
> + /* XXX used to be waiter->prio, not waiter->task->prio */
> + prio = min(pi_task->prio, p->normal_prio);
> + if (p->prio == prio && !dl_prio(prio))
> + return;
>
> rq = __task_rq_lock(p);
>
> @@ -3442,6 +3449,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
> if (running)
> put_prev_task(rq, p);
>
> + if (pi_task == p)
> + pi_task = NULL;
> + p->pi_task = pi_task;
> +
> /*
> * Boosting condition are:
> * 1. -rt task is running and holds mutex A
> @@ -3452,7 +3463,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
> * running task
> */
> if (dl_prio(prio)) {
> - struct task_struct *pi_task = rt_mutex_get_top_task(p);
> if (!dl_prio(p->normal_prio) ||
> (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
> p->dl.dl_boosted = 1;
> @@ -3727,10 +3737,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
> * Keep a potential priority boosting if called from
> * sched_setscheduler().
> */
> - if (keep_boost)
> - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
> - else
> - p->prio = normal_prio(p);
> + p->prio = normal_prio(p);
> + if (keep_boost && p->pi_task)
> + p->prio = min(p->prio, p->pi_task->prio);
>
> if (dl_prio(p->prio))
> p->sched_class = &dl_sched_class;
> @@ -4017,7 +4026,10 @@ static int __sched_setscheduler(struct task_struct *p,
> * the runqueue. This will be done when the task deboost
> * itself.
> */
> - new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
> + new_effective_prio = newprio;
> + if (p->pi_task)
> + new_effective_prio = min(new_effective_prio, p->pi_task->prio);
> +
> if (new_effective_prio == oldprio)
> queue_flags &= ~DEQUEUE_MOVE;
> }
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index affd97ec9f65..6c5aa4612eb6 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -932,7 +932,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
>
> static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
> {
> - struct task_struct *pi_task = rt_mutex_get_top_task(p);
> + struct task_struct *pi_task = p->pi_task;
> struct sched_dl_entity *pi_se = &p->dl;
>
> /*
Powered by blists - more mailing lists