linux-kernel - Re: [PATCH v4 2/2] sched/rtmutex/deadline: Fix a PI crash for deadline tasks

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <574C0A89.0@redhat.com>
Date:	Mon, 30 May 2016 17:40:25 +0800
From:	Xunlei Pang <xpang@...hat.com>
To:	Xunlei Pang <xlpang@...hat.com>, linux-kernel@...r.kernel.org
Cc:	Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@....com>,
	Ingo Molnar <mingo@...hat.com>,
	Steven Rostedt <rostedt@...dmis.org>
Subject: Re: [PATCH v4 2/2] sched/rtmutex/deadline: Fix a PI crash for
 deadline tasks

Ping
Could someone have some comments on this bug?

On 2016/04/26 at 16:30, Xunlei Pang wrote:
> A crash happened while I was playing with deadline PI rtmutex.
>
>     BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
>     IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>     PGD 232a75067 PUD 230947067 PMD 0
>     Oops: 0000 [#1] SMP
>     CPU: 1 PID: 10994 Comm: a.out Not tainted
>
>     Call Trace:
>     [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
>     [<ffffffff810b658c>] enqueue_task+0x2c/0x80
>     [<ffffffff810ba763>] activate_task+0x23/0x30
>     [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
>     [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
>     [<ffffffff8164e783>] __schedule+0xd3/0x900
>     [<ffffffff8164efd9>] schedule+0x29/0x70
>     [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
>     [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
>     [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
>     [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
>     [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
>     [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
>     [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
>     [<ffffffff810edd50>] SyS_futex+0x80/0x180
>     [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
>     RIP  [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
>
> This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
> are only protected by pi_lock when operating pi waiters, while
> rt_mutex_get_top_task(), will access them with rq lock held but
> not holding pi_lock.
>
> In order to tackle it, we introduce new "pi_top_task" pointer
> cached in task_struct, and add new rt_mutex_update_top_task()
> to update its value, it can be called by rt_mutex_setprio()
> which held both owner's pi_lock and rq lock. Thus "pi_top_task"
> can be safely accessed by enqueue_task_dl() under rq lock.
>
> One problem is when rt_mutex_adjust_prio()->...->rt_mutex_setprio(),
> at that time rtmutex lock was released and owner was marked off,
> this can cause "pi_top_task" dereferenced to be a running one(as it
> can be falsely woken up by others before rt_mutex_setprio() is
> made to update "pi_top_task"). We solve this by directly calling
> __rt_mutex_adjust_prio() in mark_wakeup_next_waiter() which held
> pi_lock and rtmutex lock, and remove rt_mutex_adjust_prio(). Since
> now we moved the deboost point, in order to avoid current to be
> preempted due to deboost earlier before wake_up_q(), we also moved
> preempt_disable() before unlocking rtmutex.
>
> Originally-From: Peter Zijlstra <peterz@...radead.org>
> Signed-off-by: Xunlei Pang <xlpang@...hat.com>
> ---
> v3 -> v4:
> Cache a task_struct pi_top_task pointer instead of rt_mutex_waiter.
>
>  include/linux/init_task.h |  1 +
>  include/linux/sched.h     |  2 ++
>  include/linux/sched/rt.h  |  1 +
>  kernel/fork.c             |  1 +
>  kernel/locking/rtmutex.c  | 65 ++++++++++++++++++++---------------------------
>  kernel/sched/core.c       |  2 ++
>  6 files changed, 34 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index f2cb8d4..ca088b1 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -162,6 +162,7 @@ extern struct task_group root_task_group;
>  #ifdef CONFIG_RT_MUTEXES
>  # define INIT_RT_MUTEXES(tsk)						\
>  	.pi_waiters = RB_ROOT,						\
> +	.pi_top_task = NULL,						\
>  	.pi_waiters_leftmost = NULL,
>  #else
>  # define INIT_RT_MUTEXES(tsk)
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 45e848c..322c0ad 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1622,6 +1622,8 @@ struct task_struct {
>  	/* PI waiters blocked on a rt_mutex held by this task */
>  	struct rb_root pi_waiters;
>  	struct rb_node *pi_waiters_leftmost;
> +	/* Updated under owner's pi_lock and rq lock */
> +	struct task_struct *pi_top_task;
>  	/* Deadlock detection and priority inheritance handling */
>  	struct rt_mutex_waiter *pi_blocked_on;
>  #endif
> diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
> index a30b172..60d0c47 100644
> --- a/include/linux/sched/rt.h
> +++ b/include/linux/sched/rt.h
> @@ -19,6 +19,7 @@ static inline int rt_task(struct task_struct *p)
>  extern int rt_mutex_getprio(struct task_struct *p);
>  extern void rt_mutex_setprio(struct task_struct *p, int prio);
>  extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
> +extern void rt_mutex_update_top_task(struct task_struct *p);
>  extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
>  extern void rt_mutex_adjust_pi(struct task_struct *p);
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
> diff --git a/kernel/fork.c b/kernel/fork.c
> index a453546..cb01dfc 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1225,6 +1225,7 @@ static void rt_mutex_init_task(struct task_struct *p)
>  #ifdef CONFIG_RT_MUTEXES
>  	p->pi_waiters = RB_ROOT;
>  	p->pi_waiters_leftmost = NULL;
> +	p->pi_top_task = NULL;
>  	p->pi_blocked_on = NULL;
>  #endif
>  }
> diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
> index d87f99e..969e436 100644
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -256,6 +256,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
>  	RB_CLEAR_NODE(&waiter->pi_tree_entry);
>  }
>  
> +void rt_mutex_update_top_task(struct task_struct *p)
> +{
> +	if (!task_has_pi_waiters(p)) {
> +		p->pi_top_task = NULL;
> +		return;
> +	}
> +
> +	p->pi_top_task = task_top_pi_waiter(p)->task;
> +}
> +
>  /*
>   * Calculate task priority from the waiter tree priority
>   *
> @@ -273,10 +283,7 @@ int rt_mutex_getprio(struct task_struct *task)
>  
>  struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
>  {
> -	if (likely(!task_has_pi_waiters(task)))
> -		return NULL;
> -
> -	return task_top_pi_waiter(task)->task;
> +	return task->pi_top_task;
>  }
>  
>  /*
> @@ -285,12 +292,12 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
>   */
>  int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
>  {
> -	if (!task_has_pi_waiters(task))
> +	struct task_struct *top_task = rt_mutex_get_top_task(task);
> +
> +	if (!top_task)
>  		return newprio;
>  
> -	if (task_top_pi_waiter(task)->task->prio <= newprio)
> -		return task_top_pi_waiter(task)->task->prio;
> -	return newprio;
> +	return min(top_task->prio, newprio);
>  }
>  
>  /*
> @@ -307,24 +314,6 @@ static void __rt_mutex_adjust_prio(struct task_struct *task)
>  }
>  
>  /*
> - * Adjust task priority (undo boosting). Called from the exit path of
> - * rt_mutex_slowunlock() and rt_mutex_slowlock().
> - *
> - * (Note: We do this outside of the protection of lock->wait_lock to
> - * allow the lock to be taken while or before we readjust the priority
> - * of task. We do not use the spin_xx_mutex() variants here as we are
> - * outside of the debug path.)
> - */
> -void rt_mutex_adjust_prio(struct task_struct *task)
> -{
> -	unsigned long flags;
> -
> -	raw_spin_lock_irqsave(&task->pi_lock, flags);
> -	__rt_mutex_adjust_prio(task);
> -	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
> -}
> -
> -/*
>   * Deadlock detection is conditional:
>   *
>   * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
> @@ -987,6 +976,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
>  	 * lock->wait_lock.
>  	 */
>  	rt_mutex_dequeue_pi(current, waiter);
> +	__rt_mutex_adjust_prio(current);
>  
>  	/*
>  	 * As we are waking up the top waiter, and the waiter stays
> @@ -1325,6 +1315,16 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
>  	 */
>  	mark_wakeup_next_waiter(wake_q, lock);
>  
> +	/*
> +	 * We should deboost before waking the top waiter task such that
> +	 * we don't run two tasks with the 'same' priority. This however
> +	 * can lead to prio-inversion if we would get preempted after
> +	 * the deboost but before waking our high-prio task, hence the
> +	 * preempt_disable before unlock. Pairs with preempt_enable() in
> +	 * rt_mutex_postunlock();
> +	 */
> +	preempt_disable();
> +
>  	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
>  
>  	/* check PI boosting */
> @@ -1400,20 +1400,9 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
>   */
>  void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost)
>  {
> -	/*
> -	 * We should deboost before waking the top waiter task such that
> -	 * we don't run two tasks with the 'same' priority. This however
> -	 * can lead to prio-inversion if we would get preempted after
> -	 * the deboost but before waking our high-prio task, hence the
> -	 * preempt_disable.
> -	 */
> -	if (deboost) {
> -		preempt_disable();
> -		rt_mutex_adjust_prio(current);
> -	}
> -
>  	wake_up_q(wake_q);
>  
> +	/* Pairs with preempt_disable() in rt_mutex_slowunlock() */
>  	if (deboost)
>  		preempt_enable();
>  }
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 1159423..60f8166 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3480,6 +3480,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
>  		goto out_unlock;
>  	}
>  
> +	rt_mutex_update_top_task(p);
> +
>  	trace_sched_pi_setprio(p, prio);
>  	oldprio = p->prio;
>