Date:	Wed,  6 Apr 2016 20:59:15 +0800
From:	Xunlei Pang <xlpang@...hat.com>
To:	linux-kernel@...r.kernel.org
Cc:	Peter Zijlstra <peterz@...radead.org>,
	Thomas Gleixner <tglx@...utronix.de>,
	Juri Lelli <juri.lelli@....com>,
	Ingo Molnar <mingo@...hat.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Xunlei Pang <xlpang@...hat.com>
Subject: [PATCH v2 1/2] sched/rtmutex/deadline: Fix a PI crash for deadline tasks

A crash happened while I was playing with the deadline PI rtmutex.

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
    PGD 232a75067 PUD 230947067 PMD 0
    Oops: 0000 [#1] SMP
    CPU: 1 PID: 10994 Comm: a.out Not tainted

    Call Trace:
    [<ffffffff810cf8aa>] ? enqueue_task_dl+0x2a/0x320
    [<ffffffff810b658c>] enqueue_task+0x2c/0x80
    [<ffffffff810ba763>] activate_task+0x23/0x30
    [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
    [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
    [<ffffffff8164e783>] __schedule+0xd3/0x900
    [<ffffffff8164efd9>] schedule+0x29/0x70
    [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
    [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
    [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
    [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
    [<ffffffff810cfa15>] ? enqueue_task_dl+0x195/0x320
    [<ffffffff810d0bac>] ? prio_changed_dl+0x6c/0x90
    [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
    [<ffffffff810edd50>] SyS_futex+0x80/0x180
    [<ffffffff8165a089>] system_call_fastpath+0x16/0x1b
    RIP  [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30

This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating on the pi waiters,
while rt_mutex_get_top_task() accesses them with the rq lock held
but without holding pi_lock.
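
Roughly, the interleaving looks like this (a simplified sketch
based on the trace above; the exact window may differ):

    CPU0 (rq lock held, no pi_lock)     CPU1 (pi_lock held)
    pull_dl_task()
      enqueue_task_dl(p)
        rt_mutex_get_top_task(p)
                                        rt_mutex_dequeue_pi(p, waiter)
          task_top_pi_waiter(p)->task   <- pi_waiters changed under us,
                                           NULL pointer dereference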

To tackle this, introduce a new pointer "pi_top_task" in
task_struct and update it in rt_mutex_setprio() to point to the
top waiter task (the waiter itself is updated under pi_lock);
rt_mutex_setprio() runs with both pi_lock and the rq lock held.
All accessors of pi_top_task are then made to run under the rq
lock (or pi_lock), which safely fixes the crash.
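
For reference, a simplified sketch of the resulting access
pattern (it mirrors the hunks below, not exact code):

    /* Writer: rt_mutex_setprio(), entered with p->pi_lock held;
     * it takes the rq lock itself, so the update is under both:
     */
    rq = __task_rq_lock(p);
    if (pi_top_task == p)
        pi_top_task = NULL;
    p->pi_top_task = pi_top_task;

    /* Reader, e.g. enqueue_task_dl(), already under the rq lock: */
    struct task_struct *pi_task = get_pi_top_task(p);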

This patch originated from Peter Zijlstra, with several tweaks
and improvements by me.

Tested-by: Xunlei Pang <xlpang@...hat.com>
Originally-From: Peter Zijlstra (Intel) <peterz@...radead.org>
Signed-off-by: Xunlei Pang <xlpang@...hat.com>
---
Without the patch, the kernel crashes within seconds; with the
patch, it survives an overnight run.

 include/linux/init_task.h       |  3 +-
 include/linux/sched.h           |  1 +
 include/linux/sched/deadline.h  | 12 +++++++
 include/linux/sched/rt.h        | 21 ++----------
 kernel/fork.c                   |  1 +
 kernel/futex.c                  |  5 ++-
 kernel/locking/rtmutex.c        | 71 +++++++++++++++--------------------------
 kernel/locking/rtmutex_common.h |  1 +
 kernel/sched/core.c             | 39 +++++++++++++++-------
 kernel/sched/deadline.c         |  2 +-
 10 files changed, 75 insertions(+), 81 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f2cb8d4..de834f3 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -162,7 +162,8 @@ extern struct task_group root_task_group;
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
 	.pi_waiters = RB_ROOT,						\
-	.pi_waiters_leftmost = NULL,
+	.pi_waiters_leftmost = NULL,					\
+	.pi_top_task = NULL,
 #else
 # define INIT_RT_MUTEXES(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea1..b4d9347 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1621,6 +1621,7 @@ struct task_struct {
 	/* PI waiters blocked on a rt_mutex held by this task */
 	struct rb_root pi_waiters;
 	struct rb_node *pi_waiters_leftmost;
+	struct task_struct *pi_top_task;
 	/* Deadlock detection and priority inheritance handling */
 	struct rt_mutex_waiter *pi_blocked_on;
 #endif
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9089a2a..9f46729 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -26,4 +26,16 @@ static inline bool dl_time_before(u64 a, u64 b)
 	return (s64)(a - b) < 0;
 }
 
+#ifdef CONFIG_RT_MUTEXES
+static inline struct task_struct *get_pi_top_task(struct task_struct *p)
+{
+	return p->pi_top_task;
+}
+#else
+static inline struct task_struct *get_pi_top_task(struct task_struct *p)
+{
+	return NULL;
+}
+#endif
+
 #endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index a30b172..4e35e94 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -16,31 +16,14 @@ static inline int rt_task(struct task_struct *p)
 }
 
 #ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+extern void rt_mutex_setprio(struct task_struct *p,
+				struct task_struct *pi_top_task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
 	return tsk->pi_blocked_on != NULL;
 }
 #else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
-	return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
-					      int newprio)
-{
-	return newprio;
-}
-
-static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
-	return NULL;
-}
 # define rt_mutex_adjust_pi(p)		do { } while (0)
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 45a9048..3ad84c9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1209,6 +1209,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 	p->pi_waiters = RB_ROOT;
 	p->pi_waiters_leftmost = NULL;
 	p->pi_blocked_on = NULL;
+	p->pi_top_task = NULL;
 #endif
 }
 
diff --git a/kernel/futex.c b/kernel/futex.c
index a5d2e74..e73145b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1326,9 +1326,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 	 * scheduled away before the wake up can take place.
 	 */
 	spin_unlock(&hb->lock);
-	wake_up_q(&wake_q);
-	if (deboost)
-		rt_mutex_adjust_prio(current);
+
+	rt_mutex_postunlock(&wake_q, deboost);
 
 	return 0;
 }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5e4294c..6c3a806 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -257,53 +257,18 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 }
 
 /*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
-	if (likely(!task_has_pi_waiters(task)))
-		return task->normal_prio;
-
-	return min(task_top_pi_waiter(task)->prio,
-		   task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
-	if (likely(!task_has_pi_waiters(task)))
-		return NULL;
-
-	return task_top_pi_waiter(task)->task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
-	if (!task_has_pi_waiters(task))
-		return newprio;
-
-	if (task_top_pi_waiter(task)->task->prio <= newprio)
-		return task_top_pi_waiter(task)->task->prio;
-	return newprio;
-}
-
-/*
  * Adjust the priority of a task, after its pi_waiters got modified.
  *
  * This can be both boosting and unboosting. task->pi_lock must be held.
  */
 static void __rt_mutex_adjust_prio(struct task_struct *task)
 {
-	int prio = rt_mutex_getprio(task);
+	struct task_struct *pi_top_task = task;
 
-	if (task->prio != prio || dl_prio(prio))
-		rt_mutex_setprio(task, prio);
+	if (unlikely(task_has_pi_waiters(task)))
+		pi_top_task = task_top_pi_waiter(task)->task;
+
+	rt_mutex_setprio(task, pi_top_task);
 }
 
 /*
@@ -1403,14 +1368,30 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 	} else {
 		bool deboost = slowfn(lock, &wake_q);
 
-		wake_up_q(&wake_q);
-
-		/* Undo pi boosting if necessary: */
-		if (deboost)
-			rt_mutex_adjust_prio(current);
+		rt_mutex_postunlock(&wake_q, deboost);
 	}
 }
 
+/*
+ * Undo pi boosting (if necessary) and wake top waiter.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost)
+{
+	/*
+	 * We should deboost before waking the high-prio task such that
+	 * we don't run two tasks with the 'same' state. This however
+	 * can lead to prio-inversion if we would get preempted after
+	 * the deboost but before waking our high-prio task, hence the
+	 * preempt_disable.
+	 */
+	preempt_disable();
+	if (deboost)
+		rt_mutex_adjust_prio(current);
+
+	wake_up_q(wake_q);
+	preempt_enable();
+}
+
 /**
  * rt_mutex_lock - lock a rt_mutex
  *
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 4f5f83c..93b0924 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -111,6 +111,7 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
 extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
 				  struct wake_q_head *wqh);
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost);
 extern void rt_mutex_adjust_prio(struct task_struct *task);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a..a533566 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3374,7 +3374,7 @@ EXPORT_SYMBOL(default_wake_function);
 /*
  * rt_mutex_setprio - set the current priority of a task
  * @p: task
- * @prio: prio value (kernel-internal form)
+ * @pi_top_task: top waiter, donating state
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -3382,13 +3382,21 @@ EXPORT_SYMBOL(default_wake_function);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_top_task)
 {
-	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
-	struct rq *rq;
+	int prio, oldprio, queued, running;
+	int queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
 	const struct sched_class *prev_class;
+	struct rq *rq;
 
-	BUG_ON(prio > MAX_PRIO);
+	/*
+	 * For FIFO/RR we simply donate prio; for DL things are
+	 * more interesting.
+	 */
+	/* XXX used to be waiter->prio, not waiter->task->prio */
+	prio = min(pi_top_task->prio, p->normal_prio);
+	if (p->prio == prio && !dl_prio(prio))
+		return;
 
 	rq = __task_rq_lock(p);
 
@@ -3424,6 +3432,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (running)
 		put_prev_task(rq, p);
 
+	if (pi_top_task == p)
+		pi_top_task = NULL;
+	p->pi_top_task = pi_top_task;
+
 	/*
 	 * Boosting condition are:
 	 * 1. -rt task is running and holds mutex A
@@ -3434,9 +3446,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	 *          running task
 	 */
 	if (dl_prio(prio)) {
-		struct task_struct *pi_task = rt_mutex_get_top_task(p);
 		if (!dl_prio(p->normal_prio) ||
-		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+		    (pi_top_task &&
+			    dl_entity_preempt(&pi_top_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
 			queue_flag |= ENQUEUE_REPLENISH;
 		} else
@@ -3709,10 +3721,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 	 * Keep a potential priority boosting if called from
 	 * sched_setscheduler().
 	 */
-	if (keep_boost)
-		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
-	else
-		p->prio = normal_prio(p);
+	p->prio = normal_prio(p);
+	if (keep_boost && get_pi_top_task(p))
+		p->prio = min(p->prio, get_pi_top_task(p)->prio);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -3999,7 +4010,11 @@ change:
 		 * the runqueue. This will be done when the task deboost
 		 * itself.
 		 */
-		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		new_effective_prio = newprio;
+		if (get_pi_top_task(p))
+			new_effective_prio =
+			    min(new_effective_prio, get_pi_top_task(p)->prio);
+
 		if (new_effective_prio == oldprio)
 			queue_flags &= ~DEQUEUE_MOVE;
 	}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036f..e564c88 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -928,7 +928,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
-	struct task_struct *pi_task = rt_mutex_get_top_task(p);
+	struct task_struct *pi_task = get_pi_top_task(p);
 	struct sched_dl_entity *pi_se = &p->dl;
 
 	/*
-- 
1.8.3.1
