linux-kernel - Re: sched_setscheduler() vs idle

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1433054344.6545.20.camel@gmail.com>
Date:	Sun, 31 May 2015 08:39:04 +0200
From:	Mike Galbraith <umgwanakikbuti@...il.com>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	LKML <linux-kernel@...r.kernel.org>, Ingo Molnar <mingo@...e.hu>,
	ktkhai@...allels.com
Subject: Re: sched_setscheduler() vs idle_balance() race

On Sat, 2015-05-30 at 15:08 +0200, Mike Galbraith wrote:
> On Thu, 2015-05-28 at 15:53 +0200, Peter Zijlstra wrote:
> > On Thu, May 28, 2015 at 09:43:52AM +0200, Mike Galbraith wrote:
> > > Hi Peter,
> > > 
> > > I'm not seeing what prevents pull_task() from yanking a task out from
> > > under __sched_setscheduler().  A box sprinkling smoldering 3.0 kernel
> > > wreckage all over my bugzilla mbox isn't seeing it either ;-)
> > 
> > Say, how easy can that thing be reproduced?
> > 
> > The below is compile tested only, but it might just work if I didn't
> > miss anything :-)
> 
> Seems trying to make the target invisible to balancing created a new
> race: dequeue target, do stuff that may drop rq->lock while it's
> dequeued, target sneaks into schedule(), dequeues itself (#2), boom.

Well, the below (ick) plugged it up, but...

I don't see why we can't just say no in can_migrate_task() if ->pi_lock
is held.  It plugged the original hole in a lot less lines.

Hohum, time to go pretend I have something better to do on a sunny
Sunday morning ;-)

  massive_intr_x-6213  [007] d...   170.579339: pull_rt_task: yup, pulled
  massive_intr_x-6213  [002] d...   170.580114: pull_rt_task: yup, pulled
  massive_intr_x-6213  [006] d...   170.586083: pull_rt_task: yup, pulled
           <...>-6237  [006] d...   170.593878: __schedule: saving the day

---
 kernel/sched/core.c     |   43 +++++++++++++++++++++++++++++++++++--------
 kernel/sched/deadline.c |    6 +++---
 kernel/sched/rt.c       |   11 +++++++++--
 kernel/sched/sched.h    |   10 +++++++++-
 4 files changed, 56 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2745,9 +2745,18 @@ static void __sched __schedule(void)
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
 	 * done by the caller to avoid the race with signal_wake_up().
 	 */
+dequeued:
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
 
+	if (unlikely(!task_on_rq_queued(prev))) {
+		trace_printk("saving the day\n");
+		tracing_off();
+		raw_spin_unlock_irq(&rq->lock);
+		cpu_relax();
+		goto dequeued;
+	}
+
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
 	switch_count = &prev->nivcsw;
@@ -3013,8 +3022,10 @@ void rt_mutex_setprio(struct task_struct
 	prev_class = p->sched_class;
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3067,8 +3078,10 @@ void rt_mutex_setprio(struct task_struct
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, enqueue_flag);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+	}
 
 	/*
 	 * Both switched_to() and prio_changed() are allowed to drop @rq->lock;
@@ -3114,8 +3127,10 @@ void set_user_nice(struct task_struct *p
 		goto out_unlock;
 	}
 	queued = task_on_rq_queued(p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3125,6 +3140,7 @@ void set_user_nice(struct task_struct *p
 
 	if (queued) {
 		enqueue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3628,8 +3644,10 @@ static int __sched_setscheduler(struct t
 
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3656,6 +3674,7 @@ static int __sched_setscheduler(struct t
 		 * increased (user space view).
 		 */
 		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
 	}
 
 	/*
@@ -4943,8 +4962,10 @@ void sched_setnuma(struct task_struct *p
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (running)
 		put_prev_task(rq, p);
 
@@ -4952,8 +4973,10 @@ void sched_setnuma(struct task_struct *p
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, p, 0);
+		p->on_rq = TASK_ON_RQ_QUEUED;
+	}
 	task_rq_unlock(rq, p, &flags);
 }
 #endif
@@ -7587,8 +7610,10 @@ void sched_move_task(struct task_struct
 	running = task_current(rq, tsk);
 	queued = task_on_rq_queued(tsk);
 
-	if (queued)
+	if (queued) {
 		dequeue_task(rq, tsk, 0);
+		tsk->on_rq = TASK_ON_RQ_DEQUEUED;
+	}
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7611,8 +7636,10 @@ void sched_move_task(struct task_struct
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
-	if (queued)
+	if (queued) {
 		enqueue_task(rq, tsk, 0);
+		tsk->on_rq = TASK_ON_RQ_QUEUED;
+	}
 
 	task_rq_unlock(rq, tsk, &flags);
 }
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -607,7 +607,7 @@ static enum hrtimer_restart dl_task_time
 	 * We can be both throttled and !queued. Replenish the counter
 	 * but do not enqueue -- wait for our wakeup to do that.
 	 */
-	if (!task_on_rq_queued(p)) {
+	if (!task_on_rq_queued_or_dequeued(p)) {
 		replenish_dl_entity(dl_se, dl_se);
 		goto unlock;
 	}
@@ -1526,7 +1526,7 @@ static int pull_dl_task(struct rq *this_
 		     dl_time_before(p->dl.deadline,
 				    this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!task_on_rq_queued(p));
+			WARN_ON(!task_on_rq_queued_or_dequeued(p));
 
 			/*
 			 * Then we pull iff p has actually an earlier
@@ -1707,7 +1707,7 @@ static void switched_from_dl(struct rq *
 	 * this is the right place to try to pull some other one
 	 * from an overloaded cpu, if any.
 	 */
-	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+	if (!task_on_rq_queued_or_dequeued(p) || rq->dl.dl_nr_running)
 		return;
 
 	if (pull_dl_task(rq))
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1656,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(st
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
-				     !task_on_rq_queued(task))) {
+				     !task_on_rq_queued_or_dequeued(task))) {
 
 				double_unlock_balance(rq, lowest_rq);
 				lowest_rq = NULL;
@@ -1953,10 +1953,14 @@ static int pull_rt_task(struct rq *this_
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
 	struct task_struct *p;
 	struct rq *src_rq;
+	int task_dequeued = 0;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
+	if (this_rq->curr->on_rq == TASK_ON_RQ_DEQUEUED)
+		task_dequeued = 1;
+
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
 	 * see overloaded we must also see the rto_mask bit.
@@ -2035,6 +2039,9 @@ static int pull_rt_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
+	if (ret && task_dequeued)
+		trace_printk("yup, pulled\n");
+
 	return ret;
 }
 
@@ -2133,7 +2140,7 @@ static void switched_from_rt(struct rq *
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued_or_dequeued(p) || rq->rt.rt_nr_running)
 		return;
 
 	if (pull_rt_task(rq))
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -19,7 +19,8 @@ struct cpuidle_state;
 
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
-#define TASK_ON_RQ_MIGRATING	2
+#define TASK_ON_RQ_DEQUEUED	2
+#define TASK_ON_RQ_MIGRATING	3
 
 extern __read_mostly int scheduler_running;
 
@@ -1034,6 +1035,13 @@ static inline int task_on_rq_queued(stru
 	return p->on_rq == TASK_ON_RQ_QUEUED;
 }
 
+static inline int task_on_rq_queued_or_dequeued(struct task_struct *p)
+{
+	if (p->on_rq == TASK_ON_RQ_QUEUED)
+		return 1;
+	return p->on_rq == TASK_ON_RQ_DEQUEUED;
+}
+
 static inline int task_on_rq_migrating(struct task_struct *p)
 {
 	return p->on_rq == TASK_ON_RQ_MIGRATING;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/