linux-kernel - Re: [PATCH RFC] reduce runqueue lock contention

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1277117647.1875.503.camel@laptop>
Date:	Mon, 21 Jun 2010 12:54:07 +0200
From:	Peter Zijlstra <peterz@...radead.org>
To:	Chris Mason <chris.mason@...cle.com>
Cc:	Ingo Molnar <mingo@...e.hu>, axboe@...nel.dk,
	linux-kernel@...r.kernel.org, Mike Galbraith <efault@....de>,
	Oleg Nesterov <oleg@...hat.com>, tglx <tglx@...utronix.de>
Subject: Re: [PATCH RFC] reduce runqueue lock contention

On Mon, 2010-06-21 at 12:02 +0200, Peter Zijlstra wrote:
> To return the favour, here's a scary patch that renders your machine a
> doorstop :-) Sadly it just hangs, no splat, no sysrq, no nmi, no tripple
> fault.
> 
> It looses the ttwu task_running() check, as I must admit I'm not quite
> sure what it does.. Ingo?
> 
> It makes the whole TASK_WAKING thing very interesting again :-)
> 
> It also re-introduces a bunch of races because we now need to run
> ->select_task_rq() without holding the rq->lock. We cannot defer running
> that because it really wants to know the cpu the wakeup originated from
> (affine wakeups and the like).


Progress, this one gave spectacular fireworks ;-)

---
 arch/x86/kernel/smp.c   |    1 +
 include/linux/sched.h   |    7 +-
 kernel/sched.c          |  247 +++++++++++++++++++++++++----------------------
 kernel/sched_fair.c     |   10 +-
 kernel/sched_idletask.c |    2 +-
 kernel/sched_rt.c       |    4 +-
 6 files changed, 143 insertions(+), 128 deletions(-)

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d801210..1e487d6 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -202,6 +202,7 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
+	sched_ttwu_pending();
 }
 
 void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a61c08c..9ae9fdb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1003,6 +1003,7 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 }
 #endif	/* !CONFIG_SMP */
 
+void sched_ttwu_pending(void);
 
 struct io_context;			/* See blkdev.h */
 
@@ -1046,12 +1047,11 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct rq *rq, struct task_struct *p,
-			       int sd_flag, int flags);
+	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
-	void (*task_waking) (struct rq *this_rq, struct task_struct *task);
+	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
@@ -1174,6 +1174,7 @@ struct task_struct {
 	int lock_depth;		/* BKL lock depth */
 
 #ifdef CONFIG_SMP
+	struct task_struct *wake_entry;
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	int oncpu;
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index b697606..5fe442c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -518,6 +518,8 @@ struct rq {
 	u64 age_stamp;
 	u64 idle_stamp;
 	u64 avg_idle;
+
+	struct task_struct *wake_list;
 #endif
 
 	/* calc_load related fields */
@@ -944,7 +946,7 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p)) && !task_is_waking(p))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 	}
@@ -964,7 +966,7 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 		local_irq_save(*flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p)) && !task_is_waking(p))
 			return rq;
 		raw_spin_unlock_irqrestore(&rq->lock, *flags);
 	}
@@ -2257,9 +2259,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -2285,9 +2287,8 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-				 bool is_sync, bool is_migrate, bool is_local,
-				 unsigned long en_flags)
+static inline void
+ttwu_stat(struct task_struct *p, bool is_sync, bool is_migrate, bool is_local)
 {
 	schedstat_inc(p, se.statistics.nr_wakeups);
 	if (is_sync)
@@ -2298,8 +2299,6 @@ static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
 		schedstat_inc(p, se.statistics.nr_wakeups_local);
 	else
 		schedstat_inc(p, se.statistics.nr_wakeups_remote);
-
-	activate_task(rq, p, en_flags);
 }
 
 static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
@@ -2330,111 +2329,6 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 }
 
 /**
- * try_to_wake_up - wake up a thread
- * @p: the thread to be awakened
- * @state: the mask of task states that can be woken
- * @wake_flags: wake modifier flags (WF_*)
- *
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
- *
- * Returns %true if @p was woken up, %false if it was already running
- * or @state didn't match @p's state.
- */
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
-			  int wake_flags)
-{
-	int cpu, orig_cpu, this_cpu, success = 0;
-	unsigned long flags;
-	unsigned long en_flags = ENQUEUE_WAKEUP;
-	struct rq *rq;
-
-	this_cpu = get_cpu();
-
-	smp_wmb();
-	rq = task_rq_lock(p, &flags);
-	if (!(p->state & state))
-		goto out;
-
-	if (p->se.on_rq)
-		goto out_running;
-
-	cpu = task_cpu(p);
-	orig_cpu = cpu;
-
-#ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
-		goto out_activate;
-
-	/*
-	 * In order to handle concurrent wakeups and release the rq->lock
-	 * we put the task in TASK_WAKING state.
-	 *
-	 * First fix up the nr_uninterruptible count:
-	 */
-	if (task_contributes_to_load(p)) {
-		if (likely(cpu_online(orig_cpu)))
-			rq->nr_uninterruptible--;
-		else
-			this_rq()->nr_uninterruptible--;
-	}
-	p->state = TASK_WAKING;
-
-	if (p->sched_class->task_waking) {
-		p->sched_class->task_waking(rq, p);
-		en_flags |= ENQUEUE_WAKING;
-	}
-
-	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
-		set_task_cpu(p, cpu);
-	__task_rq_unlock(rq);
-
-	rq = cpu_rq(cpu);
-	raw_spin_lock(&rq->lock);
-
-	/*
-	 * We migrated the task without holding either rq->lock, however
-	 * since the task is not on the task list itself, nobody else
-	 * will try and migrate the task, hence the rq should match the
-	 * cpu we just moved it to.
-	 */
-	WARN_ON(task_cpu(p) != cpu);
-	WARN_ON(p->state != TASK_WAKING);
-
-#ifdef CONFIG_SCHEDSTATS
-	schedstat_inc(rq, ttwu_count);
-	if (cpu == this_cpu)
-		schedstat_inc(rq, ttwu_local);
-	else {
-		struct sched_domain *sd;
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
-		}
-	}
-#endif /* CONFIG_SCHEDSTATS */
-
-out_activate:
-#endif /* CONFIG_SMP */
-	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-		      cpu == this_cpu, en_flags);
-	success = 1;
-out_running:
-	ttwu_post_activation(p, rq, wake_flags, success);
-out:
-	task_rq_unlock(rq, &flags);
-	put_cpu();
-
-	return success;
-}
-
-/**
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
@@ -2459,12 +2353,131 @@ static void try_to_wake_up_local(struct task_struct *p)
 			schedstat_inc(rq, ttwu_count);
 			schedstat_inc(rq, ttwu_local);
 		}
-		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+		ttwu_stat(p, false, false, true);
+		activate_task(rq, p, ENQUEUE_WAKEUP);
 		success = true;
 	}
 	ttwu_post_activation(p, rq, 0, success);
 }
 
+static int ttwu_running(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+
+	for (;;) {
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		if (likely(rq == task_rq(p)))
+			break;
+		raw_spin_unlock(&rq->lock);
+	}
+
+	if (!p->se.on_rq) {
+		raw_spin_unlock(&rq->lock);
+		return 0;
+	}
+
+	ttwu_post_activation(p, rq, 0, true);
+	return 1;
+}
+
+static void ttwu_do_activate(struct rq *rq, struct task_struct *p, bool local)
+{
+	set_task_cpu(p, cpu_of(rq));
+	activate_task(rq, p, ENQUEUE_WAKEUP
+#ifdef CONFIG_SMP
+			     | ENQUEUE_WAKING
+#endif
+			);
+	ttwu_post_activation(p, rq, 0, true);
+}
+
+void sched_ttwu_pending(void)
+{
+#ifdef CONFIG_SMP
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	raw_spin_lock(&rq->lock);
+
+	while (list) {
+		struct task_struct *p = list;
+		list = list->wake_entry;
+		ttwu_do_activate(rq, p, false);
+	}
+
+	raw_spin_unlock(&rq->lock);
+#endif
+}
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+	struct task_struct *next = NULL;
+	struct rq *rq = cpu_rq(cpu);
+
+	if (cpu == smp_processor_id()) {
+		raw_spin_lock(&rq->lock);
+		ttwu_do_activate(rq, p, true);
+		raw_spin_unlock(&rq->lock);
+		return;
+	}
+
+#ifdef CONFIG_SMP
+	for (;;) {
+		struct task_struct *old = next;
+
+		p->wake_entry = next;
+		next = cmpxchg(&rq->wake_list, old, p);
+		if (next == old)
+			break;
+	}
+
+	if (!next)
+		smp_send_reschedule(cpu);
+#endif
+}
+
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+	unsigned long flags;
+	int success = 0;
+	int cpu = task_cpu(p);
+
+	local_irq_save(flags);
+	for (;;) {
+		unsigned int task_state = p->state;
+
+		if (!(task_state & state))
+			goto out;
+
+		if (cmpxchg(&p->state, task_state, TASK_WAKING) == task_state)
+			break;
+	}
+
+	success = 1;
+
+	if (unlikely(p->se.on_rq) && ttwu_running(p, wake_flags))
+		goto out;
+
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_waking)
+		p->sched_class->task_waking(p);
+
+	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+#endif
+	ttwu_stat(p, wake_flags & WF_SYNC,		/* sync */
+		     cpu != task_cpu(p),		/* migrate */
+		     cpu == smp_processor_id());	/* local */
+	ttwu_queue(p, cpu);
+out:
+	local_irq_restore(flags);
+	return success;
+}
+
 /**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
@@ -2604,7 +2617,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
 	 * without people poking at ->cpus_allowed.
 	 */
-	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
+	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
 	set_task_cpu(p, cpu);
 
 	p->state = TASK_RUNNING;
@@ -3200,7 +3213,7 @@ void sched_exec(void)
 	int dest_cpu;
 
 	rq = task_rq_lock(p, &flags);
-	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5e8f98c..2bd145f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1128,11 +1128,12 @@ static void yield_task_fair(struct rq *rq)
 
 #ifdef CONFIG_SMP
 
-static void task_waking_fair(struct rq *rq, struct task_struct *p)
+static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
+	// XXX racy again,.. we need to read cfs_rq->min_vruntime atomically
 	se->vruntime -= cfs_rq->min_vruntime;
 }
 
@@ -1443,7 +1444,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -1516,11 +1517,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
 			tmp = affine_sd;
 
-		if (tmp) {
-			raw_spin_unlock(&rq->lock);
+		if (tmp)
 			update_shares(tmp);
-			raw_spin_lock(&rq->lock);
-		}
 	}
 #endif
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f40..efaed8c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80e..1011cdc 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -949,8 +949,10 @@ static void yield_task_rt(struct rq *rq)
 static int find_lowest_rq(struct task_struct *task);
 
 static int
-select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 {
+	struct rq *rq = task_rq(p); // XXX  racy
+
 	if (sd_flag != SD_BALANCE_WAKE)
 		return smp_processor_id();
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/