Message-ID: <20160324092047.GA6375@twins.programming.kicks-ass.net>
Date: Thu, 24 Mar 2016 10:20:47 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Juri Lelli <juri.lelli@....com>
Cc: Steven Rostedt <rostedt@...dmis.org>,
luca abeni <luca.abeni@...tn.it>, linux-kernel@...r.kernel.org,
mingo@...hat.com, vincent.guittot@...aro.org,
wanpeng.li@...mail.com
Subject: Re: [PATCH 1/2] sched/deadline: add per rq tracking of admitted
bandwidth

On Thu, Feb 25, 2016 at 11:20:34AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 25, 2016 at 10:07:06AM +0000, Juri Lelli wrote:
> > Argh, this makes a lot of sense to me. I've actually pondered a tree/list
> > solution, but then decided to try the cumulative approach because it
> > looked nicer. But it contains holes, I'm afraid. As Luca already said,
> > GRUB shouldn't have these problems though.
> >
> > I'll try and see what introducing a list of blocked/throttled deadline
> > tasks means, considering also the interaction with cpusets and such.
> > Maybe it's simpler than it seems.
> >
> > I'm not sure this will come anytime soon, unfortunately. I'm almost 100%
> > on the sched-freq/schedutil discussion these days.
>
> Just skip sleep and write them when it's dark outside :-)
>
> > Anyway, do you also think that what we want in order to solve the root domain
> > issue is something based on rq_online/offline and per-rq information?
> > Everything else that I tried or thought of was broken/more horrible. :-/
>
> I was still trying to get my head around this; the above was my
> suggestion for the per-rq state, but I've not thought hard about
> alternative approaches to the root_domain issue.

So the below is the inactive list; it seems to not insta-explode when I
run a few simple dl proglets.

I don't particularly like it because it makes wakeups (esp. cross-cpu
ones) more expensive for the benefit of hotplug/cpusets, which is
something that 'never' happens.

So what I'm going to try and do is forget all about this here patch and
see what I can do with a full task-list iteration on rebuild. But I
figured that since I wrote it and it might work, I might as well post
it.
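
For anyone who wants the shape of it without reading the diff, here is a
minimal userspace sketch of the scheme (illustrative only: pthread
mutexes stand in for the raw spinlocks, and all names here are made up,
not the kernel's). A task that blocks parks its entity on the list of
the cpu it slept on; wakeup takes it back off; the per-entity cpu field
plus the retry loop is what keeps a cross-cpu dequeue safe against a
concurrent migration:

	#include <pthread.h>

	#define NR_CPUS	4

	struct entity {
		int inactive_cpu;		/* -1 while not on any list */
		struct entity *next;
	};

	static struct {
		pthread_mutex_t lock;
		struct entity *head;
	} ilist[NR_CPUS];

	static void ilist_init(void)
	{
		for (int i = 0; i < NR_CPUS; i++)
			pthread_mutex_init(&ilist[i].lock, NULL);
	}

	/* task blocks: park its entity on the list of the cpu it slept on */
	static void enqueue_inactive(struct entity *e, int cpu)
	{
		pthread_mutex_lock(&ilist[cpu].lock);
		e->inactive_cpu = cpu;
		e->next = ilist[cpu].head;
		ilist[cpu].head = e;
		pthread_mutex_unlock(&ilist[cpu].lock);
	}

	/*
	 * task wakes: take it off whatever list it is on; the cpu field
	 * can change under us if a migration moves the entity between
	 * lists, hence the re-check-and-retry dance, as in the patch.
	 */
	static void dequeue_inactive(struct entity *e)
	{
		int tmp, cpu = e->inactive_cpu;

	again:
		if (cpu == -1)
			return;

		pthread_mutex_lock(&ilist[cpu].lock);
		tmp = e->inactive_cpu;
		if (tmp != cpu) {
			/* raced with a migration; chase the entity */
			pthread_mutex_unlock(&ilist[cpu].lock);
			cpu = tmp;
			goto again;
		}

		for (struct entity **pp = &ilist[cpu].head; *pp; pp = &(*pp)->next) {
			if (*pp == e) {
				*pp = e->next;
				break;
			}
		}
		e->inactive_cpu = -1;
		pthread_mutex_unlock(&ilist[cpu].lock);
	}

A migration takes both list locks (in a fixed order, like
double_raw_lock() in the patch) and flips inactive_cpu while holding
them, which is what the retry above synchronizes against; the hotplug
path then only has to splice one such list onto an online cpu, which is
the migrate_inactive_dl() part below.
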
---
include/linux/sched.h | 5 ++
kernel/sched/core.c | 6 ++-
kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 7 ++-
5 files changed, 132 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea12c6b7..d9848eac35f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,6 +1351,11 @@ struct sched_dl_entity {
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
+
+#ifdef CONFIG_SMP
+ struct list_head dl_inactive_entry;
+ int dl_inactive_cpu;
+#endif
};
union rcu_special {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a724e1..7f3fab6349a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1162,7 +1162,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p);
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_event_task_migrate(p);
}
@@ -2077,6 +2077,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
__dl_clear_params(p);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&p->dl.dl_inactive_entry);
+#endif
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
@@ -5397,6 +5400,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
migrate_tasks(rq);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ migrate_inactive_dl(rq);
break;
case CPU_DEAD:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036facbe1..f999b8bb6fea 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -80,6 +80,9 @@ void init_dl_rq(struct dl_rq *dl_rq)
dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT;
+
+ raw_spin_lock_init(&dl_rq->dl_inactive_lock);
+ INIT_LIST_HEAD(&dl_rq->dl_inactive_list);
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
@@ -289,6 +292,62 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
return later_rq;
}
+static void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ raw_spin_lock(&dl_rq->dl_inactive_lock);
+ WRITE_ONCE(dl_se->dl_inactive_cpu, rq_of_dl_rq(dl_rq)->cpu);
+ list_add(&dl_se->dl_inactive_entry, &dl_rq->dl_inactive_list);
+ raw_spin_unlock(&dl_rq->dl_inactive_lock);
+}
+
+static void dequeue_inactive(struct sched_dl_entity *dl_se)
+{
+ int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+ struct rq *rq;
+
+again:
+ if (cpu == -1)
+ return;
+ rq = cpu_rq(cpu);
+
+ raw_spin_lock(&rq->dl.dl_inactive_lock);
+ tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+ if (cpu != tmp) {
+ cpu = tmp;
+ raw_spin_unlock(&rq->dl.dl_inactive_lock);
+ goto again;
+ }
+ list_del_init(&dl_se->dl_inactive_entry);
+ WRITE_ONCE(dl_se->dl_inactive_cpu, -1);
+ raw_spin_unlock(&rq->dl.dl_inactive_lock);
+}
+
+static void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu)
+{
+ int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+ struct rq *src_rq, *dst_rq;
+
+ dst_rq = cpu_rq(new_cpu);
+again:
+ if (cpu == -1)
+ return;
+ src_rq = cpu_rq(cpu);
+
+ double_raw_lock(&src_rq->dl.dl_inactive_lock,
+ &dst_rq->dl.dl_inactive_lock);
+ tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+ if (cpu != tmp) {
+ cpu = tmp;
+ raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+ raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+ goto again;
+ }
+ list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+ WRITE_ONCE(dl_se->dl_inactive_cpu, new_cpu);
+ raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+ raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
#else
static inline
@@ -327,6 +386,11 @@ static inline void queue_push_tasks(struct rq *rq)
static inline void queue_pull_task(struct rq *rq)
{
}
+
+static inline void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { }
+static inline void dequeue_inactive(struct sched_dl_entity *dl_se) { }
+static inline void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu) { }
+
#endif /* CONFIG_SMP */
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -960,6 +1024,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
return;
+ if (!(flags & ENQUEUE_RESTORE))
+ dequeue_inactive(&p->dl);
+
enqueue_dl_entity(&p->dl, pi_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -970,6 +1037,8 @@ static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
dequeue_dl_entity(&p->dl);
dequeue_pushable_dl_task(rq, p);
+ if (!(flags & DEQUEUE_SAVE))
+ enqueue_inactive(&p->dl, &rq->dl);
}
static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1074,6 +1143,34 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (list_empty(&dl_se->dl_inactive_entry))
+ return;
+
+ migrate_inactive(dl_se, new_cpu);
+}
+
+void migrate_inactive_dl(struct rq *src_rq)
+{
+ int cpu = cpumask_any_and(src_rq->rd->online, cpu_active_mask);
+ struct rq *dst_rq = cpu_rq(cpu);
+ struct sched_dl_entity *dl_se, *tmp;
+
+ double_raw_lock(&src_rq->dl.dl_inactive_lock,
+ &dst_rq->dl.dl_inactive_lock);
+
+ list_for_each_entry_safe(dl_se, tmp, &src_rq->dl.dl_inactive_list, dl_inactive_entry) {
+ WRITE_ONCE(dl_se->dl_inactive_cpu, cpu);
+ list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+ }
+
+ raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+ raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
#endif /* CONFIG_SMP */
/*
@@ -1211,13 +1308,19 @@ static void task_dead_dl(struct task_struct *p)
{
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ local_irq_disable();
+
/*
* Since we are TASK_DEAD we won't slip out of the domain!
*/
- raw_spin_lock_irq(&dl_b->lock);
+ raw_spin_lock(&dl_b->lock);
/* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
- raw_spin_unlock_irq(&dl_b->lock);
+ raw_spin_unlock(&dl_b->lock);
+
+ dequeue_inactive(&p->dl);
+
+ local_irq_enable();
}
static void set_curr_task_dl(struct rq *rq)
@@ -1702,7 +1805,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ if (!task_on_rq_queued(p)) {
+ dequeue_inactive(&p->dl);
+ return;
+ }
+
+ if (rq->dl.dl_nr_running)
return;
queue_pull_task(rq);
@@ -1728,6 +1836,9 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
#endif
}
+
+ if (!task_on_rq_queued(p))
+ enqueue_inactive(&p->dl, &rq->dl);
}
/*
@@ -1779,6 +1890,7 @@ const struct sched_class dl_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
+ .migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 303d6392b389..04e856a85c0f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,7 +5231,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
/*
* We are supposed to update the task to "current" time, then its up to date
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6d4a3fa3660..0de1e2894d22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,9 @@ struct dl_rq {
*/
struct rb_root pushable_dl_tasks_root;
struct rb_node *pushable_dl_tasks_leftmost;
+
+ raw_spinlock_t dl_inactive_lock;
+ struct list_head dl_inactive_list;
#else
struct dl_bw dl_bw;
#endif
@@ -776,6 +779,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+extern void migrate_inactive_dl(struct rq *src_rq);
+
static inline void
queue_balance_callback(struct rq *rq,
struct callback_head *head,
@@ -1205,7 +1210,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- void (*migrate_task_rq)(struct task_struct *p);
+ void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);