Message-Id: <1490982277-4437-3-git-send-email-a.balsini@sssup.it>
Date:   Fri, 31 Mar 2017 19:44:36 +0200
From:   Alessio Balsini <a.balsini@...up.it>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>
Cc:     Alessio Balsini <a.balsini@...up.it>,
        Tommaso Cucinotta <tommaso.cucinotta@...up.it>,
        Juri Lelli <juri.lelli@....com>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Andrea Parri <parri.andrea@...il.com>,
        Luca Abeni <luca.abeni@...tannapisa.it>,
        linux-kernel@...r.kernel.org
Subject: [RFC PATCH 2/3] sched/deadline: Hierarchical scheduling with DL on top of RT

The runtime of RT tasks controlled by cgroups is enforced by the
SCHED_DEADLINE scheduling class, based on the group's runtime and period
parameters (the deadline is set equal to the period).

A sched_dl_entity may also represent a group of RT tasks, providing an rt_rq.
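
As an illustration, here is a minimal userspace sketch of how a group would
be configured with this series applied (assuming a cgroup-v1 "cpu"
controller mounted at /sys/fs/cgroup/cpu and CONFIG_RT_GROUP_SCHED=y; the
"rtgroup" directory name is only an example). Writes to cpu.rt_period_us
and cpu.rt_runtime_us reach tg_set_rt_bandwidth(), which with this patch
also programs the per-CPU sched_dl_entity of the group:

  #include <errno.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/stat.h>

  static void write_val(const char *path, const char *val)
  {
          FILE *f = fopen(path, "w");

          /* Bail out on any failure; the paths above are assumptions. */
          if (!f || fputs(val, f) == EOF || fclose(f) == EOF) {
                  perror(path);
                  exit(1);
          }
  }

  int main(void)
  {
          /* Only one level below the root group is accepted by this patch. */
          if (mkdir("/sys/fs/cgroup/cpu/rtgroup", 0755) && errno != EEXIST) {
                  perror("mkdir");
                  return 1;
          }

          /* 10ms of RT runtime every 100ms, enforced per CPU as a DL reservation. */
          write_val("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_period_us", "100000");
          write_val("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", "10000");

          /* Move the calling process into the group; its RT tasks now consume the budget. */
          write_val("/sys/fs/cgroup/cpu/rtgroup/cgroup.procs", "0");

          return 0;
  }

Deeper hierarchies are rejected with -EINVAL by cpu_cgroup_css_alloc() in
this patch, so the sketch only creates one level below the root group.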

Signed-off-by: Andrea Parri <parri.andrea@...il.com>
Signed-off-by: Luca Abeni <luca.abeni@...tannapisa.it>
Cc: Tommaso Cucinotta <tommaso.cucinotta@...up.it>
Cc: Juri Lelli <juri.lelli@....com>
Cc: Daniel Bristot de Oliveira <bristot@...hat.com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Signed-off-by: Alessio Balsini <a.balsini@...up.it>
---
 include/linux/sched.h    |  13 +-
 kernel/sched/autogroup.c |   4 +-
 kernel/sched/core.c      |  86 ++++++++--
 kernel/sched/deadline.c  | 174 ++++++++++++++++----
 kernel/sched/rt.c        | 407 ++++++++++++++++++++++++++++++-----------------
 kernel/sched/sched.h     | 142 +++++++++++++++--
 6 files changed, 611 insertions(+), 215 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee8..fdd62f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,7 +402,7 @@ struct sched_rt_entity {
 
 	struct sched_rt_entity		*back;
 #ifdef CONFIG_RT_GROUP_SCHED
-	struct sched_rt_entity		*parent;
+	struct sched_dl_entity		*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct rt_rq			*rt_rq;
 	/* rq "owned" by this entity/group: */
@@ -455,6 +455,17 @@ struct sched_dl_entity {
 	 * own bandwidth to be enforced, thus we need one timer per task.
 	 */
 	struct hrtimer			dl_timer;
+
+/*
+ * An instance of a sched_dl_entity may represent a group of tasks, therefore
+ * it requires:
+ * - dl_rq: the rq on which this entity is queued;
+ * - rt_rq: the rq owned by this entity;
+ */
+#ifdef CONFIG_RT_GROUP_SCHED
+	struct dl_rq			*dl_rq;
+	struct rt_rq			*my_q;
+#endif
 };
 
 union rcu_special {
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489..e14acb4 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -30,7 +30,7 @@ static inline void autogroup_destroy(struct kref *kref)
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	/* We've redirected RT tasks to the root task group... */
-	ag->tg->rt_se = NULL;
+	ag->tg->dl_se = NULL;
 	ag->tg->rt_rq = NULL;
 #endif
 	sched_offline_group(ag->tg);
@@ -88,7 +88,7 @@ static inline struct autogroup *autogroup_create(void)
 	 * the policy change to proceed.
 	 */
 	free_rt_sched_group(tg);
-	tg->rt_se = root_task_group.rt_se;
+	tg->dl_se = root_task_group.dl_se;
 	tg->rt_rq = root_task_group.rt_rq;
 #endif
 	tg->autogroup = ag;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d4cce4..b139719 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -902,6 +902,9 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
 	const struct sched_class *class;
 
+	if (is_dl_group(rt_rq_of_se(&p->rt)) && task_has_rt_policy(p))
+		resched_curr(rq);
+
 	if (p->sched_class == rq->curr->sched_class) {
 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 	} else {
@@ -2165,6 +2168,9 @@ void __dl_clear_params(struct task_struct *p)
 
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
+#ifdef CONFIG_RT_GROUP_SCHED
+	dl_se->my_q = NULL;
+#endif
 }
 
 /*
@@ -4261,7 +4267,8 @@ static int __sched_setscheduler(struct task_struct *p,
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
 		 */
-		if (rt_policy(policy) &&
+		if (dl_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->dl_bandwidth.dl_runtime == 0 &&
 				!task_group_is_autogroup(task_group(p))) {
 			task_rq_unlock(rq, p, &rf);
 			return -EPERM;
@@ -5987,7 +5994,7 @@ void __init sched_init(void)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
-		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+		root_task_group.dl_se = (struct sched_dl_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 
 		root_task_group.rt_rq = (struct rt_rq **)ptr;
@@ -6010,6 +6017,10 @@ void __init sched_init(void)
 	init_defrootdomain();
 #endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	init_dl_bandwidth(&root_task_group.dl_bandwidth,
+			global_rt_period(), global_rt_runtime());
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
 	task_group_cache = KMEM_CACHE(task_group, 0);
@@ -6460,9 +6471,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	struct task_group *child;
 	unsigned long total, sum = 0;
 	u64 period, runtime;
+	unsigned long flags;
 
-	period = 0;
-	runtime = 0;
+	period  = tg->dl_bandwidth.dl_period;
+	runtime = tg->dl_bandwidth.dl_runtime;
 
 	if (tg == d->tg) {
 		period = d->rt_period;
@@ -6478,7 +6490,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	/*
 	 * Ensure we don't starve existing RT tasks.
 	 */
-	if (!runtime && tg_has_rt_tasks(tg))
+	if (dl_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
@@ -6489,12 +6501,27 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
 		return -EINVAL;
 
+	if (tg == &root_task_group) {
+		int cpus = num_online_cpus();
+		struct dl_bw *dl_b = dl_bw_of(smp_processor_id());
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+
+		if (dl_b->bw != -1 &&
+		    dl_b->bw * cpus < dl_b->total_bw + total * cpus) {
+			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+			return -EBUSY;
+		}
+
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+	}
+
 	/*
 	 * The sum of our children's runtime should not exceed our own.
 	 */
 	list_for_each_entry_rcu(child, &tg->children, siblings) {
-		period = 0;
-		runtime = 0;
+		period  = child->dl_bandwidth.dl_period;
+		runtime = child->dl_bandwidth.dl_runtime;
 
 		if (child == d->tg) {
 			period = d->rt_period;
@@ -6549,6 +6576,33 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	if (err)
 		goto unlock;
 
+	raw_spin_lock_irq(&tg->dl_bandwidth.dl_runtime_lock);
+	tg->dl_bandwidth.dl_period  = rt_period;
+	tg->dl_bandwidth.dl_runtime = rt_runtime;
+
+	if (tg == &root_task_group)
+		goto unlock_bandwidth;
+
+	for_each_possible_cpu(i) {
+		struct sched_dl_entity *dl_se = tg->dl_se[i];
+		struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
+
+		raw_spin_lock_irq(&rq->lock);
+		dl_se->dl_runtime  = rt_runtime;
+		dl_se->dl_period   = rt_period;
+		dl_se->dl_deadline = dl_se->dl_period;
+		dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+
+		if (!((s64)(rt_period - rt_runtime) >= 0) ||
+		    !(rt_runtime >= (2 << (DL_SCALE - 1)))) {
+			raw_spin_unlock_irq(&rq->lock);
+			continue;
+		}
+
+		raw_spin_unlock_irq(&rq->lock);
+	}
+unlock_bandwidth:
+	raw_spin_unlock_irq(&tg->dl_bandwidth.dl_runtime_lock);
 unlock:
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
@@ -6560,7 +6614,7 @@ static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
 
-	rt_period = 0;
+	rt_period  = tg->dl_bandwidth.dl_period;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
@@ -6572,10 +6626,10 @@ static long sched_group_rt_runtime(struct task_group *tg)
 {
 	u64 rt_runtime_us;
 
-	if (0 == RUNTIME_INF)
+	if (tg->dl_bandwidth.dl_runtime == RUNTIME_INF)
 		return -1;
 
-	rt_runtime_us = 0;
+	rt_runtime_us = tg->dl_bandwidth.dl_runtime;
 	do_div(rt_runtime_us, NSEC_PER_USEC);
 	return rt_runtime_us;
 }
@@ -6585,7 +6639,7 @@ static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
 	u64 rt_runtime, rt_period;
 
 	rt_period = rt_period_us * NSEC_PER_USEC;
-	rt_runtime = 0;
+	rt_runtime = tg->dl_bandwidth.dl_runtime;
 
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
@@ -6594,7 +6648,7 @@ static long sched_group_rt_period(struct task_group *tg)
 {
 	u64 rt_period_us;
 
-	rt_period_us = 0;
+	rt_period_us = tg->dl_bandwidth.dl_period;
 	do_div(rt_period_us, NSEC_PER_USEC);
 	return rt_period_us;
 }
@@ -6617,7 +6671,7 @@ static int sched_rt_global_constraints(void)
 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
 	/* Don't accept realtime tasks when there is no way for them to run */
-	if (rt_task(tsk))
+	if (rt_task(tsk) && tg->dl_bandwidth.dl_runtime == 0)
 		return 0;
 
 	return 1;
@@ -6785,6 +6839,12 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		return &root_task_group.css;
 	}
 
+	/* Do not allow cpu_cgroup hierarchies with depth greater than 2. */
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (parent != &root_task_group)
+		return ERR_PTR(-EINVAL);
+#endif
+
 	tg = sched_create_group(parent);
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1af6219..9a1988b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -20,8 +20,17 @@
 
 struct dl_bandwidth def_dl_bandwidth;
 
+#ifdef CONFIG_RT_GROUP_SCHED
+#define dl_entity_is_task(dl_se) (!(dl_se)->my_q)
+#define rt_rq_of_dl_entity(dl_se) ((dl_se)->my_q)
+#else
+#define dl_entity_is_task(dl_se) (1)
+#define rt_rq_of_dl_entity(dl_se) (NULL)
+#endif
+
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
 {
+	BUG_ON(!dl_entity_is_task(dl_se));
 	return container_of(dl_se, struct task_struct, dl);
 }
 
@@ -30,6 +39,14 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
 	return container_of(dl_rq, struct rq, dl);
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+	return dl_se->dl_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
 static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
 {
 	struct task_struct *p = dl_task_of(dl_se);
@@ -37,6 +54,7 @@ static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
 
 	return &rq->dl;
 }
+#endif
 
 static inline int on_dl_rq(struct sched_dl_entity *dl_se)
 {
@@ -119,7 +137,11 @@ static inline void dl_clear_overload(struct rq *rq)
 
 static void update_dl_migration(struct dl_rq *dl_rq)
 {
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
+#else
 	if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
+#endif
 		if (!dl_rq->overloaded) {
 			dl_set_overload(rq_of_dl_rq(dl_rq));
 			dl_rq->overloaded = 1;
@@ -520,11 +542,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct task_struct *p)
+int start_dl_timer(struct sched_dl_entity *dl_se)
 {
-	struct sched_dl_entity *dl_se = &p->dl;
 	struct hrtimer *timer = &dl_se->dl_timer;
-	struct rq *rq = task_rq(p);
+	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
 	ktime_t now, act;
 	s64 delta;
 
@@ -558,7 +580,11 @@ static int start_dl_timer(struct task_struct *p)
 	 * and observe our state.
 	 */
 	if (!hrtimer_is_queued(timer)) {
-		get_task_struct(p);
+		if (dl_entity_is_task(dl_se)) {
+			struct task_struct *p = dl_task_of(dl_se);
+
+			get_task_struct(p);
+		}
 		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
 	}
 
@@ -583,10 +609,43 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	struct sched_dl_entity *dl_se = container_of(timer,
 						     struct sched_dl_entity,
 						     dl_timer);
-	struct task_struct *p = dl_task_of(dl_se);
+	struct task_struct *p;
 	struct rq_flags rf;
 	struct rq *rq;
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/* Replenish dl group and check for preemption. */
+	if (!dl_entity_is_task(dl_se)) {
+		struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+		rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+		raw_spin_lock(&rq->lock);
+
+
+		sched_clock_tick();
+		update_rq_clock(rq);
+
+		dl_se->dl_throttled = 0;
+		if (rt_rq->rt_nr_running) {
+			enqueue_dl_entity(dl_se, dl_se, ENQUEUE_REPLENISH);
+
+			resched_curr(rq);
+#ifdef CONFIG_SMP
+			if (has_pushable_dl_tasks(rq))
+				push_dl_task(rq);
+#endif
+		} else {
+			replenish_dl_entity(dl_se, dl_se);
+		}
+
+		raw_spin_unlock(&rq->lock);
+
+		return HRTIMER_NORESTART;
+	}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+	p  = dl_task_of(dl_se);
 	rq = task_rq_lock(p, &rf);
 
 	/*
@@ -720,13 +779,12 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
 	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
-		if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(&p->dl)))
 			return;
 		dl_se->dl_throttled = 1;
 	}
 }
 
-static
 int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 {
 	return (dl_se->runtime <= 0);
@@ -780,7 +838,7 @@ static void update_curr_dl(struct rq *rq)
 	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(&curr->dl)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -833,29 +891,39 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
 static inline
 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
 	u64 deadline = dl_se->deadline;
 
-	WARN_ON(!dl_prio(prio));
-	dl_rq->dl_nr_running++;
-	add_nr_running(rq_of_dl_rq(dl_rq), 1);
+	if (dl_entity_is_task(dl_se)) {
+		dl_rq->dl_nr_running++;
+		add_nr_running(rq_of_dl_rq(dl_rq), 1);
+		inc_dl_migration(dl_se, dl_rq);
+	} else {
+		struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+		add_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+	}
 
 	inc_dl_deadline(dl_rq, deadline);
-	inc_dl_migration(dl_se, dl_rq);
 }
 
 static inline
 void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
-	int prio = dl_task_of(dl_se)->prio;
+#ifdef CONFIG_RT_GROUP_SCHED
+	WARN_ON(!dl_rq->dl_nr_total);
+#endif
 
-	WARN_ON(!dl_prio(prio));
-	WARN_ON(!dl_rq->dl_nr_running);
-	dl_rq->dl_nr_running--;
-	sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+	if (dl_entity_is_task(dl_se)) {
+		dl_rq->dl_nr_running--;
+		sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+		dec_dl_migration(dl_se, dl_rq);
+	} else {
+		struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+		sub_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+	}
 
 	dec_dl_deadline(dl_rq, dl_se->deadline);
-	dec_dl_migration(dl_se, dl_rq);
 }
 
 static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
@@ -884,7 +952,9 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
 
 	rb_link_node(&dl_se->rb_node, parent, link);
 	rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
-
+#ifdef CONFIG_RT_GROUP_SCHED
+	dl_rq->dl_nr_total++;
+#endif
 	inc_dl_tasks(dl_se, dl_rq);
 }
 
@@ -906,9 +976,12 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
 	RB_CLEAR_NODE(&dl_se->rb_node);
 
 	dec_dl_tasks(dl_se, dl_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+	dl_rq->dl_nr_total--;
+#endif
 }
 
-static void
+void
 enqueue_dl_entity(struct sched_dl_entity *dl_se,
 		  struct sched_dl_entity *pi_se, int flags)
 {
@@ -927,7 +1000,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
 	__enqueue_dl_entity(dl_se);
 }
 
-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+void dequeue_dl_entity(struct sched_dl_entity *dl_se)
 {
 	__dequeue_dl_entity(dl_se);
 }
@@ -1120,12 +1193,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
-	hrtick_start(rq, p->dl.runtime);
+	hrtick_start(rq, dl_se->runtime);
 }
 #else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
 {
 }
 #endif
@@ -1176,14 +1249,35 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	if (prev->sched_class == &dl_sched_class)
 		update_curr_dl(rq);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (unlikely(!dl_rq->dl_nr_total))
+		return NULL;
+#else
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
-
-	put_prev_task(rq, prev);
+#endif
 
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
 
+	put_prev_task(rq, prev);
+
+	if (!dl_entity_is_task(dl_se)) {
+		struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+		struct sched_rt_entity *rt_se;
+
+		rt_se = pick_next_rt_entity(rq, rt_rq);
+		p = container_of(rt_se, struct task_struct, rt);
+		p->se.exec_start = rq_clock_task(rq);
+
+		dequeue_pushable_task(rt_rq_of_se(&p->rt), p);
+
+		if (hrtick_enabled(rq))
+			start_hrtick_dl(rq, dl_se);
+
+		return p;
+	}
+
 	p = dl_task_of(dl_se);
 	p->se.exec_start = rq_clock_task(rq);
 
@@ -1191,7 +1285,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        dequeue_pushable_dl_task(rq, p);
 
 	if (hrtick_enabled(rq))
-		start_hrtick_dl(rq, p);
+		start_hrtick_dl(rq, &p->dl);
 
 	queue_push_tasks(rq);
 
@@ -1217,7 +1311,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 	 */
 	if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
 	    is_leftmost(p, &rq->dl))
-		start_hrtick_dl(rq, p);
+		start_hrtick_dl(rq, &p->dl);
 }
 
 static void task_fork_dl(struct task_struct *p)
@@ -1632,14 +1726,21 @@ static void pull_dl_task(struct rq *this_rq)
  */
 static void task_woken_dl(struct rq *rq, struct task_struct *p)
 {
-	if (!task_running(rq, p) &&
-	    !test_tsk_need_resched(rq->curr) &&
-	    p->nr_cpus_allowed > 1 &&
-	    dl_task(rq->curr) &&
+	if (task_running(rq, p) ||
+	    test_tsk_need_resched(rq->curr) ||
+	    p->nr_cpus_allowed <= 1)
+		return;
+
+	if (dl_task(rq->curr) &&
 	    (rq->curr->nr_cpus_allowed < 2 ||
 	     !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
 		push_dl_tasks(rq);
 	}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (rt_task(rq->curr) && is_dl_group(rq->curr->rt.rt_rq))
+		push_dl_tasks(rq);
+#endif
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
@@ -1715,7 +1816,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
 	 * task.
 	 */
-	if (!start_dl_timer(p))
+	if (!start_dl_timer(&p->dl))
 		__dl_clear_params(p);
 
 	/*
@@ -1723,10 +1824,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	 * this is the right place to try to pull some other one
 	 * from an overloaded cpu, if any.
 	 */
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (!rq->dl.dl_nr_total)
+		queue_pull_task(rq);
+#else
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
 	queue_pull_task(rq);
+#endif
 }
 
 /*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e72ccb8..f38bd4b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -46,49 +46,37 @@ void init_rt_rq(struct rt_rq *rt_rq)
 
 #ifdef CONFIG_RT_GROUP_SCHED
 
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_SCHED_DEBUG
-	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
-	return container_of(rt_se, struct task_struct, rt);
-}
-
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
 	return rt_rq->rq;
 }
 
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-	return rt_se->rt_rq;
-}
-
-static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
-{
-	struct rt_rq *rt_rq = rt_se->rt_rq;
-
-	return rt_rq->rq;
-}
-
 void free_rt_sched_group(struct task_group *tg)
 {
+	unsigned long flags;
 	int i;
 
 	for_each_possible_cpu(i) {
 		if (tg->rt_rq)
 			kfree(tg->rt_rq[i]);
-		if (tg->rt_se)
-			kfree(tg->rt_se[i]);
+		if (tg->dl_se) {
+			raw_spin_lock_irqsave(&cpu_rq(i)->lock, flags);
+			if (!tg->dl_se[i]->dl_throttled)
+				dequeue_dl_entity(tg->dl_se[i]);
+			raw_spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
+
+			hrtimer_cancel(&tg->dl_se[i]->dl_timer);
+			kfree(tg->dl_se[i]);
+		}
 	}
 
 	kfree(tg->rt_rq);
-	kfree(tg->rt_se);
+	kfree(tg->dl_se);
 }
 
 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-		struct sched_rt_entity *rt_se, int cpu,
-		struct sched_rt_entity *parent)
+		struct sched_dl_entity *dl_se, int cpu,
+		struct sched_dl_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);
 
@@ -97,47 +85,56 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 	rt_rq->tg = tg;
 
 	tg->rt_rq[cpu] = rt_rq;
-	tg->rt_se[cpu] = rt_se;
+	tg->dl_se[cpu] = dl_se;
 
-	if (!rt_se)
+	if (!dl_se)
 		return;
 
-	if (!parent)
-		rt_se->rt_rq = &rq->rt;
-	else
-		rt_se->rt_rq = parent->my_q;
-
-	rt_se->my_q = rt_rq;
-	rt_se->parent = parent;
-	INIT_LIST_HEAD(&rt_se->run_list);
+	dl_se->dl_rq = &rq->dl;
+	dl_se->my_q = rt_rq;
+	RB_CLEAR_NODE(&dl_se->rb_node);
 }
 
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se;
+	struct sched_dl_entity *dl_se;
 	int i;
 
-	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
 	if (!tg->rt_rq)
 		goto err;
-	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
-	if (!tg->rt_se)
+	tg->dl_se = kcalloc(nr_cpu_ids, sizeof(dl_se), GFP_KERNEL);
+	if (!tg->dl_se)
 		goto err;
 
+	init_dl_bandwidth(&tg->dl_bandwidth,
+			def_dl_bandwidth.dl_period, 0);
+
 	for_each_possible_cpu(i) {
 		rt_rq = kzalloc_node(sizeof(struct rt_rq),
 				     GFP_KERNEL, cpu_to_node(i));
 		if (!rt_rq)
 			goto err;
 
-		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+		dl_se = kzalloc_node(sizeof(struct sched_dl_entity),
 				     GFP_KERNEL, cpu_to_node(i));
-		if (!rt_se)
+		if (!dl_se)
 			goto err_free_rq;
 
 		init_rt_rq(rt_rq);
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+		rt_rq->rq = cpu_rq(i);
+
+		init_dl_task_timer(dl_se);
+
+		dl_se->dl_runtime = tg->dl_bandwidth.dl_runtime;
+		dl_se->dl_period = tg->dl_bandwidth.dl_period;
+		dl_se->dl_deadline = dl_se->dl_period;
+		dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+
+		dl_se->dl_throttled = 0;
+
+		init_tg_rt_entry(tg, rt_rq, dl_se, i, parent->dl_se[i]);
 	}
 
 	return 1;
@@ -150,30 +147,11 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #else /* CONFIG_RT_GROUP_SCHED */
 
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-	return container_of(rt_se, struct task_struct, rt);
-}
-
 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {
 	return container_of(rt_rq, struct rq, rt);
 }
 
-static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
-{
-	struct task_struct *p = rt_task_of(rt_se);
-
-	return task_rq(p);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-	struct rq *rq = rq_of_rt_se(rt_se);
-
-	return &rq->rt;
-}
-
 void free_rt_sched_group(struct task_group *tg) { }
 
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
@@ -228,7 +206,7 @@ static inline void rt_clear_overload(struct rq *rq)
 
 static void update_rt_migration(struct rt_rq *rt_rq)
 {
-	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
+	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_running > 1) {
 		if (!rt_rq->overloaded) {
 			rt_set_overload(rq_of_rt_rq(rt_rq));
 			rt_rq->overloaded = 1;
@@ -243,8 +221,6 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	struct task_struct *p;
 
-	return;
-
 	p = rt_task_of(rt_se);
 
 	if (p->nr_cpus_allowed > 1)
@@ -258,18 +234,16 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	struct task_struct *p;
 
 	p = rt_task_of(rt_se);
-	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
-	rt_rq->rt_nr_total--;
 	if (p->nr_cpus_allowed > 1)
 		rt_rq->rt_nr_migratory--;
 
 	update_rt_migration(rt_rq);
 }
 
-static inline int has_pushable_tasks(struct rq *rq)
+static inline int has_pushable_tasks(struct rt_rq *rt_rq)
 {
-	return !plist_head_empty(&rq->rt.pushable_tasks);
+	return !plist_head_empty(&rt_rq->pushable_tasks);
 }
 
 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
@@ -280,7 +254,7 @@ static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
-	if (!has_pushable_tasks(rq))
+	if (!has_pushable_tasks(&rq->rt))
 		return;
 
 	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
@@ -291,37 +265,35 @@ static inline void queue_pull_task(struct rq *rq)
 	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
-static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+static void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
 {
-	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+	plist_del(&p->pushable_tasks, &rt_rq->pushable_tasks);
 	plist_node_init(&p->pushable_tasks, p->prio);
-	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+	plist_add(&p->pushable_tasks, &rt_rq->pushable_tasks);
 
 	/* Update the highest prio pushable task */
-	if (p->prio < rq->rt.highest_prio.next)
-		rq->rt.highest_prio.next = p->prio;
+	if (p->prio < rt_rq->highest_prio.next)
+		rt_rq->highest_prio.next = p->prio;
 }
 
-static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+#ifdef CONFIG_RT_GROUP_SCHED
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
 {
-	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+	plist_del(&p->pushable_tasks, &rt_rq->pushable_tasks);
 
 	/* Update the new highest prio pushable task */
-	if (has_pushable_tasks(rq)) {
-		p = plist_first_entry(&rq->rt.pushable_tasks,
+	if (has_pushable_tasks(rt_rq)) {
+		p = plist_first_entry(&rt_rq->pushable_tasks,
 				      struct task_struct, pushable_tasks);
-		rq->rt.highest_prio.next = p->prio;
+		rt_rq->highest_prio.next = p->prio;
 	} else
-		rq->rt.highest_prio.next = MAX_RT_PRIO;
+		rt_rq->highest_prio.next = MAX_RT_PRIO;
 }
-
+#endif
 #else
 
-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+static inline
+void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
 {
 }
 
@@ -347,33 +319,17 @@ static inline void pull_rt_task(struct rq *this_rq)
 static inline void queue_push_tasks(struct rq *rq)
 {
 }
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
-	return rt_se->on_rq;
+	return !list_empty(&rt_se->run_list);
 }
 
-#ifdef CONFIG_RT_GROUP_SCHED
-
-#define for_each_sched_rt_entity(rt_se) \
-	for (; rt_se; rt_se = rt_se->parent)
-
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
-	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
-	for (; rt_se; rt_se = NULL)
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 {
 	return rt_task_of(rt_se)->prio;
@@ -386,6 +342,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
+	struct rt_rq *rt_rq = rt_rq_of_se(&curr->rt);
 	u64 delta_exec;
 
 	if (curr->sched_class != &rt_sched_class)
@@ -408,6 +365,34 @@ static void update_curr_rt(struct rq *rq)
 	cpuacct_charge(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
+
+	if (!dl_bandwidth_enabled())
+		return;
+
+	if (is_dl_group(rt_rq)) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		if (dl_se->dl_throttled) {
+			resched_curr(rq);
+			return;
+		}
+
+		BUG_ON(rt_rq->rt_nr_running > rq->nr_running);
+		dl_se->runtime -= delta_exec;
+
+		/* A group exhausts the budget. */
+		if (dl_runtime_exceeded(dl_se)) {
+			dequeue_dl_entity(dl_se);
+
+			if (likely(start_dl_timer(dl_se)))
+				dl_se->dl_throttled = 1;
+			else
+				enqueue_dl_entity(dl_se, dl_se,
+						  ENQUEUE_REPLENISH);
+
+			resched_curr(rq);
+		}
+	}
 }
 
 #if defined CONFIG_SMP
@@ -417,7 +402,7 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (&rq->rt != rt_rq)
+	if (is_dl_group(rt_rq))
 		return;
 
 	if (rq->online && prio < prev_prio)
@@ -429,7 +414,7 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (&rq->rt != rt_rq)
+	if (is_dl_group(rt_rq))
 		return;
 
 	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
@@ -445,12 +430,15 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
 
 #endif /* CONFIG_SMP */
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined(CONFIG_SMP)
 static void
 inc_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
 
+	if (is_dl_group(rt_rq))
+		return;
+
 	if (prio < prev_prio)
 		rt_rq->highest_prio.curr = prio;
 
@@ -462,6 +450,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 {
 	int prev_prio = rt_rq->highest_prio.curr;
 
+	if (is_dl_group(rt_rq))
+		return;
+
 	if (rt_rq->rt_nr_running) {
 
 		WARN_ON(prio < prev_prio);
@@ -488,7 +479,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
 
-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
+#endif /* CONFIG_SMP && !CONFIG_RT_GROUP_SCHED */
 
 static inline
 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
@@ -516,6 +507,16 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
 
 	inc_rt_prio(rt_rq, prio);
+
+	if (is_dl_group(rt_rq)) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		if (!dl_se->dl_throttled)
+			add_nr_running(rq_of_rt_rq(rt_rq), 1);
+	} else {
+		add_nr_running(rq_of_rt_rq(rt_rq), 1);
+	}
+
 	inc_rt_migration(rt_se, rt_rq);
 }
 
@@ -523,11 +524,18 @@ static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
 	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
 
 	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+	if (is_dl_group(rt_rq)) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		if (!dl_se->dl_throttled)
+			sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+	} else {
+		sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+	}
 	dec_rt_migration(rt_se, rt_rq);
 }
 
@@ -596,24 +604,49 @@ static void
 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
 
-	enqueue_rt_entity(rt_se, flags);
+	/* Task arriving in an idle group of tasks. */
+	if (is_dl_group(rt_rq) && (rt_rq->rt_nr_running == 0)) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+		if (!dl_se->dl_throttled) {
+			enqueue_dl_entity(dl_se, dl_se, flags);
+			resched_curr(rq);
+		}
+	}
+
+	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
-		enqueue_pushable_task(rq, p);
+		enqueue_pushable_task(rt_rq, p);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se, flags);
 
-	dequeue_pushable_task(rq, p);
+	dequeue_pushable_task(rt_rq_of_se(rt_se), p);
+
+	/* Last task of the task group. */
+	if (is_dl_group(rt_rq) && !rt_rq->rt_nr_running) {
+		struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+#ifndef CONFIG_RT_GROUP_SCHED
+		queue_pull_task(rq);
+#endif
+		if (!rt_rq->rt_nr_running) {
+			dequeue_dl_entity(dl_se);
+			resched_curr(rq);
+		}
+	}
 }
 
 /*
@@ -639,10 +672,8 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 	struct sched_rt_entity *rt_se = &p->rt;
 	struct rt_rq *rt_rq;
 
-	for_each_sched_rt_entity(rt_se) {
-		rt_rq = rt_rq_of_se(rt_se);
-		requeue_rt_entity(rt_rq, rt_se, head);
-	}
+	rt_rq = rt_rq_of_se(rt_se);
+	requeue_rt_entity(rt_rq, rt_se, head);
 }
 
 static void yield_task_rt(struct rq *rq)
@@ -743,6 +774,30 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (is_dl_group(rt_rq_of_se(&p->rt)) &&
+	    is_dl_group(rt_rq_of_se(&rq->curr->rt))) {
+		struct sched_dl_entity *dl_se, *curr_dl_se;
+
+		dl_se = dl_group_of(rt_rq_of_se(&p->rt));
+		curr_dl_se = dl_group_of(rt_rq_of_se(&rq->curr->rt));
+
+		if (dl_entity_preempt(dl_se, curr_dl_se)) {
+			resched_curr(rq);
+			return;
+		} else if (!dl_entity_preempt(curr_dl_se, dl_se)) {
+			if (p->prio < rq->curr->prio) {
+				resched_curr(rq);
+				return;
+			}
+		}
+		return;
+	} else if (is_dl_group(rt_rq_of_se(&p->rt))) {
+		resched_curr(rq);
+		return;
+	} else if (is_dl_group(rt_rq_of_se(&rq->curr->rt))) {
+		return;
+	}
+
 	if (p->prio < rq->curr->prio) {
 		resched_curr(rq);
 		return;
@@ -766,7 +821,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
 #endif
 }
 
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 						   struct rt_rq *rt_rq)
 {
 	struct rt_prio_array *array = &rt_rq->active;
@@ -831,7 +886,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	if (prev->sched_class == &rt_sched_class)
 		update_curr_rt(rq);
 
-	if (!rt_rq->rt_queued)
+	if (!rt_rq->rt_nr_running)
 		return NULL;
 
 	put_prev_task(rq, prev);
@@ -839,7 +894,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	p = _pick_next_task_rt(rq);
 
 	/* The running task is never eligible for pushing */
-	dequeue_pushable_task(rq, p);
+	dequeue_pushable_task(rt_rq, p);
 
 	queue_push_tasks(rq);
 
@@ -848,6 +903,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
+	struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
 	update_curr_rt(rq);
 
 	/*
@@ -855,7 +912,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 	 * if it is still active
 	 */
 	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
-		enqueue_pushable_task(rq, p);
+		enqueue_pushable_task(rt_rq, p);
+
 }
 
 #ifdef CONFIG_SMP
@@ -863,9 +921,9 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+static int pick_rt_task(struct rt_rq *rt_rq, struct task_struct *p, int cpu)
 {
-	if (!task_running(rq, p) &&
+	if (!task_running(rq_of_rt_rq(rt_rq), p) &&
 	    cpumask_test_cpu(cpu, &p->cpus_allowed))
 		return 1;
 	return 0;
@@ -875,16 +933,17 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
  * Return the highest pushable rq's task, which is suitable to be executed
  * on the cpu, NULL otherwise
  */
-static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
+static
+struct task_struct *pick_highest_pushable_task(struct rt_rq *rt_rq, int cpu)
 {
-	struct plist_head *head = &rq->rt.pushable_tasks;
+	struct plist_head *head = &rt_rq->pushable_tasks;
 	struct task_struct *p;
 
-	if (!has_pushable_tasks(rq))
+	if (!has_pushable_tasks(rt_rq))
 		return NULL;
 
 	plist_for_each_entry(p, head, pushable_tasks) {
-		if (pick_rt_task(rq, p, cpu))
+		if (pick_rt_task(rt_rq, p, cpu))
 			return p;
 	}
 
@@ -1024,14 +1083,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 	return lowest_rq;
 }
 
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
+static struct task_struct *pick_next_pushable_task(struct rt_rq *rt_rq)
 {
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 	struct task_struct *p;
 
-	if (!has_pushable_tasks(rq))
+	if (!has_pushable_tasks(rt_rq))
 		return NULL;
 
-	p = plist_first_entry(&rq->rt.pushable_tasks,
+	p = plist_first_entry(&rt_rq->pushable_tasks,
 			      struct task_struct, pushable_tasks);
 
 	BUG_ON(rq->cpu != task_cpu(p));
@@ -1058,7 +1118,7 @@ static int push_rt_task(struct rq *rq)
 	if (!rq->rt.overloaded)
 		return 0;
 
-	next_task = pick_next_pushable_task(rq);
+	next_task = pick_next_pushable_task(&rq->rt);
 	if (!next_task)
 		return 0;
 
@@ -1093,7 +1153,7 @@ static int push_rt_task(struct rq *rq)
 		 * run-queue and is also still the next task eligible for
 		 * pushing.
 		 */
-		task = pick_next_pushable_task(rq);
+		task = pick_next_pushable_task(&rq->rt);
 		if (task_cpu(next_task) == rq->cpu && task == next_task) {
 			/*
 			 * The task hasn't migrated, and is still the next
@@ -1331,7 +1391,7 @@ static void try_to_push_tasks(void *arg)
 	src_rq = rq_of_rt_rq(rt_rq);
 
 again:
-	if (has_pushable_tasks(rq)) {
+	if (has_pushable_tasks(&rq->rt)) {
 		raw_spin_lock(&rq->lock);
 		push_rt_task(rq);
 		raw_spin_unlock(&rq->lock);
@@ -1382,6 +1442,7 @@ static void pull_rt_task(struct rq *this_rq)
 	int this_cpu = this_rq->cpu, cpu;
 	bool resched = false;
 	struct task_struct *p;
+	struct rt_rq *src_rt_rq;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
@@ -1405,6 +1466,7 @@ static void pull_rt_task(struct rq *this_rq)
 			continue;
 
 		src_rq = cpu_rq(cpu);
+		src_rt_rq = &src_rq->rt;
 
 		/*
 		 * Don't bother taking the src_rq->lock if the next highest
@@ -1413,7 +1475,7 @@ static void pull_rt_task(struct rq *this_rq)
 		 * logically higher, the src_rq will push this task away.
 		 * And if its going logically lower, we do not care
 		 */
-		if (src_rq->rt.highest_prio.next >=
+		if (src_rt_rq->highest_prio.next >=
 		    this_rq->rt.highest_prio.curr)
 			continue;
 
@@ -1428,7 +1490,7 @@ static void pull_rt_task(struct rq *this_rq)
 		 * We can pull only a task, which is pushable
 		 * on its rq, and no others.
 		 */
-		p = pick_highest_pushable_task(src_rq, this_cpu);
+		p = pick_highest_pushable_task(src_rt_rq, this_cpu);
 
 		/*
 		 * Do we have an RT task that preempts
@@ -1469,19 +1531,44 @@ static void pull_rt_task(struct rq *this_rq)
 		resched_curr(this_rq);
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+int group_push_rt_task(struct rt_rq *rt_rq)
+{
+	struct rq *rq = rq_of_rt_rq(rt_rq);
+
+	if (is_dl_group(rt_rq))
+		return 0;
+
+	return push_rt_task(rq);
+}
+
+void group_push_rt_tasks(struct rt_rq *rt_rq)
+{
+	while (group_push_rt_task(rt_rq))
+		;
+}
+#else
+void group_push_rt_tasks(struct rt_rq *rt_rq)
+{
+	push_rt_tasks(rq_of_rt_rq(rt_rq));
+}
+#endif
+
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
  */
 static void task_woken_rt(struct rq *rq, struct task_struct *p)
 {
+	struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
 	    p->nr_cpus_allowed > 1 &&
 	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
 	    (rq->curr->nr_cpus_allowed < 2 ||
 	     rq->curr->prio <= p->prio))
-		push_rt_tasks(rq);
+		group_push_rt_tasks(rt_rq);
 }
 
 /* Assumes rq->lock is held */
@@ -1508,6 +1595,8 @@ static void rq_offline_rt(struct rq *rq)
  */
 static void switched_from_rt(struct rq *rq, struct task_struct *p)
 {
+	struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
 	/*
 	 * If there are other RT tasks then we will reschedule
 	 * and the scheduling of the other RT tasks will handle
@@ -1515,10 +1604,12 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued(p) || rt_rq->rt_nr_running)
 		return;
 
+#ifndef CONFIG_RT_GROUP_SCHED
 	queue_pull_task(rq);
+#endif
 }
 
 void __init init_sched_rt_class(void)
@@ -1548,8 +1639,16 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
+#ifndef CONFIG_RT_GROUP_SCHED
 		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
 			queue_push_tasks(rq);
+#else
+		if (rt_rq_of_se(&p->rt)->overloaded) {
+		} else {
+			if (p->prio < rq->curr->prio)
+				resched_curr(rq);
+		}
+#endif
 #endif /* CONFIG_SMP */
 		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
@@ -1563,6 +1662,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 static void
 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
+#ifdef CONFIG_SMP
+	struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+#endif
+
 	if (!task_on_rq_queued(p))
 		return;
 
@@ -1573,13 +1676,14 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
+#ifndef CONFIG_RT_GROUP_SCHED
 			queue_pull_task(rq);
-
+#endif
 		/*
 		 * If there's a higher priority task waiting to run
 		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr)
+		if (p->prio > rt_rq->highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */
@@ -1628,6 +1732,14 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	struct sched_rt_entity *rt_se = &p->rt;
 
 	update_curr_rt(rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (is_dl_group(&rq->rt)) {
+		struct sched_dl_entity *dl_se = dl_group_of(&rq->rt);
+
+		if (hrtick_enabled(rq) && queued && dl_se->runtime > 0)
+			start_hrtick_dl(rq, dl_se);
+	}
+#endif
 
 	watchdog(rq, p);
 
@@ -1647,23 +1759,22 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	 * Requeue to the end of queue if we (and all of our ancestors) are not
 	 * the only element on the queue
 	 */
-	for_each_sched_rt_entity(rt_se) {
-		if (rt_se->run_list.prev != rt_se->run_list.next) {
-			requeue_task_rt(rq, p, 0);
-			resched_curr(rq);
-			return;
-		}
+	if (rt_se->run_list.prev != rt_se->run_list.next) {
+		requeue_task_rt(rq, p, 0);
+		set_tsk_need_resched(p);
+		return;
 	}
 }
 
 static void set_curr_task_rt(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
+	struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
 
 	p->se.exec_start = rq_clock_task(rq);
 
 	/* The running task is never eligible for pushing */
-	dequeue_pushable_task(rq, p);
+	dequeue_pushable_task(rt_rq, p);
 }
 
 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4c4a18..528b41c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  */
 #define DL_SCALE (10)
 
+unsigned long to_ratio(u64 period, u64 runtime);
+
 /*
  * These are the 'tuning knobs' of the scheduler:
  */
@@ -228,12 +230,6 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
 	dl_b->total_bw += tsk_bw;
 }
 
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
-	return dl_b->bw != -1 &&
-	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
 
 extern void init_dl_bw(struct dl_bw *dl_b);
 
@@ -286,9 +282,14 @@ struct task_group {
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
-	struct sched_rt_entity **rt_se;
+	/*
+	 * The scheduling entities of the task group are per-CPU sched_dl_entity
+	 * instances, all sharing the same dl_bandwidth.
+	 */
+	struct sched_dl_entity **dl_se;
 	struct rt_rq **rt_rq;
 
+	struct dl_bandwidth dl_bandwidth;
 #endif
 
 	struct rcu_head rcu;
@@ -354,8 +355,8 @@ extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 extern void free_rt_sched_group(struct task_group *tg);
 extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-		struct sched_rt_entity *rt_se, int cpu,
-		struct sched_rt_entity *parent);
+		struct sched_dl_entity *rt_se, int cpu,
+		struct sched_dl_entity *parent);
 
 extern struct task_group *sched_create_group(struct task_group *parent);
 extern void sched_online_group(struct task_group *tg,
@@ -383,6 +384,21 @@ struct cfs_bandwidth { };
 
 #endif	/* CONFIG_CGROUP_SCHED */
 
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+	u64 dl_groups_root = 0;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	dl_groups_root = to_ratio(root_task_group.dl_bandwidth.dl_period,
+				  root_task_group.dl_bandwidth.dl_runtime);
+#endif
+	return dl_b->bw != -1 &&
+	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw
+					+ dl_groups_root * cpus;
+}
+
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
@@ -484,7 +500,6 @@ struct rt_rq {
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
-	unsigned long rt_nr_total;
 	int overloaded;
 	struct plist_head pushable_tasks;
 #ifdef HAVE_RT_PUSH_IPI
@@ -494,14 +509,11 @@ struct rt_rq {
 	raw_spinlock_t push_lock;
 #endif
 #endif /* CONFIG_SMP */
-	int rt_queued;
 
 #ifdef CONFIG_RT_GROUP_SCHED
-	unsigned long rt_nr_boosted;
-
-	struct rq *rq;
 	struct task_group *tg;
 #endif
+	struct rq *rq;
 };
 
 /* Deadline class' related fields in a runqueue */
@@ -512,6 +524,12 @@ struct dl_rq {
 
 	unsigned long dl_nr_running;
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	unsigned long dl_nr_total;
+	struct rt_rq *rq_to_push_from;
+	struct rt_rq *rq_to_pull_to;
+#endif
+
 #ifdef CONFIG_SMP
 	/*
 	 * Deadline values of the currently executing and the
@@ -1106,7 +1124,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	p->rt.rt_rq  = tg->rt_rq[cpu];
-	p->rt.parent = tg->rt_se[cpu];
+	p->rt.parent = tg->dl_se[cpu];
+	p->dl.dl_rq  = &cpu_rq(cpu)->dl;
 #endif
 }
 
@@ -1461,8 +1480,6 @@ extern struct dl_bandwidth def_dl_bandwidth;
 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 
-unsigned long to_ratio(u64 period, u64 runtime);
-
 extern void init_entity_runnable_average(struct sched_entity *se);
 extern void post_init_entity_util_avg(struct sched_entity *se);
 
@@ -1513,6 +1530,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
 
 static inline void sub_nr_running(struct rq *rq, unsigned count)
 {
+	BUG_ON(rq->nr_running < count);
 	rq->nr_running -= count;
 	/* Check if we still need preemption */
 	sched_update_tick_dependency(rq);
@@ -1864,6 +1882,71 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline int is_dl_group(struct rt_rq *rt_rq)
+{
+	return rt_rq->tg != &root_task_group;
+}
+
+/*
+ * Return the scheduling entity of this group of tasks.
+ */
+static inline struct sched_dl_entity *dl_group_of(struct rt_rq *rt_rq)
+{
+	BUG_ON(!is_dl_group(rt_rq));
+
+	return rt_rq->tg->dl_se[cpu_of(rt_rq->rq)];
+}
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+	return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *rt_rq = rt_se->rt_rq;
+
+	return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	return rt_se->rt_rq;
+}
+#else
+static inline int is_dl_group(struct rt_rq *rt_rq)
+{
+	return 0;
+}
+
+static inline struct sched_dl_entity *dl_group_of(struct rt_rq *rt_rq)
+{
+	return NULL;
+}
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+	return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+	struct task_struct *p = rt_task_of(rt_se);
+
+	return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	struct rq *rq = rq_of_rt_se(rt_se);
+
+	return &rq->rt;
+}
+#endif
+
+
+
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
@@ -1986,3 +2069,28 @@ static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
 static inline void arch_task_migrate(struct task_struct *p) { }
 #endif
 
+int group_pull_rt_task(struct rt_rq *rt_rq);
+int group_push_rt_task(struct rt_rq *rt_rq);
+
+struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq);
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_SMP)
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p);
+#else
+static inline
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
+{
+}
+#endif
+
+int is_dl_group(struct rt_rq *rt_rq);
+
+void dequeue_dl_entity(struct sched_dl_entity *dl_se);
+
+void init_dl_timer(struct sched_dl_entity *dl_se);
+
+void enqueue_dl_entity(struct sched_dl_entity *dl_se,
+			      struct sched_dl_entity *pi_se, int flags);
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se);
+int start_dl_timer(struct sched_dl_entity *dl_se);
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se);
+
-- 
2.7.4
