At the start of a new period we must refresh the global bandwidth pool as well as unthrottle any cfs_rq entities who previously ran out of bandwidth (as quota permits). Unthrottled entities have the cfs_rq->throttled flag cleared and are re-enqueued into the entity hierarchy. Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto --- kernel/sched.c | 3 + kernel/sched_fair.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 121 insertions(+), 7 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -9002,6 +9002,9 @@ static int tg_set_cfs_bandwidth(struct t raw_spin_lock_irq(&rq->lock); cfs_rq->runtime_enabled = quota != RUNTIME_INF; cfs_rq->runtime_remaining = 0; + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1448,26 +1448,137 @@ static void throttle_cfs_rq(struct cfs_r raw_spin_unlock(&cfs_b->lock); } +static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + cfs_rq->throttled = 0; + raw_spin_lock(&cfs_b->lock); + list_del_rcu(&cfs_rq->throttled_list); + raw_spin_unlock(&cfs_b->lock); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + if (!se) + rq->nr_running += task_delta; + + /* 
determine whether we need to wake up potentially idle cpu */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_task(rq->curr); +} + +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, + u64 remaining, u64 expires) +{ + struct cfs_rq *cfs_rq; + u64 runtime = remaining; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock(&rq->lock); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > remaining) + runtime = remaining; + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; + cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) + unthrottle_cfs_rq(cfs_rq); + +next: + raw_spin_unlock(&rq->lock); + + if (!remaining) + break; + } + rcu_read_unlock(); + + return remaining; +} + static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - int idle = 1; + int idle = 1, throttled = 0; + u64 runtime, runtime_expires; + raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF) { - idle = cfs_b->idle; - /* If we're going idle then defer handle the refill */ + /* idle depends on !throttled in the case of a large deficit */ + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + idle = cfs_b->idle && !throttled; + + /* If we're going idle then defer the refill */ if (!idle) __refill_cfs_bandwidth_runtime(cfs_b); + if (throttled) { + runtime = cfs_b->runtime; + runtime_expires = cfs_b->runtime_expires; + + /* we must first distribute to throttled entities */ + cfs_b->runtime = 0; + } /* - * mark this bandwidth pool as idle so that we may deactivate - * the timer at the next expiration if there is no usage. + * conditionally mark this bandwidth pool as idle so that we may + * deactivate the timer at the next expiration if there is no + * usage. 
*/ - cfs_b->idle = 1; + cfs_b->idle = !throttled; } - if (idle) + if (idle) { cfs_b->timer_active = 0; + goto out_unlock; + } + raw_spin_unlock(&cfs_b->lock); + +retry: + runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); + + raw_spin_lock(&cfs_b->lock); + /* new bandwidth specification may exist */ + if (unlikely(runtime_expires != cfs_b->runtime_expires)) + goto out_unlock; + /* ensure no-one was throttled while we were unthrottling */ + if (unlikely(!list_empty(&cfs_b->throttled_cfs_rq)) && runtime > 0) { + raw_spin_unlock(&cfs_b->lock); + goto retry; + } + + /* return remaining runtime */ + cfs_b->runtime = runtime; +out_unlock: raw_spin_unlock(&cfs_b->lock); return idle; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/