At the start of a new period we must refresh the global bandwidth pool as
well as unthrottle any cfs_rq entities that previously ran out of bandwidth
(as quota permits).

Unthrottled entities have the cfs_rq->throttled flag cleared and are
re-enqueued into the cfs entity hierarchy.

Signed-off-by: Paul Turner
Signed-off-by: Nikhil Rao
Signed-off-by: Bharata B Rao
---
 kernel/sched.c      |    3 +
 kernel/sched_fair.c |  105 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 1 deletion(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -9294,6 +9294,9 @@ static int tg_set_cfs_bandwidth(struct t
 		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
 		cfs_rq->runtime_remaining = 0;
 		cfs_rq->runtime_expires = runtime_expires;
+
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
 	}
 out_unlock:
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1456,10 +1456,88 @@ static void check_enqueue_throttle(struc
 		throttle_cfs_rq(cfs_rq);
 }
 
+static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	int enqueue = 1;
+	long task_delta;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	cfs_rq->throttled = 0;
+	raw_spin_lock(&cfs_b->lock);
+	list_del_rcu(&cfs_rq->throttled_list);
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!cfs_rq->load.weight)
+		return;
+
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			enqueue = 0;
+
+		cfs_rq = cfs_rq_of(se);
+		if (enqueue)
+			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		cfs_rq->h_nr_running += task_delta;
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+
+	if (!se)
+		rq->nr_running += task_delta;
+
+	/* determine whether we need to wake up potentially idle cpu */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_task(rq->curr);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
+		u64 remaining, u64 expires)
+{
+	struct cfs_rq *cfs_rq;
+	u64 runtime = remaining;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+				throttled_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock(&rq->lock);
+		if (!cfs_rq_throttled(cfs_rq))
+			goto next;
+
+		runtime = -cfs_rq->runtime_remaining + 1;
+		if (runtime > remaining)
+			runtime = remaining;
+		remaining -= runtime;
+
+		cfs_rq->runtime_remaining += runtime;
+		cfs_rq->runtime_expires = expires;
+
+		/* we check whether we're throttled above */
+		if (cfs_rq->runtime_remaining > 0)
+			unthrottle_cfs_rq(cfs_rq);
+
+next:
+		raw_spin_unlock(&rq->lock);
+
+		if (!remaining)
+			break;
+	}
+	rcu_read_unlock();
+
+	return remaining;
+}
+
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 {
 	u64 quota, runtime = 0, runtime_expires;
-	int idle = 0;
+	int idle = 0, throttled = 0;
 
 	runtime_expires = sched_clock_cpu(smp_processor_id());
@@ -1469,6 +1547,7 @@ static int do_sched_cfs_period_timer(str
 	if (quota != RUNTIME_INF) {
 		runtime = quota;
 		runtime_expires += ktime_to_ns(cfs_b->period);
+		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime = runtime;
 		cfs_b->runtime_expires = runtime_expires;
@@ -1477,6 +1556,30 @@ static int do_sched_cfs_period_timer(str
 	}
 	raw_spin_unlock(&cfs_b->lock);
 
+	if (!throttled || quota == RUNTIME_INF)
+		goto out;
+	idle = 0;
+
+retry:
+	runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires);
+
+	raw_spin_lock(&cfs_b->lock);
+	/* new bandwidth may have been set */
+	if (unlikely(runtime_expires != cfs_b->runtime_expires))
+		goto out_unlock;
+	/*
+	 * make sure no-one was throttled while we were handing out the new
+	 * runtime.
+	 */
+	if (runtime > 0 && !list_empty(&cfs_b->throttled_cfs_rq)) {
+		raw_spin_unlock(&cfs_b->lock);
+		goto retry;
+	}
+	cfs_b->runtime = runtime;
+	cfs_b->idle = idle;
+out_unlock:
+	raw_spin_unlock(&cfs_b->lock);
+out:
 	return idle;
 }
 #else
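
For reference, a minimal userspace sketch of how the resulting behaviour can
be observed. This is illustrative only: it assumes cgroup v1 with the cpu
controller mounted at /sys/fs/cgroup/cpu and an already-created group "demo";
both the mount point and the group name are assumptions, not part of this
patch.

#include <stdio.h>
#include <stdlib.h>

/* write a single value to a cgroup control file */
static void write_val(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);	/* assumed mount point may differ */
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	/* 100ms period with a 25ms quota: at most 25% of one cpu */
	write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", "100000");
	write_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", "25000");
	return 0;
}

With a spinning task attached to "demo" (echo $PID > tasks), the task should
run until the 25ms quota is exhausted, be throttled for the remainder of the
period, and be unthrottled again when the period timer refreshes the global
pool via distribute_cfs_runtime()/unthrottle_cfs_rq() above.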