At the start of a new period we must refresh the global bandwidth pool as well as unthrottle any cfs_rq entities who previously ran out of bandwidth (as quota permits). Unthrottled entities have the cfs_rq->throttled flag cleared and are re-enqueued into the entity hierarchy. Signed-off-by: Paul Turner Signed-off-by: Nikhil Rao Signed-off-by: Bharata B Rao Reviewed-by: Hidetoshi Seto --- kernel/sched.c | 3 + kernel/sched_fair.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 121 insertions(+), 7 deletions(-) Index: tip/kernel/sched.c =================================================================== --- tip.orig/kernel/sched.c +++ tip/kernel/sched.c @@ -9002,6 +9002,9 @@ static int tg_set_cfs_bandwidth(struct t raw_spin_lock_irq(&rq->lock); cfs_rq->runtime_enabled = quota != RUNTIME_INF; cfs_rq->runtime_remaining = 0; + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } out_unlock: Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1448,26 +1448,137 @@ static void throttle_cfs_rq(struct cfs_r raw_spin_unlock(&cfs_b->lock); } +static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + cfs_rq->throttled = 0; + raw_spin_lock(&cfs_b->lock); + list_del_rcu(&cfs_rq->throttled_list); + raw_spin_unlock(&cfs_b->lock); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + if (!se) + rq->nr_running += task_delta; + + /* 
determine whether we need to wake up potentially idle cpu */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_task(rq->curr); +} + +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, + u64 remaining, u64 expires) +{ + struct cfs_rq *cfs_rq; + u64 runtime = remaining; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock(&rq->lock); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > remaining) + runtime = remaining; + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; + cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) + unthrottle_cfs_rq(cfs_rq); + +next: + raw_spin_unlock(&rq->lock); + + if (!remaining) + break; + } + rcu_read_unlock(); + + return remaining; +} + static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - int idle = 1; + int idle = 1, throttled = 0; + u64 runtime, runtime_expires; + raw_spin_lock(&cfs_b->lock); if (cfs_b->quota != RUNTIME_INF) { - idle = cfs_b->idle; - /* If we're going idle then defer handle the refill */ + /* idle depends on !throttled in the case of a large deficit */ + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + idle = cfs_b->idle && !throttled; + + /* If we're going idle then defer the refill */ if (!idle) __refill_cfs_bandwidth_runtime(cfs_b); + if (throttled) { + runtime = cfs_b->runtime; + runtime_expires = cfs_b->runtime_expires; + + /* we must first distribute to throttled entities */ + cfs_b->runtime = 0; + } /* - * mark this bandwidth pool as idle so that we may deactivate - * the timer at the next expiration if there is no usage. + * conditionally mark this bandwidth pool as idle so that we may + * deactivate the timer at the next expiration if there is no + * usage. 
*/ - cfs_b->idle = 1; + cfs_b->idle = !throttled; } - if (idle) + if (idle) { cfs_b->timer_active = 0; + goto out_unlock; + } + raw_spin_unlock(&cfs_b->lock); + +retry: + runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); + + raw_spin_lock(&cfs_b->lock); + /* new bandwidth specification may exist */ + if (unlikely(runtime_expires != cfs_b->runtime_expires)) + goto out_unlock; + /* ensure no-one was throttled while we were unthrottling */ + if (unlikely(!list_empty(&cfs_b->throttled_cfs_rq)) && runtime > 0) { + raw_spin_unlock(&cfs_b->lock); + goto retry; + } + + /* return remaining runtime */ + cfs_b->runtime = runtime; +out_unlock: raw_spin_unlock(&cfs_b->lock); return idle; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/