Introduce a generational counter which is incremented each quota period. This
allows us to determine on a per-cpu basis whether the currently operating
quota is "current" or not, without requiring us to visit every cpu and
explicitly expire quota on every new period.

Signed-off-by: Paul Turner
Signed-off-by: Bharata B Rao
---
 kernel/sched.c      |    6 ++++++
 kernel/sched_fair.c |   42 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 43 insertions(+), 5 deletions(-)

Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -256,6 +256,7 @@ struct cfs_bandwidth {
 	s64 hierarchal_quota; /* used for validating consistency */
 	struct hrtimer period_timer;
+	int quota_generation;
 	struct list_head throttled_cfs_rq;

 	/* throttle statistics */
 	u64 nr_periods;
@@ -396,6 +397,7 @@ struct cfs_rq {
 	s64 quota_remaining;
 	u64 throttled_timestamp;
+	int quota_generation;

 	struct list_head throttled_list;
 #endif
 #endif
@@ -436,8 +438,10 @@ void init_cfs_bandwidth(struct cfs_bandw
 	raw_spin_lock_init(&cfs_b->lock);
 	cfs_b->quota = cfs_b->runtime = quota;
 	cfs_b->period = ns_to_ktime(period);
+	cfs_b->quota_generation = 0;
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);

+
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
@@ -9333,6 +9337,8 @@ static int tg_set_cfs_bandwidth(struct t
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->runtime = cfs_b->quota = quota;
+
+	cfs_bump_quota_generation(cfs_b);
 	raw_spin_unlock_irq(&cfs_b->lock);

 	for_each_possible_cpu(i) {
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1331,11 +1331,25 @@ static void check_cfs_rq_quota(struct cf
 		resched_task(rq_of(cfs_rq)->curr);
 }

+static void cfs_bump_quota_generation(struct cfs_bandwidth *cfs_b)
+{
+	cfs_b->quota_generation++;
+	smp_mb();
+}
+
+static inline int cfs_rq_quota_current(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+	return cfs_rq->quota_generation == cfs_b->quota_generation;
+}
+
 static void request_cfs_rq_quota(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 	u64 amount = 0, min_amount;
+	int generation;

 	min_amount = sched_cfs_bandwidth_slice() + (-cfs_rq->quota_remaining);

@@ -1347,10 +1361,18 @@ static void request_cfs_rq_quota(struct
 		} else {
 			amount = min_amount;
 		}
+		generation = cfs_b->quota_generation;
 		raw_spin_unlock(&cfs_b->lock);
 	}

+	/* a deficit should be carried forwards, surplus should be dropped */
+
+	if (generation != cfs_rq->quota_generation &&
+	    cfs_rq->quota_remaining > 0)
+		cfs_rq->quota_remaining = 0;
+
 	cfs_rq->quota_remaining += amount;
+	cfs_rq->quota_generation = generation;
 }

 static void account_cfs_rq_quota(struct cfs_rq *cfs_rq,
@@ -1361,8 +1383,13 @@ static void account_cfs_rq_quota(struct
 	cfs_rq->quota_remaining -= delta_exec;

-	if (cfs_rq->quota_remaining > 0)
-		return;
+	/* we only want to charge deficits against the next generation */
+	if (likely(cfs_rq->quota_remaining > 0)) {
+		if (unlikely(!cfs_rq_quota_current(cfs_rq)))
+			cfs_rq->quota_remaining = 0;
+		else
+			return;
+	}

 	request_cfs_rq_quota(cfs_rq);
 }
@@ -1492,7 +1519,8 @@ static void unthrottle_cfs_rq(struct cfs
 		resched_task(rq->curr);
 }

-static u64 distribute_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 runtime)
+static u64 distribute_cfs_bandwidth(struct cfs_bandwidth *cfs_b, u64 runtime,
+		int generation)
 {
 	struct cfs_rq *cfs_rq;
 	u64 quota, remaining = runtime;
@@ -1512,6 +1540,7 @@ static u64 distribute_cfs_bandwidth(stru
 		remaining -= quota;

 		cfs_rq->quota_remaining += quota;
+		cfs_rq->quota_generation = generation;

 		if (cfs_rq_throttled(cfs_rq) && cfs_rq->quota_remaining > 0)
 			unthrottle_cfs_rq(cfs_rq);
@@ -1529,12 +1558,15 @@ next:
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 {
 	u64 runtime, runtime_assigned;
-	int idle, throttled;
+	int idle, throttled, generation;

 	raw_spin_lock(&cfs_b->lock);
 	runtime = cfs_b->quota;
 	idle = cfs_b->runtime == cfs_b->runtime_assigned;
 	throttled = cfs_b->runtime == 0;
+
+	cfs_bump_quota_generation(cfs_b);
+	generation = cfs_b->quota_generation;
 	raw_spin_unlock(&cfs_b->lock);

 	if (runtime == RUNTIME_INF)
@@ -1543,7 +1575,7 @@ static int do_sched_cfs_period_timer(str
 	runtime *= overrun;
 	runtime_assigned = runtime;

-	runtime = distribute_cfs_bandwidth(cfs_b, runtime);
+	runtime = distribute_cfs_bandwidth(cfs_b, runtime, generation);

 	raw_spin_lock(&cfs_b->lock);
 	cfs_b->runtime = runtime;
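
A note for readers less familiar with this lazy-expiry pattern: the standalone
C sketch below (illustration only; the names pool, local_quota, new_period and
sync_local are invented for the example and appear nowhere in the patch) shows
the core idea in userspace form. The period edge only bumps a shared generation
counter; each local pool notices the new generation the next time it touches
its own quota, dropping a stale surplus while carrying a deficit forward, so
nothing has to walk every cpu at the period boundary.

/*
 * Standalone sketch of generation-based lazy quota expiry.
 * Illustration only -- not kernel code.
 */
#include <stdio.h>

struct pool {
	int generation;		/* bumped once per period, like cfs_b->quota_generation */
};

struct local_quota {
	int generation;		/* generation the cached quota was drawn against */
	long long remaining;	/* may go negative (a deficit) */
};

/* Period edge: only the shared counter is touched; local state is left alone. */
static void new_period(struct pool *p)
{
	p->generation++;
}

/* Consumers notice staleness lazily, the next time they look at their quota. */
static void sync_local(struct local_quota *lq, const struct pool *p)
{
	if (lq->generation != p->generation) {
		/* surplus from an old period is dropped, a deficit is carried */
		if (lq->remaining > 0)
			lq->remaining = 0;
		lq->generation = p->generation;
	}
}

int main(void)
{
	struct pool p = { .generation = 0 };
	struct local_quota cpu0 = { .generation = 0, .remaining = 5000 };
	struct local_quota cpu1 = { .generation = 0, .remaining = -300 };

	new_period(&p);		/* no per-cpu visit happens here */

	sync_local(&cpu0, &p);	/* stale surplus: 5000 -> 0 */
	sync_local(&cpu1, &p);	/* deficit carried forward: stays -300 */

	printf("cpu0: %lld, cpu1: %lld\n", cpu0.remaining, cpu1.remaining);
	return 0;
}

Built with any C99 compiler, this prints "cpu0: 0, cpu1: -300", matching the
carry-deficit/drop-surplus behaviour described above.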