By tracking a per-cpu load-avg for each cfs_rq and folding it into a global task_group load on each tick we can rework tg_shares_up to be strictly per-cpu. This should improve cpu-cgroup performance for smp systems significantly. Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 2 kernel/sched.c | 173 ++++++++++++------------------------------------ kernel/sched_debug.c | 11 ++- kernel/sched_fair.c | 153 ++++++++++++++++++++++++++---------------- kernel/sched_features.h | 2 kernel/sysctl.c | 17 ---- 6 files changed, 147 insertions(+), 211 deletions(-) Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -1858,8 +1858,6 @@ static inline void wake_up_idle_cpu(int extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_shares_ratelimit; -extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_child_runs_first; enum sched_tunable_scaling { Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -253,6 +253,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + + atomic_t load_weight; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -359,15 +361,11 @@ struct cfs_rq { */ unsigned long h_load; - /* - * this cpu's part of tg->shares - */ - unsigned long shares; + u64 load_avg; + u64 load_period; + u64 load_stamp; - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; + unsigned long load_contribution; #endif #endif }; @@ -793,20 +791,6 @@ late_initcall(sched_init_debug); const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - -/* * period over which we average the RT time consumption, measured * in ms. * @@ -1351,6 +1335,12 @@ static inline void update_load_sub(struc lw->inv_weight = 0; } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1539,97 +1529,44 @@ static unsigned long cpu_avg_load_per_ta #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long __percpu *update_shares_data; - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, - unsigned long sd_rq_weight, - unsigned long *usd_rq_weight) -{ - unsigned long shares, rq_weight; - int boost = 0; - - rq_weight = usd_rq_weight[cpu]; - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum_j shares_j * rq_weight_i - * shares_i = ----------------------------- - * \Sum_j rq_weight_j - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - __set_se_shares(tg->se[cpu], shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} +static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. + * update tg->load_weight by folding this cpu's load_avg */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; - unsigned long *usd_rq_weight; - struct sched_domain *sd = data; + unsigned long load_avg; + struct cfs_rq *cfs_rq; unsigned long flags; - int i; + int cpu = (long)data; + struct rq *rq; - if (!tg->se[0]) + if (!tg->se[cpu]) return 0; - local_irq_save(flags); - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); - - for_each_cpu(i, sched_domain_span(sd)) { - weight = tg->cfs_rq[i]->load.weight; - usd_rq_weight[i] = weight; - - rq_weight += weight; - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - if (!weight) - weight = NICE_0_LOAD; + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; - sum_weight += weight; - shares += tg->cfs_rq[i]->shares; - } + raw_spin_lock_irqsave(&rq->lock, flags); - if (!rq_weight) - rq_weight = sum_weight; + update_rq_clock(rq); + update_cfs_load(cfs_rq); - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. + */ + update_cfs_shares(cfs_rq); - local_irq_restore(flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); return 0; } @@ -1648,7 +1585,7 @@ static int tg_load_down(struct task_grou load = cpu_rq(cpu)->load.weight; } else { load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; + load *= tg->se[cpu]->load.weight; load /= tg->parent->cfs_rq[cpu]->load.weight + 1; } @@ -1657,21 +1594,16 @@ static int tg_load_down(struct task_grou return 0; } -static void update_shares(struct sched_domain *sd) +static void update_shares(long cpu) { - s64 elapsed; - u64 now; - if (root_task_group_empty()) return; - now = local_clock(); - elapsed = now - sd->last_update; + /* + * XXX: replace with an on-demand list + */ - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } + walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu); } static void update_h_load(long cpu) @@ -1681,7 +1613,7 @@ static void update_h_load(long cpu) #else -static inline void update_shares(struct sched_domain *sd) +static inline void update_shares(int cpu) { } @@ -1806,15 +1738,6 @@ static void double_rq_unlock(struct rq * #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -5400,7 +5323,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_min_granularity); SET_SYSCTL(sched_latency); SET_SYSCTL(sched_wakeup_granularity); - SET_SYSCTL(sched_shares_ratelimit); #undef SET_SYSCTL } @@ -7652,8 +7574,7 @@ static void init_tg_cfs_entry(struct tas se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; + update_load_set(&se->load, tg->shares); se->parent = parent; } #endif @@ -7746,10 +7667,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), - __alignof__(unsigned long)); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8317,8 +8234,7 @@ static void __set_se_shares(struct sched if (on_rq) dequeue_entity(cfs_rq, se, 0); - se->load.weight = shares; - se->load.inv_weight = 0; + update_load_set(&se->load, shares); if (on_rq) enqueue_entity(cfs_rq, se, 0); @@ -8375,7 +8291,6 @@ int sched_group_set_shares(struct task_g /* * force a rebalance */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); } Index: linux-2.6/kernel/sched_debug.c =================================================================== --- linux-2.6.orig/kernel/sched_debug.c +++ linux-2.6/kernel/sched_debug.c @@ -204,13 +204,18 @@ void print_cfs_rq(struct seq_file *m, in SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", + SPLIT_NS(cfs_rq->load_avg)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", + SPLIT_NS(cfs_rq->load_period)); + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", + cfs_rq->load_contribution); + SEQ_printf(m, " .%-30s: %d\n", "load_tg", + atomic_read(&tg->load_weight)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); -#endif print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } Index: linux-2.6/kernel/sched_fair.c =================================================================== --- linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cf list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +645,84 @@ account_entity_dequeue(struct cfs_rq *cf list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void update_cfs_load(struct cfs_rq *cfs_rq) +{ + u64 period = sched_avg_period(); + u64 now = rq_of(cfs_rq)->clock; + u64 delta = now - cfs_rq->load_stamp; + + cfs_rq->load_stamp = now; + cfs_rq->load_period += delta; + cfs_rq->load_avg += delta * cfs_rq->load.weight; + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) + account_entity_dequeue(cfs_rq, se); + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ + struct task_group *tg; + struct sched_entity *se; + unsigned long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline void update_cfs_load(struct cfs_rq *cfs_rq) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,7 +844,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_cfs_load(cfs_rq); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(group_cfs_rq(se)); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -782,6 +857,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +901,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(group_cfs_rq(se)); /* * Normalize the entity after updating the min_vruntime because the @@ -1055,6 +1134,9 @@ enqueue_task_fair(struct rq *rq, struct flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se)); + hrtick_update(rq); } @@ -1071,12 +1153,16 @@ static void dequeue_task_fair(struct rq for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se)); + hrtick_update(rq); } @@ -1143,51 +1229,20 @@ static void task_waking_fair(struct rq * * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - - S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + S = tg->shares; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1509,23 +1564,6 @@ select_task_rq_fair(struct rq *rq, struc sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -2980,7 +3018,6 @@ static int load_balance(int this_cpu, st schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3115,8 +3152,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3510,6 +3545,8 @@ static void rebalance_domains(int cpu, e int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; Index: linux-2.6/kernel/sched_features.h =================================================================== --- linux-2.6.orig/kernel/sched_features.h +++ linux-2.6/kernel/sched_features.h @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_SHARES_UPDATE, 1) -SCHED_FEAT(ASYM_EFF_LOAD, 1) /* * Spin-wait on mutex acquisition when the mutex owner is running on Index: linux-2.6/kernel/sysctl.c =================================================================== --- linux-2.6.orig/kernel/sysctl.c +++ linux-2.6/kernel/sysctl.c @@ -307,15 +307,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_wakeup_granularity_ns, }, { - .procname = "sched_shares_ratelimit", - .data = &sysctl_sched_shares_ratelimit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_shares_ratelimit, - .extra2 = &max_sched_shares_ratelimit, - }, - { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), @@ -325,14 +316,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/