Message-ID: <AANLkTin4mMQcs5=MBpTXj29KFN90VZc7snV5JzGScAn+@mail.gmail.com>
Date: Thu, 4 Nov 2010 17:00:30 -0400
From: Paul Turner <pjt@...gle.com>
To: linux-kernel@...r.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Ingo Molnar <mingo@...e.hu>,
Srivatsa Vaddagiri <vatsa@...ibm.com>,
Chris Friesen <cfriesen@...tel.com>,
Vaidyanathan Srinivasan <svaidy@...ux.vnet.ibm.com>,
Pierre Bourdon <pbourdon@...ellency.fr>,
Paul Turner <pjt@...gle.com>,
Bharata B Rao <bharata@...ux.vnet.ibm.com>
Subject: Re: [RFC tg_shares_up improvements - v1 01/12] sched: rewrite tg_shares_up
[resend due to HTML formatting creeping into the first email]
On Thu, Nov 4, 2010 at 4:58 PM, Paul Turner <pjt@...gle.com> wrote:
>
>
> On Sat, Oct 16, 2010 at 12:43 AM, <pjt@...gle.com> wrote:
>>
>> From: Peter Zijlstra <a.p.zijlstra@...llo.nl>
>>
>> By tracking a per-cpu load-avg for each cfs_rq and folding it into a
>> global task_group load on each tick we can rework tg_shares_up to be
>> strictly per-cpu.
>>
>> This should improve cpu-cgroup performance for smp systems
>> significantly.
>>
>> [ Paul: changed to use queueing cfs_rq ]
>>
>> Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
>> Signed-off-by: Paul Turner <pjt@...gle.com>
>>
>> ---
>> include/linux/sched.h | 2
>> kernel/sched.c | 173 ++++++++++++------------------------------------
>> kernel/sched_debug.c | 15 +++-
>> kernel/sched_fair.c | 166 +++++++++++++++++++++++++++++-----------------
>> kernel/sched_features.h | 2
>> kernel/sysctl.c | 17 ----
>> 6 files changed, 163 insertions(+), 212 deletions(-)
>>
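One note on the mechanics before the diff, since the changelog is terse: each
cfs_rq now keeps a local running sum (load_avg over load_period), and only the
delta versus what it last reported is folded into the global, atomic
tg->load_weight.  Roughly (a simplified sketch of what tg_shares_up() below
does for one cpu, not the literal code):

        /* average the local load since the last fold, then publish the delta */
        long avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period + 1);
        long delta = avg - cfs_rq->load_contribution;

        atomic_add(delta, &tg->load_weight);
        cfs_rq->load_contribution += delta;

The per-cpu shares are then recomputed from tg->load_weight and the local queue
weight alone, which is what lets us drop the cross-cpu walk (and the remote rq
locks) of the old tg_shares_up().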
>> Index: include/linux/sched.h
>> ===================================================================
>> --- include/linux/sched.h.orig
>> +++ include/linux/sched.h
>> @@ -1868,8 +1868,6 @@ static inline void wake_up_idle_cpu(int
>> extern unsigned int sysctl_sched_latency;
>> extern unsigned int sysctl_sched_min_granularity;
>> extern unsigned int sysctl_sched_wakeup_granularity;
>> -extern unsigned int sysctl_sched_shares_ratelimit;
>> -extern unsigned int sysctl_sched_shares_thresh;
>> extern unsigned int sysctl_sched_child_runs_first;
>>
>> enum sched_tunable_scaling {
>> Index: kernel/sched.c
>> ===================================================================
>> --- kernel/sched.c.orig
>> +++ kernel/sched.c
>> @@ -253,6 +253,8 @@ struct task_group {
>> /* runqueue "owned" by this group on each cpu */
>> struct cfs_rq **cfs_rq;
>> unsigned long shares;
>> +
>> + atomic_t load_weight;
>> #endif
>>
>> #ifdef CONFIG_RT_GROUP_SCHED
>> @@ -359,15 +361,11 @@ struct cfs_rq {
>> */
>> unsigned long h_load;
>>
>> - /*
>> - * this cpu's part of tg->shares
>> - */
>> - unsigned long shares;
>> + u64 load_avg;
>> + u64 load_period;
>> + u64 load_stamp;
>>
>> - /*
>> - * load.weight at the time we set shares
>> - */
>> - unsigned long rq_weight;
>> + unsigned long load_contribution;
>> #endif
>> #endif
>> };
>> @@ -790,20 +788,6 @@ late_initcall(sched_init_debug);
>> const_debug unsigned int sysctl_sched_nr_migrate = 32;
>>
>> /*
>> - * ratelimit for updating the group shares.
>> - * default: 0.25ms
>> - */
>> -unsigned int sysctl_sched_shares_ratelimit = 250000;
>> -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
>> -
>> -/*
>> - * Inject some fuzzyness into changing the per-cpu group shares
>> - * this avoids remote rq-locks at the expense of fairness.
>> - * default: 4
>> - */
>> -unsigned int sysctl_sched_shares_thresh = 4;
>> -
>> -/*
>> * period over which we average the RT time consumption, measured
>> * in ms.
>> *
>> @@ -1352,6 +1336,12 @@ static inline void update_load_sub(struc
>> lw->inv_weight = 0;
>> }
>>
>> +static inline void update_load_set(struct load_weight *lw, unsigned long w)
>> +{
>> + lw->weight = w;
>> + lw->inv_weight = 0;
>> +}
>> +
>> /*
>> * To aid in avoiding the subversion of "niceness" due to uneven distribution
>> * of tasks with abnormal "nice" values across CPUs the contribution that
>> @@ -1540,97 +1530,44 @@ static unsigned long cpu_avg_load_per_ta
>>
>> #ifdef CONFIG_FAIR_GROUP_SCHED
>>
>> -static __read_mostly unsigned long __percpu *update_shares_data;
>> -
>> -static void __set_se_shares(struct sched_entity *se, unsigned long shares);
>> -
>> -/*
>> - * Calculate and set the cpu's group shares.
>> - */
>> -static void update_group_shares_cpu(struct task_group *tg, int cpu,
>> - unsigned long sd_shares,
>> - unsigned long sd_rq_weight,
>> - unsigned long *usd_rq_weight)
>> -{
>> - unsigned long shares, rq_weight;
>> - int boost = 0;
>> -
>> - rq_weight = usd_rq_weight[cpu];
>> - if (!rq_weight) {
>> - boost = 1;
>> - rq_weight = NICE_0_LOAD;
>> - }
>> -
>> - /*
>> - * \Sum_j shares_j * rq_weight_i
>> - * shares_i = -----------------------------
>> - * \Sum_j rq_weight_j
>> - */
>> - shares = (sd_shares * rq_weight) / sd_rq_weight;
>> - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
>> -
>> - if (abs(shares - tg->se[cpu]->load.weight) >
>> - sysctl_sched_shares_thresh) {
>> - struct rq *rq = cpu_rq(cpu);
>> - unsigned long flags;
>> -
>> - raw_spin_lock_irqsave(&rq->lock, flags);
>> - tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
>> - tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
>> - __set_se_shares(tg->se[cpu], shares);
>> - raw_spin_unlock_irqrestore(&rq->lock, flags);
>> - }
>> -}
>> +static void update_cfs_load(struct cfs_rq *cfs_rq);
>> +static void update_cfs_shares(struct cfs_rq *cfs_rq);
>>
>> /*
>> - * Re-compute the task group their per cpu shares over the given domain.
>> - * This needs to be done in a bottom-up fashion because the rq weight of a
>> - * parent group depends on the shares of its child groups.
>> + * update tg->load_weight by folding this cpu's load_avg
>> */
>> static int tg_shares_up(struct task_group *tg, void *data)
>> {
>> - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
>> - unsigned long *usd_rq_weight;
>> - struct sched_domain *sd = data;
>> + long load_avg;
>> + struct cfs_rq *cfs_rq;
>> unsigned long flags;
>> - int i;
>> + int cpu = (long)data;
>> + struct rq *rq;
>>
>> - if (!tg->se[0])
>> + if (!tg->se[cpu])
>> return 0;
>>
>> - local_irq_save(flags);
>> - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
>> -
>> - for_each_cpu(i, sched_domain_span(sd)) {
>> - weight = tg->cfs_rq[i]->load.weight;
>> - usd_rq_weight[i] = weight;
>> -
>> - rq_weight += weight;
>> - /*
>> - * If there are currently no tasks on the cpu pretend there
>> - * is one of average load so that when a new task gets to
>> - * run here it will not get delayed by group starvation.
>> - */
>> - if (!weight)
>> - weight = NICE_0_LOAD;
>> + rq = cpu_rq(cpu);
>> + cfs_rq = tg->cfs_rq[cpu];
>>
>> - sum_weight += weight;
>> - shares += tg->cfs_rq[i]->shares;
>> - }
>> + raw_spin_lock_irqsave(&rq->lock, flags);
>>
>> - if (!rq_weight)
>> - rq_weight = sum_weight;
>> + update_rq_clock(rq);
>> + update_cfs_load(cfs_rq);
>>
>> - if ((!shares && rq_weight) || shares > tg->shares)
>> - shares = tg->shares;
>> + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
>> + load_avg -= cfs_rq->load_contribution;
>>
>> - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
>> - shares = tg->shares;
>> + atomic_add(load_avg, &tg->load_weight);
>> + cfs_rq->load_contribution += load_avg;
>>
>> - for_each_cpu(i, sched_domain_span(sd))
>> - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
>> + /*
>> + * We need to update shares after updating tg->load_weight in
>> + * order to adjust the weight of groups with long running tasks.
>> + */
>> + update_cfs_shares(cfs_rq);
>>
>> - local_irq_restore(flags);
>> + raw_spin_unlock_irqrestore(&rq->lock, flags);
>>
>> return 0;
>> }
>> @@ -1649,7 +1586,7 @@ static int tg_load_down(struct task_grou
>> load = cpu_rq(cpu)->load.weight;
>> } else {
>> load = tg->parent->cfs_rq[cpu]->h_load;
>> - load *= tg->cfs_rq[cpu]->shares;
>> + load *= tg->se[cpu]->load.weight;
>> load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
>> }
>>
>> @@ -1658,21 +1595,16 @@ static int tg_load_down(struct task_grou
>> return 0;
>> }
>>
>> -static void update_shares(struct sched_domain *sd)
>> +static void update_shares(long cpu)
>> {
>> - s64 elapsed;
>> - u64 now;
>> -
>> if (root_task_group_empty())
>> return;
>>
>> - now = local_clock();
>> - elapsed = now - sd->last_update;
>> + /*
>> + * XXX: replace with an on-demand list
>> + */
>>
>> - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
>> - sd->last_update = now;
>> - walk_tg_tree(tg_nop, tg_shares_up, sd);
>> - }
>> + walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
>> }
>>
>> static void update_h_load(long cpu)
>> @@ -1682,7 +1614,7 @@ static void update_h_load(long cpu)
>>
>> #else
>>
>> -static inline void update_shares(struct sched_domain *sd)
>> +static inline void update_shares(int cpu)
>> {
>> }
>>
>> @@ -1807,15 +1739,6 @@ static void double_rq_unlock(struct rq *
>>
>> #endif
>>
>> -#ifdef CONFIG_FAIR_GROUP_SCHED
>> -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
>> -{
>> -#ifdef CONFIG_SMP
>> - cfs_rq->shares = shares;
>> -#endif
>> -}
>> -#endif
>> -
>> static void calc_load_account_idle(struct rq *this_rq);
>> static void update_sysctl(void);
>> static int get_update_sysctl_factor(void);
>> @@ -5404,7 +5327,6 @@ static void update_sysctl(void)
>> SET_SYSCTL(sched_min_granularity);
>> SET_SYSCTL(sched_latency);
>> SET_SYSCTL(sched_wakeup_granularity);
>> - SET_SYSCTL(sched_shares_ratelimit);
>> #undef SET_SYSCTL
>> }
>>
>> @@ -7721,8 +7643,7 @@ static void init_tg_cfs_entry(struct tas
>> se->cfs_rq = parent->my_q;
>>
>> se->my_q = cfs_rq;
>> - se->load.weight = tg->shares;
>> - se->load.inv_weight = 0;
>> + update_load_set(&se->load, tg->shares);
>> se->parent = parent;
>> }
>> #endif
>> @@ -7815,10 +7736,6 @@ void __init sched_init(void)
>>
>> #endif /* CONFIG_CGROUP_SCHED */
>>
>> -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
>> - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
>> - __alignof__(unsigned long));
>> -#endif
>> for_each_possible_cpu(i) {
>> struct rq *rq;
>>
>> @@ -8386,8 +8303,7 @@ static void __set_se_shares(struct sched
>> if (on_rq)
>> dequeue_entity(cfs_rq, se, 0);
>>
>> - se->load.weight = shares;
>> - se->load.inv_weight = 0;
>> + update_load_set(&se->load, shares);
>>
>> if (on_rq)
>> enqueue_entity(cfs_rq, se, 0);
>> @@ -8444,7 +8360,6 @@ int sched_group_set_shares(struct task_g
>> /*
>> * force a rebalance
>> */
>> - cfs_rq_set_shares(tg->cfs_rq[i], 0);
>> set_se_shares(tg->se[i], shares);
>> }
>>
>> Index: kernel/sched_debug.c
>> ===================================================================
>> --- kernel/sched_debug.c.orig
>> +++ kernel/sched_debug.c
>> @@ -202,15 +202,22 @@ void print_cfs_rq(struct seq_file *m, in
>> spread0 = min_vruntime - rq0_min_vruntime;
>> SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
>> SPLIT_NS(spread0));
>> - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
>> - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
>> -
>> SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
>> cfs_rq->nr_spread_over);
>> + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
>> + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
>> #ifdef CONFIG_FAIR_GROUP_SCHED
>> #ifdef CONFIG_SMP
>> - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
>> + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
>> + SPLIT_NS(cfs_rq->load_avg));
>> + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
>> + SPLIT_NS(cfs_rq->load_period));
>> + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
>> + cfs_rq->load_contribution);
>> + SEQ_printf(m, " .%-30s: %d\n", "load_tg",
>> + atomic_read(&tg->load_weight));
>> #endif
>> +
>> print_cfs_group_stats(m, cpu, cfs_rq->tg);
>> #endif
>> }
>> Index: kernel/sched_fair.c
>> ===================================================================
>> --- kernel/sched_fair.c.orig
>> +++ kernel/sched_fair.c
>> @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl
>> WRT_SYSCTL(sched_min_granularity);
>> WRT_SYSCTL(sched_latency);
>> WRT_SYSCTL(sched_wakeup_granularity);
>> - WRT_SYSCTL(sched_shares_ratelimit);
>> #undef WRT_SYSCTL
>>
>> return 0;
>> @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cf
>> list_add(&se->group_node, &cfs_rq->tasks);
>> }
>> cfs_rq->nr_running++;
>> - se->on_rq = 1;
>> }
>>
>> static void
>> @@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cf
>> list_del_init(&se->group_node);
>> }
>> cfs_rq->nr_running--;
>> - se->on_rq = 0;
>> }
>>
>> +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
>> +static void update_cfs_load(struct cfs_rq *cfs_rq)
>> +{
>> + u64 period = sched_avg_period();
>> + u64 now, delta;
>> +
>> + if (!cfs_rq)
>> + return;
>> +
>> + now = rq_of(cfs_rq)->clock;
>> + delta = now - cfs_rq->load_stamp;
>> +
>> + cfs_rq->load_stamp = now;
>> + cfs_rq->load_period += delta;
>> + cfs_rq->load_avg += delta * cfs_rq->load.weight;
>> +
>> + while (cfs_rq->load_period > period) {
>> + /*
>> + * Inline assembly required to prevent the compiler
>> + * optimising this loop into a divmod call.
>> + * See __iter_div_u64_rem() for another example of this.
>> + */
>> + asm("" : "+rm" (cfs_rq->load_period));
>> + cfs_rq->load_period /= 2;
>> + cfs_rq->load_avg /= 2;
>> + }
>> +}
>> +
>> +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
>> + unsigned long weight)
>> +{
>> + if (se->on_rq)
>> + account_entity_dequeue(cfs_rq, se);
>> +
>> + update_load_set(&se->load, weight);
>> +
>> + if (se->on_rq)
>> + account_entity_enqueue(cfs_rq, se);
>> +}
>> +
>> +static void update_cfs_shares(struct cfs_rq *cfs_rq)
>> +{
>> + struct task_group *tg;
>> + struct sched_entity *se;
>> + long load_weight, load, shares;
>> +
>> + if (!cfs_rq)
>> + return;
>> +
>> + tg = cfs_rq->tg;
>> + se = tg->se[cpu_of(rq_of(cfs_rq))];
>> + if (!se)
>> + return;
>> +
>> + load = cfs_rq->load.weight;
>> +
>> + load_weight = atomic_read(&tg->load_weight);
>> + load_weight -= cfs_rq->load_contribution;
>> + load_weight += load;
>> +
>> + shares = (tg->shares * load);
>> + if (load_weight)
>> + shares /= load_weight;
>> +
>> + if (shares < MIN_SHARES)
>> + shares = MIN_SHARES;
>> + if (shares > tg->shares)
>> + shares = tg->shares;
>> +
>> + reweight_entity(cfs_rq_of(se), se, shares);
>> +}
>> +#else /* CONFIG_FAIR_GROUP_SCHED */
>> +static inline void update_cfs_load(struct cfs_rq *cfs_rq)
>> +{
>> +}
>> +
>> +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
>> +{
>> +}
>> +#endif /* CONFIG_FAIR_GROUP_SCHED */
>> +
>> static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
>> {
>> #ifdef CONFIG_SCHEDSTATS
>> @@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>> * Update run-time statistics of the 'current'.
>> */
>> update_curr(cfs_rq);
>> + update_cfs_load(cfs_rq);
>> account_entity_enqueue(cfs_rq, se);
>> + update_cfs_shares(cfs_rq_of(se));
>>
>> if (flags & ENQUEUE_WAKEUP) {
>> place_entity(cfs_rq, se, 0);
>> @@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, st
>> check_spread(cfs_rq, se);
>> if (se != cfs_rq->curr)
>> __enqueue_entity(cfs_rq, se);
>> + se->on_rq = 1;
>> }
>>
>> static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
>> @@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>>
>> if (se != cfs_rq->curr)
>> __dequeue_entity(cfs_rq, se);
>> + se->on_rq = 0;
>> + update_cfs_load(cfs_rq);
>> account_entity_dequeue(cfs_rq, se);
>> update_min_vruntime(cfs_rq);
>> + update_cfs_shares(cfs_rq_of(se));
>>
>> /*
>> * Normalize the entity after updating the min_vruntime because the
>> @@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct
>> flags = ENQUEUE_WAKEUP;
>> }
>>
>> + for_each_sched_entity(se) {
>> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
>> +
>> + update_cfs_load(cfs_rq);
>> + update_cfs_shares(cfs_rq);
>> + }
>> +
>> hrtick_update(rq);
>> }
>>
>> @@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq
>> for_each_sched_entity(se) {
>> cfs_rq = cfs_rq_of(se);
>> dequeue_entity(cfs_rq, se, flags);
>> +
>> /* Don't dequeue parent if it has other entities besides us */
>> if (cfs_rq->load.weight)
>> break;
>> flags |= DEQUEUE_SLEEP;
>> }
>>
>> + for_each_sched_entity(se) {
>> + struct cfs_rq *cfs_rq = cfs_rq_of(se);
>> +
>> + update_cfs_load(cfs_rq);
>> + update_cfs_shares(cfs_rq);
>> + }
>> +
>> hrtick_update(rq);
>> }
>>
>> @@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *
>> * Adding load to a group doesn't make a group heavier, but can cause movement
>> * of group shares between cpus. Assuming the shares were perfectly aligned one
>> * can calculate the shift in shares.
>> - *
>> - * The problem is that perfectly aligning the shares is rather expensive, hence
>> - * we try to avoid doing that too often - see update_shares(), which ratelimits
>> - * this change.
>> - *
>> - * We compensate this by not only taking the current delta into account, but
>> - * also considering the delta between when the shares were last adjusted and
>> - * now.
>> - *
>> - * We still saw a performance dip, some tracing learned us that between
>> - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
>> - * significantly. Therefore try to bias the error in direction of failing
>> - * the affine wakeup.
>> - *
>> */
>> -static long effective_load(struct task_group *tg, int cpu,
>> - long wl, long wg)
>> +static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
>> {
>> struct sched_entity *se = tg->se[cpu];
>>
>> if (!tg->parent)
>> return wl;
>>
>> - /*
>> - * By not taking the decrease of shares on the other cpu into
>> - * account our error leans towards reducing the affine wakeups.
>> - */
>> - if (!wl && sched_feat(ASYM_EFF_LOAD))
>> - return wl;
>> -
>> for_each_sched_entity(se) {
>> long S, rw, s, a, b;
>> - long more_w;
>> -
>> - /*
>> - * Instead of using this increment, also add the difference
>> - * between when the shares were last updated and now.
>> - */
>> - more_w = se->my_q->load.weight - se->my_q->rq_weight;
>> - wl += more_w;
>> - wg += more_w;
>>
>> - S = se->my_q->tg->shares;
>> - s = se->my_q->shares;
>> - rw = se->my_q->rq_weight;
>> + S = tg->shares;
>
At a minimum this needs to be se->my_q->tg->shares rather than tg->shares,
since tg stays fixed while we walk up the tree.
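Concretely, something like the below (untested, just to show the intent):

        for_each_sched_entity(se) {
                long S, rw, s, a, b;

                S = se->my_q->tg->shares;       /* this level's group */
                s = se->load.weight;
                rw = se->my_q->load.weight;

                a = S*(rw + wl);
                b = S*rw + s*wg;
                /* ... rest of the loop body unchanged ... */
        }

With plain tg->shares every level of the hierarchy gets scaled by the shares of
the group we started from rather than by its own.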
>>
>> + s = se->load.weight;
>> + rw = se->my_q->load.weight;
>
I suspect we could get better accuracy if we used the new knowledge of global
weight more directly (which we can get from load_contribution without having
to bounce tg->load_weight around).  This path needs some attention in general,
though, so I'll leave such musings to a second patchset focused on improving
group wake_affine behavior.
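For the curious, the sort of thing I mean (hand-waving only, nothing proposed
for this series): use the per-cpu load_contribution we already maintain as the
estimate of the group's weight on a given cpu, e.g. via a hypothetical helper
along the lines of

        /* per-cpu view of the group's weight; avoids bouncing tg->load_weight */
        static long tg_cpu_weight(struct task_group *tg, int cpu)
        {
                return tg->cfs_rq[cpu]->load_contribution;
        }

The obvious caveat is that load_contribution is only refreshed when we fold
the load average, so it can be stale in between; weighing that trade-off is
part of why I'd rather defer this to the wake_affine series.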
>
>
>>
>> a = S*(rw + wl);
>> b = S*rw + s*wg;
>> @@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struc
>> sd = tmp;
>> }
>>
>> -#ifdef CONFIG_FAIR_GROUP_SCHED
>> - if (sched_feat(LB_SHARES_UPDATE)) {
>> - /*
>> - * Pick the largest domain to update shares over
>> - */
>> - tmp = sd;
>> - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
>> - tmp = affine_sd;
>> -
>> - if (tmp) {
>> - raw_spin_unlock(&rq->lock);
>> - update_shares(tmp);
>> - raw_spin_lock(&rq->lock);
>> - }
>> - }
>> -#endif
>> -
>> if (affine_sd) {
>> if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
>> return select_idle_sibling(p, cpu);
>> @@ -2977,7 +3028,6 @@ static int load_balance(int this_cpu, st
>> schedstat_inc(sd, lb_count[idle]);
>>
>> redo:
>> - update_shares(sd);
>> group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
>> cpus, balance);
>>
>> @@ -3119,8 +3169,6 @@ out_one_pinned:
>> else
>> ld_moved = 0;
>> out:
>> - if (ld_moved)
>> - update_shares(sd);
>> return ld_moved;
>> }
>>
>> @@ -3514,6 +3562,8 @@ static void rebalance_domains(int cpu, e
>> int update_next_balance = 0;
>> int need_serialize;
>>
>> + update_shares(cpu);
>> +
>> for_each_domain(cpu, sd) {
>> if (!(sd->flags & SD_LOAD_BALANCE))
>> continue;
>> Index: kernel/sched_features.h
>> ===================================================================
>> --- kernel/sched_features.h.orig
>> +++ kernel/sched_features.h
>> @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
>> SCHED_FEAT(HRTICK, 0)
>> SCHED_FEAT(DOUBLE_TICK, 0)
>> SCHED_FEAT(LB_BIAS, 1)
>> -SCHED_FEAT(LB_SHARES_UPDATE, 1)
>> -SCHED_FEAT(ASYM_EFF_LOAD, 1)
>>
>> /*
>> * Spin-wait on mutex acquisition when the mutex owner is running on
>> Index: kernel/sysctl.c
>> ===================================================================
>> --- kernel/sysctl.c.orig
>> +++ kernel/sysctl.c
>> @@ -307,15 +307,6 @@ static struct ctl_table kern_table[] = {
>> .extra2 = &max_wakeup_granularity_ns,
>> },
>> {
>> - .procname = "sched_shares_ratelimit",
>> - .data = &sysctl_sched_shares_ratelimit,
>> - .maxlen = sizeof(unsigned int),
>> - .mode = 0644,
>> - .proc_handler = sched_proc_update_handler,
>> - .extra1 = &min_sched_shares_ratelimit,
>> - .extra2 = &max_sched_shares_ratelimit,
>> - },
>> - {
>> .procname = "sched_tunable_scaling",
>> .data = &sysctl_sched_tunable_scaling,
>> .maxlen = sizeof(enum sched_tunable_scaling),
>> @@ -325,14 +316,6 @@ static struct ctl_table kern_table[] = {
>> .extra2 = &max_sched_tunable_scaling,
>> },
>> {
>> - .procname = "sched_shares_thresh",
>> - .data = &sysctl_sched_shares_thresh,
>> - .maxlen = sizeof(unsigned int),
>> - .mode = 0644,
>> - .proc_handler = proc_dointvec_minmax,
>> - .extra1 = &zero,
>> - },
>> - {
>> .procname = "sched_migration_cost",
>> .data = &sysctl_sched_migration_cost,
>> .maxlen = sizeof(unsigned int),
>>
>> --
>>
>