Move the nr_busy_cpus thing from its hacky sd->parent->groups->sgc location into the much more natural sched_domain_shared location. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++----- kernel/sched/fair.c | 22 ++++++++++++---------- kernel/sched/sched.h | 6 +----- kernel/time/tick-sched.c | 10 +++++----- 5 files changed, 24 insertions(+), 25 deletions(-) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1059,6 +1059,7 @@ struct sched_group; struct sched_domain_shared { atomic_t ref; + atomic_t nr_busy_cpus; }; struct sched_domain { --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5866,14 +5866,14 @@ static void destroy_sched_domains(struct DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { + struct sched_domain_shared *sds = NULL; struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -5881,13 +5881,13 @@ static void update_top_cache_domain(int if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - busy_sd = sd->parent; /* sd_busy */ + sds = sd->shared; } - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -6184,7 +6184,6 @@ static void init_sched_groups_capacity(i return; update_group_capacity(sd, cpu); - atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -6388,6 +6387,7 @@ sd_init(struct sched_domain_topology_lev sd->shared = *per_cpu_ptr(sdd->sds, sd_id); atomic_inc(&sd->shared->ref); + atomic_set(&sd->shared->nr_busy_cpus, sd_weight); #ifdef CONFIG_NUMA } else if (sd->flags & SD_NUMA) { --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7842,13 +7842,13 @@ static inline void set_cpu_sd_state_busy int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->groups->sgc->nr_busy_cpus); + atomic_inc(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -7859,13 +7859,13 @@ void set_cpu_sd_state_idle(void) int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->groups->sgc->nr_busy_cpus); + atomic_dec(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -8092,8 +8092,8 @@ static void nohz_idle_balance(struct rq static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; + struct sched_domain_shared *sds; struct sched_domain *sd; - struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; bool kick = false; @@ -8121,11 +8121,13 @@ static inline bool nohz_kick_needed(stru return true; rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { - sgc = sd->groups->sgc; - nr_busy = atomic_read(&sgc->nr_busy_cpus); - + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * XXX: write a coherent comment on why we do this. + * See also: http:lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { kick = true; goto unlock; --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -856,8 +856,8 @@ static inline struct sched_domain *lowes DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { @@ -869,10 +869,6 @@ struct sched_group_capacity { unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ - /* - * Number of busy cpus in this group. - */ - atomic_t nr_busy_cpus; unsigned long cpumask[0]; /* iteration mask */ }; --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -933,11 +933,11 @@ void tick_nohz_idle_enter(void) WARN_ON_ONCE(irqs_disabled()); /* - * Update the idle state in the scheduler domain hierarchy - * when tick_nohz_stop_sched_tick() is called from the idle loop. - * State will be updated to busy during the first busy tick after - * exiting idle. - */ + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ set_cpu_sd_state_idle(); local_irq_disable();