Currently the lb_monitor will walk all the domains/cpus from a single
cpu's timer interrupt. This will cause massive cache-thrashing and
cache-line bouncing on larger machines.

Split the lb_monitor into the root_domain (disjoint sched-domains), so
that each monitor's timer only walks the cpus in its own root domain.

Signed-off-by: Peter Zijlstra
CC: Gregory Haskins
---
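To make the intent concrete, here is a minimal user-space model of the
new structure (illustrative only; the types and names below are
simplified stand-ins for the kernel's struct root_domain and struct
lb_monitor, and a bitmask stands in for cpumask_t). Each root domain
owns its own monitor, and a monitor tick only ever touches cpus in its
own span, so disjoint partitions no longer bounce each other's cache
lines:

#include <stdio.h>

#define NR_CPUS 8

/* Simplified stand-in for the kernel's struct lb_monitor. */
struct lb_monitor {
        int timeout_ms;                 /* models lb_monitor->timeout */
};

/* Simplified stand-in for the kernel's struct root_domain. */
struct root_domain {
        unsigned long span;             /* bitmask of cpus in this partition */
        struct lb_monitor lb_monitor;   /* per root domain, not global */
};

/* The periodic tick walks only the cpus of its own root domain. */
static void lb_monitor_tick(struct root_domain *rd)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!(rd->span & (1UL << cpu)))
                        continue;       /* never touch another partition */
                printf("rebalance shares on cpu %d\n", cpu);
        }
}

int main(void)
{
        /* Two disjoint partitions, e.g. as created by cpusets. */
        struct root_domain rd0 = { .span = 0x0f, .lb_monitor = { 8 } };
        struct root_domain rd1 = { .span = 0xf0, .lb_monitor = { 8 } };

        lb_monitor_tick(&rd0);          /* walks cpus 0-3 only */
        lb_monitor_tick(&rd1);          /* walks cpus 4-7 only */
        return 0;
}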
 kernel/sched.c      |  106 ++++++++++++++++++++++++++++------------------------
 kernel/sched_fair.c |    2
 2 files changed, 59 insertions(+), 49 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -357,8 +357,6 @@ struct lb_monitor {
         spinlock_t lock;
 };
 
-static struct lb_monitor lb_monitor;
-
 /*
  * How frequently should we rebalance_shares() across cpus?
  *
@@ -417,6 +415,9 @@ static void lb_monitor_wake(struct lb_mo
         if (hrtimer_active(&lb_monitor->timer))
                 return;
 
+        /*
+         * XXX: rd->load_balance && weight(rd->span) > 1
+         */
         if (nr_cpu_ids == 1)
                 return;
 
@@ -444,6 +445,11 @@ static void lb_monitor_init(struct lb_mo
 
         spin_lock_init(&lb_monitor->lock);
 }
+
+static int lb_monitor_destroy(struct lb_monitor *lb_monitor)
+{
+        return hrtimer_cancel(&lb_monitor->timer);
+}
 #endif
 
 static void set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -607,6 +613,8 @@ struct root_domain {
          */
         cpumask_t rto_mask;
         atomic_t rto_count;
+
+        struct lb_monitor lb_monitor;
 };
 
 /*
@@ -6328,6 +6336,7 @@ static void rq_attach_root(struct rq *rq
 {
         unsigned long flags;
         const struct sched_class *class;
+        int active = 0;
 
         spin_lock_irqsave(&rq->lock, flags);
 
@@ -6342,8 +6351,14 @@ static void rq_attach_root(struct rq *rq
                 cpu_clear(rq->cpu, old_rd->span);
                 cpu_clear(rq->cpu, old_rd->online);
 
-                if (atomic_dec_and_test(&old_rd->refcount))
+                if (atomic_dec_and_test(&old_rd->refcount)) {
+                        /*
+                         * sync with active timers.
+                         */
+                        active = lb_monitor_destroy(&old_rd->lb_monitor);
+
                         kfree(old_rd);
+                }
         }
 
         atomic_inc(&rd->refcount);
@@ -6358,6 +6373,9 @@ static void rq_attach_root(struct rq *rq
                 class->join_domain(rq);
         }
 
+        if (active)
+                lb_monitor_wake(&rd->lb_monitor);
+
         spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -6367,6 +6385,8 @@ static void init_rootdomain(struct root_
 
         cpus_clear(rd->span);
         cpus_clear(rd->online);
+
+        lb_monitor_init(&rd->lb_monitor);
 }
 
 static void init_defrootdomain(void)
@@ -7398,10 +7418,6 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
         init_defrootdomain();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-        lb_monitor_init(&lb_monitor);
-#endif
 #endif
 
         init_rt_bandwidth(&def_rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
@@ -7631,11 +7647,11 @@ void set_curr_task(int cpu, struct task_
  * distribute shares of all task groups among their schedulable entities,
  * to reflect load distribution across cpus.
  */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+static int rebalance_shares(struct root_domain *rd, int this_cpu)
 {
         struct cfs_rq *cfs_rq;
         struct rq *rq = cpu_rq(this_cpu);
-        cpumask_t sdspan = sd->span;
+        cpumask_t sdspan = rd->span;
         int state = shares_idle;
 
         /* Walk thr' all the task groups that we have */
@@ -7685,50 +7701,12 @@ static int rebalance_shares(struct sched
         return state;
 }
 
-static int load_balance_shares(struct lb_monitor *lb_monitor)
+static void set_lb_monitor_timeout(struct lb_monitor *lb_monitor, int state)
 {
-        int i, cpu, state = shares_idle;
         u64 max_timeout = (u64)sysctl_sched_max_bal_int_shares * NSEC_PER_MSEC;
         u64 min_timeout = (u64)sysctl_sched_min_bal_int_shares * NSEC_PER_MSEC;
         u64 timeout;
 
-        /* Prevent cpus going down or coming up */
-        /* get_online_cpus(); */
-        /* lockout changes to doms_cur[] array */
-        /* lock_doms_cur(); */
-        /*
-         * Enter a rcu read-side critical section to safely walk rq->sd
-         * chain on various cpus and to walk task group list
-         * (rq->leaf_cfs_rq_list) in rebalance_shares().
-         */
-        rcu_read_lock();
-
-        for (i = 0; i < ndoms_cur; i++) {
-                cpumask_t cpumap = doms_cur[i];
-                struct sched_domain *sd = NULL, *sd_prev = NULL;
-
-                cpu = first_cpu(cpumap);
-
-                /* Find the highest domain at which to balance shares */
-                for_each_domain(cpu, sd) {
-                        if (!(sd->flags & SD_LOAD_BALANCE))
-                                continue;
-                        sd_prev = sd;
-                }
-
-                sd = sd_prev;
-                /* sd == NULL? No load balance reqd in this domain */
-                if (!sd)
-                        continue;
-
-                state = max(state, rebalance_shares(sd, cpu));
-        }
-
-        rcu_read_unlock();
-
-        /* unlock_doms_cur(); */
-        /* put_online_cpus(); */
-
         timeout = ktime_to_ns(lb_monitor->timeout);
         switch (state) {
         case shares_balanced:
@@ -7741,6 +7719,38 @@ static int load_balance_shares(struct lb
                 break;
         }
         lb_monitor->timeout = ns_to_ktime(timeout);
+}
+
+static int load_balance_shares(struct lb_monitor *lb_monitor)
+{
+        int cpu = smp_processor_id();
+        struct rq *rq = cpu_rq(cpu);
+        struct root_domain *rd;
+        struct sched_domain *sd, *sd_prev = NULL;
+        int state = shares_idle;
+
+        spin_lock(&rq->lock);
+        /*
+         * root_domain will stay valid until timer exits - synchronized by
+         * hrtimer_cancel().
+         */
+        rd = rq->rd;
+        spin_unlock(&rq->lock);
+
+        /*
+         * complicated way to find rd->load_balance
+         */
+        rcu_read_lock();
+        for_each_domain(cpu, sd) {
+                if (!(sd->flags & SD_LOAD_BALANCE))
+                        continue;
+                sd_prev = sd;
+        }
+        if (sd_prev)
+                state = max(state, rebalance_shares(rd, cpu));
+        rcu_read_unlock();
+
+        set_lb_monitor_timeout(lb_monitor, state);
 
         return state;
 }

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -553,7 +553,7 @@ account_entity_enqueue(struct cfs_rq *cf
         se->on_rq = 1;
         list_add(&se->group_node, &cfs_rq->tasks);
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
-        lb_monitor_wake(&lb_monitor);
+        lb_monitor_wake(&rq_of(cfs_rq)->rd->lb_monitor);
 #endif
 }
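A note on the hand-off in rq_attach_root() above: hrtimer_cancel()
returns whether the timer was active (and waits for a running callback
to finish), which lets the patch both synchronize with a monitor still
running on the dying root domain and decide whether the new root
domain's monitor needs waking. A runnable user-space sketch of that
cancel-and-rewake pattern (the monitor type and function names are
hypothetical; a bool models the hrtimer state):

#include <stdbool.h>
#include <stdio.h>

struct monitor {
        bool timer_active;
};

/* Models hrtimer_cancel(): stop the timer, report whether it was active. */
static bool monitor_destroy(struct monitor *m)
{
        bool was_active = m->timer_active;
        m->timer_active = false;
        return was_active;
}

/* Models lb_monitor_wake(): (re)arm the monitor's timer. */
static void monitor_wake(struct monitor *m)
{
        m->timer_active = true;
}

int main(void)
{
        struct monitor old_rd_mon = { .timer_active = true };
        struct monitor new_rd_mon = { .timer_active = false };

        /* Re-arm the new root domain's monitor only if the old one ran. */
        if (monitor_destroy(&old_rd_mon))
                monitor_wake(&new_rd_mon);

        printf("new monitor active: %d\n", new_rd_mon.timer_active);
        return 0;
}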