It was observed that in __update_group_shares_cpu() rq_weight > aggregate()->rq_weight This is caused by forks/wakeups in between the initial aggregate pass and locking of the RQs for load balance. To avoid this situation partially re-do the aggregation once we have the RQs locked (which avoids new tasks from appearing). Signed-off-by: Peter Zijlstra --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -1703,6 +1703,11 @@ aggregate_get_up(struct task_group *tg, aggregate_group_set_shares(tg, cpu, sd); } +static void +aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd) +{ +} + static DEFINE_PER_CPU(spinlock_t, aggregate_lock); static void __init init_aggregate(void) @@ -1722,6 +1727,11 @@ static int get_aggregate(int cpu, struct return 1; } +static void update_aggregate(int cpu, struct sched_domain *sd) +{ + aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd); +} + static void put_aggregate(int cpu, struct sched_domain *sd) { spin_unlock(&per_cpu(aggregate_lock, cpu)); @@ -1743,6 +1753,10 @@ static inline int get_aggregate(int cpu, return 0; } +static inline void update_aggregate(int cpu, struct sched_domain *sd) +{ +} + static inline void put_aggregate(int cpu, struct sched_domain *sd) { } @@ -2180,6 +2194,12 @@ find_idlest_group(struct sched_domain *s int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + /* + * now that we have both rqs locked the rq weight won't change + * anymore - so update the stats. + */ + update_aggregate(this_cpu, sd); + do { unsigned long load, avg_load; int local_group; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/