cpu_power is supposed to be a representation of the process capacity of the cpu, not a value to randomly tweak in order to affect placement. Remove the placement hacks and add SMT gain. Signed-off-by: Peter Zijlstra LKML-Reference: --- include/linux/sched.h | 1 + include/linux/topology.h | 1 + kernel/sched.c | 34 ++++++++++++++++++---------------- 3 files changed, 20 insertions(+), 16 deletions(-) Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -930,6 +930,7 @@ struct sched_domain { unsigned int newidle_idx; unsigned int wake_idx; unsigned int forkexec_idx; + unsigned int smt_gain; int flags; /* See SD_* */ enum sched_domain_level level; Index: linux-2.6/include/linux/topology.h =================================================================== --- linux-2.6.orig/include/linux/topology.h +++ linux-2.6/include/linux/topology.h @@ -99,6 +99,7 @@ int arch_update_cpu_topology(void); | SD_SHARE_CPUPOWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ + .smt_gain = 1178, /* 15% */ \ } #endif #endif /* CONFIG_SCHED_SMT */ Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -8468,15 +8468,13 @@ static void free_sched_groups(const stru * there are asymmetries in the topology. If there are asymmetries, group * having more cpu_power will pickup more load compared to the group having * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain. */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_domain *child; struct sched_group *group; + long power; + int weight; WARN_ON(!sd || !sd->groups); @@ -8487,22 +8485,26 @@ static void init_sched_groups_power(int sd->groups->__cpu_power = 0; - /* - * For perf policy, if the groups in child domain share resources - * (for example cores sharing some portions of the cache hierarchy - * or SMT), then set this domain groups cpu_power such that each group - * can handle only one task, when there are other idle groups in the - * same sched domain. - */ - if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && - (child->flags & - (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); + if (!child) { + power = SCHED_LOAD_SCALE; + weight = cpumask_weight(sched_domain_span(sd)); + /* + * SMT siblings share the power of a single core. + * Usually multiple threads get a better yield out of + * that one core than a single thread would have, + * reflect that in sd->smt_gain. + */ + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) + power *= sd->smt_gain; + power /= weight; + power >>= SCHED_LOAD_SHIFT; + } + sg_inc_cpu_power(sd->groups, power); return; } /* - * add cpu_power of each child group to this groups cpu_power + * Add cpu_power of each child group to this groups cpu_power. */ group = child->groups; do { -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/