Performance improvements with this patch: "lat_ctx -s 0 2" ~22usec (before-this-patch) ~5usec (after-this-patch) There are a number of things wrong with the select_idle_sibling() logic a) Once we select the idle sibling, we use that domain (spanning the cpu that the task is currently woken-up and the idle sibling that we found) in our wake_affine() comparisons. This domain is completely different from the domain (we are supposed to use) that spans the cpu that the task is currently woken-up and the cpu where the task previously ran. b) We do select_idle_sibling() check only for the cpu that the task is currently woken-up on. If the wake_affine makes the decision of selecting the cpu where the task previously ran, doing a select_idle_sibling() check for that cpu also helps and we don't do this currently. c) Also, select_idle_sibling() should also treat the current cpu as an idle cpu if it is a sync wakeup and we have only one task running. Fixing all this improves the lat_ctx performance. Also, there might be other workloads where select_idle_sibling() check on previously ran cpu will also help. Signed-off-by: Suresh Siddha --- kernel/sched_fair.c | 73 +++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 31 deletions(-) Index: tip/kernel/sched_fair.c =================================================================== --- tip.orig/kernel/sched_fair.c +++ tip/kernel/sched_fair.c @@ -1411,28 +1411,49 @@ find_idlest_cpu(struct sched_group *grou * Try and locate an idle CPU in the sched_domain. */ static int -select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +select_idle_sibling(struct task_struct *p, int target, int sync) { int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int i; + struct sched_domain *sd; + + /* + * If the task is going to be woken-up on this cpu and if it is + * already idle or going to be idle, then it is the right target. 
+ */ + if (target == cpu && (!cpu_rq(cpu)->cfs.nr_running || + (sync && cpu_rq(cpu)->cfs.nr_running == 1))) + return cpu; /* - * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE - * test in select_task_rq_fair) and the prev_cpu is idle then that's - * always a better target than the current cpu. + * If the task is going to be woken-up on the cpu where it previously + * ran and if it is currently idle, then it is the right target. */ - if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + if (target == prev_cpu && !cpu_rq(prev_cpu)->cfs.nr_running) return prev_cpu; /* - * Otherwise, iterate the domain and find an elegible idle cpu. + * Otherwise, iterate the domains and find an eligible idle cpu. */ - for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (!cpu_rq(i)->cfs.nr_running) { - target = i; + for_each_domain(target, sd) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; + + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } } + + /* + * Let's stop looking for an idle sibling when we reach + * the domain that spans the current cpu and prev_cpu. + */ + if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && + cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + break; } return target; @@ -1496,32 +1517,17 @@ static int select_task_rq_fair(struct ta /* * While iterating the domains looking for a spanning - * WAKE_AFFINE domain, adjust the affine target to any idle cpu - * in cache sharing domains along the way. + * WAKE_AFFINE domain. */ if (want_affine) { - int target = -1; - /* * If both cpu and prev_cpu are part of this domain, * cpu is a valid SD_WAKE_AFFINE target. */ - if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) - target = cpu; - - /* - * If there's an idle sibling in this domain, make that - * the wake_affine target instead of the current cpu. 
- */ - if (tmp->flags & SD_SHARE_PKG_RESOURCES) - target = select_idle_sibling(p, tmp, target); - - if (target >= 0) { - if (tmp->flags & SD_WAKE_AFFINE) { - affine_sd = tmp; - want_affine = 0; - } - cpu = target; + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)) + && (tmp->flags & SD_WAKE_AFFINE)) { + affine_sd = tmp; + want_affine = 0; } } @@ -1549,8 +1555,13 @@ static int select_task_rq_fair(struct ta update_shares(tmp); } - if (affine_sd && wake_affine(affine_sd, p, sync)) - return cpu; + if (affine_sd) { + int target; + + target = wake_affine(affine_sd, p, sync) ? cpu : prev_cpu; + + return select_idle_sibling(p, target, sync); + } while (sd) { int load_idx = sd->forkexec_idx; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/