Message-ID: <1338501826.28384.133.camel@twins>
Date: Fri, 01 Jun 2012 00:03:46 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: David Rientjes <rientjes@...gle.com>
Cc: Ingo Molnar <mingo@...nel.org>, hpa@...or.com,
linux-kernel@...r.kernel.org,
Linus Torvalds <torvalds@...ux-foundation.org>, pjt@...gle.com,
cl@...ux.com, riel@...hat.com, bharata.rao@...il.com,
Andrew Morton <akpm@...ux-foundation.org>,
Lee.Schermerhorn@...com, aarcange@...hat.com, danms@...ibm.com,
suresh.b.siddha@...el.com, tglx@...utronix.de,
linux-tip-commits@...r.kernel.org
Subject: Re: [tip:sched/numa] sched/numa: Introduce sys_numa_{t,m}bind()
On Fri, 2012-05-25 at 10:35 +0200, Peter Zijlstra wrote:
> What does the node distance table on that thing look like?

The patch below makes it boot and, I think, even balance correctly. But
I'm not happy with it, as I think it can be done more simply. The
resulting domain setup isn't minimal either.
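
For reference, the kind of setup that trips this up is an asymmetric
node distance table, i.e. one where not every remote node is equally
far away. A hypothetical 4-node example (purely illustrative, not the
reported machine) might look like this:

/*
 * Hypothetical SLIT-style node_distance() table.  Nodes 1 and 2 see
 * every other node at distance <= 20, so their NUMA domain trees top
 * out one level earlier than those of nodes 0 and 3, which need an
 * extra level to cover the distance-30 pair {0,3}.  That unequal
 * depth is what the iteration mask below has to cope with.
 */
static const int example_node_distance[4][4] = {
	/*         0   1   2   3 */
	/* 0 */ { 10, 20, 20, 30 },
	/* 1 */ { 20, 10, 20, 20 },
	/* 2 */ { 20, 20, 10, 20 },
	/* 3 */ { 30, 20, 20, 10 },
};
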
---
Subject:
From: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Date: Thu May 31 14:47:33 CEST 2012
Signed-off-by: Peter Zijlstra <a.p.zijlstra@...llo.nl>
---
include/linux/sched.h | 11 ++++++++
kernel/sched/core.c | 64 +++++++++++++++++++++++++++++++++++++++++++-------
kernel/sched/fair.c | 5 ++-
kernel/sched/sched.h | 2 +
4 files changed, 72 insertions(+), 10 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -878,6 +878,8 @@ struct sched_group_power {
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
};
struct sched_group {
@@ -902,6 +904,15 @@ static inline struct cpumask *sched_grou
return to_cpumask(sg->cpumask);
}
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgp->cpumask);
+}
+
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6008,6 +6008,44 @@ struct sched_domain_topology_level {
struct sd_data data;
};
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -6026,6 +6064,12 @@ build_overlap_sched_groups(struct sched_
if (cpumask_test_cpu(i, covered))
continue;
+ child = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ continue;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(cpu));
@@ -6033,8 +6077,6 @@ build_overlap_sched_groups(struct sched_
goto fail;
sg_span = sched_group_cpus(sg);
-
- child = *per_cpu_ptr(sdd->sd, i);
if (child->child) {
child = child->child;
cpumask_copy(sg_span, sched_domain_span(child));
@@ -6044,13 +6086,18 @@ build_overlap_sched_groups(struct sched_
cpumask_or(covered, covered, sg_span);
sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- atomic_inc(&sg->sgp->ref);
+ if (atomic_inc_return(&sg->sgp->ref) == 1)
+ build_group_mask(sd, sg);
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- cpumask_first(sg_span) == cpu) {
- WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+ group_balance_cpu(sg) == cpu)
groups = sg;
- }
if (!first)
first = sg;
@@ -6123,6 +6170,7 @@ build_sched_groups(struct sched_domain *
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
+ cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6164,7 +6212,7 @@ static void init_sched_groups_power(int
sg = sg->next;
} while (sg != sd->groups);
- if (cpu != group_first_cpu(sg))
+ if (cpu != group_balance_cpu(sg))
return;
update_group_power(sd, cpu);
@@ -6572,7 +6620,7 @@ static int __sdt_alloc(const struct cpum
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power),
+ sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgp)
return -ENOMEM;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3846,7 +3846,7 @@ static inline void update_sg_lb_stats(st
int i;
if (local_group)
- balance_cpu = group_first_cpu(group);
+ balance_cpu = group_balance_cpu(group);
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
@@ -3861,7 +3861,8 @@ static inline void update_sg_lb_stats(st
/* Bias balancing toward cpus of our domain */
if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
+ if (idle_cpu(i) && !first_idle_cpu &&
+ cpumask_test_cpu(i, sched_group_mask(group))) {
first_idle_cpu = 1;
balance_cpu = i;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -542,6 +542,8 @@ DECLARE_PER_CPU(struct sched_domain *, s
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_node);
+extern int group_balance_cpu(struct sched_group *sg);
+
#endif /* CONFIG_SMP */
#include "stats.h"
--