Message-ID: <ZxJIDwHNzPkuyGrU@linux.ibm.com>
Date: Fri, 18 Oct 2024 17:05:43 +0530
From: Vishal Chourasia <vishalc@...ux.ibm.com>
To: Steve Wahl <steve.wahl@....com>
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
linux-kernel@...r.kernel.org, Russ Anderson <rja@....com>,
Dimitri Sivanich <sivanich@....com>
Subject: Re: [PATCH] sched/topology: improve topology_span_sane speed
On Thu, Oct 10, 2024 at 10:51:11AM -0500, Steve Wahl wrote:
> Use a different approach to topology_span_sane(), that checks for the
> same constraint of no partial overlaps for any two CPU sets for
> non-NUMA topology levels, but does so in a way that is O(N) rather
> than O(N^2).
>
> Instead of comparing with all other masks to detect collisions, keep
> one mask that includes all CPUs seen so far and detect collisions with
> a single cpumask_intersects test.
>
> If the current mask has no collisions with previously seen masks, it
> should be a new mask, which can be uniquely identified by the lowest
> bit set in this mask. Keep a pointer to this mask for future
> reference (in an array indexed by the lowest bit set), and add the
> CPUs in this mask to the list of those seen.
>
> If the current mask does collide with previously seen masks, it should
> be exactly equal to a mask seen before, looked up in the same array
> indexed by the lowest bit set in the mask, a single comparison.
>
> Move the topology_span_sane() check out of the existing topology level
> loop, let it use its own loop so that the array allocation can be done
> only once, shared across levels.
>
> On a system with 1920 processors (16 sockets, 60 cores, 2 threads),
> the average time to take one processor offline is reduced from 2.18
> seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took
> 34m49.765s without this change, 16m10.038s with this change in place.)
>
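The approach reads well to me. Just to check my understanding, for each
non-NUMA level this boils down to a single pass over cpu_map, something
like the simplified sketch below (leaving out the WARN_ON and the
masks[] allocation):

	cpumask_clear(covered);
	for_each_cpu(cpu, cpu_map) {
		/* lowest CPU in the span acts as its unique id */
		id = cpumask_first(tl->mask(cpu));

		if (!cpumask_intersects(tl->mask(cpu), covered)) {
			/* new, disjoint span: remember it under 'id' */
			masks[id] = tl->mask(cpu);
			cpumask_or(covered, tl->mask(cpu), covered);
		} else if (!masks[id] ||
			   !cpumask_equal(masks[id], tl->mask(cpu))) {
			/* partial overlap with an earlier span */
			goto notsane;
		}
	}
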
> Signed-off-by: Steve Wahl <steve.wahl@....com>
> ---
> kernel/sched/topology.c | 79 ++++++++++++++++++++++++++++-------------
> 1 file changed, 54 insertions(+), 25 deletions(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9748a4c8d668..c150074033d3 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2356,36 +2356,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
>
> /*
> * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
> - * any two given CPUs at this (non-NUMA) topology level.
> + * any two given CPUs on non-NUMA topology levels.
> */
> -static bool topology_span_sane(struct sched_domain_topology_level *tl,
> - const struct cpumask *cpu_map, int cpu)
> +static bool topology_span_sane(const struct cpumask *cpu_map)
> {
> - int i = cpu + 1;
> + struct sched_domain_topology_level *tl;
> + const struct cpumask **masks;
> + struct cpumask *covered;
> + int cpu, id;
> + bool ret = false;
>
> - /* NUMA levels are allowed to overlap */
> - if (tl->flags & SDTL_OVERLAP)
> - return true;
> + lockdep_assert_held(&sched_domains_mutex);
> + covered = sched_domains_tmpmask;
> +
> + masks = kmalloc_array(num_possible_cpus(), sizeof(struct cpumask *), GFP_KERNEL);
> + if (!masks)
> + return ret;
> +
> + for_each_sd_topology(tl) {
> +
> + /* NUMA levels are allowed to overlap */
> + if (tl->flags & SDTL_OVERLAP)
> + continue;
> +
> + cpumask_clear(covered);
> + memset(masks, 0, num_possible_cpus() * sizeof(struct cpumask *));
>
> - /*
> - * Non-NUMA levels cannot partially overlap - they must be either
> - * completely equal or completely disjoint. Otherwise we can end up
> - * breaking the sched_group lists - i.e. a later get_group() pass
> - * breaks the linking done for an earlier span.
> - */
> - for_each_cpu_from(i, cpu_map) {
> /*
> - * We should 'and' all those masks with 'cpu_map' to exactly
> - * match the topology we're about to build, but that can only
> - * remove CPUs, which only lessens our ability to detect
> - * overlaps
> + * Non-NUMA levels cannot partially overlap - they must be either
> + * completely equal or completely disjoint. Otherwise we can end up
> + * breaking the sched_group lists - i.e. a later get_group() pass
> + * breaks the linking done for an earlier span.
> */
> - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
> - cpumask_intersects(tl->mask(cpu), tl->mask(i)))
> - return false;
> + for_each_cpu(cpu, cpu_map) {
> + /* lowest bit set in this mask is used as a unique id */
> + id = cpumask_first(tl->mask(cpu));
> +
> + /* if this mask doesn't collide with what we've already seen */
> + if (!cpumask_intersects(tl->mask(cpu), covered)) {
> + /* this failing would be an error in this algorithm */
> + if (WARN_ON(masks[id]))
> + goto notsane;
> +
> + /* record the mask we saw for this id */
> + masks[id] = tl->mask(cpu);
> + cpumask_or(covered, tl->mask(cpu), covered);
> + } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) {
> + /*
> + * a collision with covered should have exactly matched
> + * a previously seen mask with the same id
> + */
> + goto notsane;
> + }
> + }
> }
> + ret = true;
>
> - return true;
> + notsane:
> + kfree(masks);
> + return ret;
> }
>
> /*
> @@ -2417,9 +2446,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> sd = NULL;
> for_each_sd_topology(tl) {
>
> - if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
> - goto error;
> -
> sd = build_sched_domain(tl, cpu_map, attr, sd, i);
>
> has_asym |= sd->flags & SD_ASYM_CPUCAPACITY;
> @@ -2433,6 +2459,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> }
> }
>
> + if (WARN_ON(!topology_span_sane(cpu_map)))
> + goto error;

Hi Steve,

Is there any reason why the above check is done after the sched domain
structures have been initialized for all the CPUs in cpu_map?

It looks to me like this check could be performed before the call to
__visit_domain_allocation_hell() in build_sched_domains(), resulting in
an early return if topology_span_sane() detects an incorrect topology.
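
Something along these lines (untested; assuming alloc_state is still
sa_none at that point, so the existing error path is effectively a
no-op):

	/* Fail early, before any sched domain allocations are done. */
	if (WARN_ON(!topology_span_sane(cpu_map)))
		goto error;

	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
	if (alloc_state != sa_rootdomain)
		goto error;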

Also, the error path in the current code only cleans up the d->rd
struct, keeping all the work done by build_sched_domain() inside the
loop and by __sdt_alloc() called from __visit_domain_allocation_hell().
Is it because we need all that work to remain intact?

For reference:
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
				 const struct cpumask *cpu_map)
{
	switch (what) {
	case sa_rootdomain:
		if (!atomic_read(&d->rd->refcount))
			free_rootdomain(&d->rd->rcu);
		fallthrough;
	case sa_sd:
		free_percpu(d->sd);
		fallthrough;
	case sa_sd_storage:
		__sdt_free(cpu_map);
		fallthrough;
	case sa_none:
		break;
	}
}
> +
> /* Build the groups for the domains */
> for_each_cpu(i, cpu_map) {
> for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
> --
> 2.26.2
>