[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <861e15234270eb3678390da2b0ddf3a7162d98dd.camel@linux.intel.com>
Date: Mon, 29 Sep 2025 15:18:53 -0700
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: "Chen, Yu C" <yu.c.chen@...el.com>, Peter Zijlstra
<peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>
Cc: Juri Lelli <juri.lelli@...hat.com>, Dietmar Eggemann
<dietmar.eggemann@....com>, Ben Segall <bsegall@...gle.com>, Mel Gorman
<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, Tim Chen
<tim.c.chen@...el.com>, Vincent Guittot <vincent.guittot@...aro.org>, Len
Brown <len.brown@...el.com>, linux-kernel@...r.kernel.org, K Prateek Nayak
<kprateek.nayak@....com>, "Gautham R . Shenoy" <gautham.shenoy@....com>,
Zhao Liu <zhao1.liu@...el.com>, Vinicius Costa Gomes
<vinicius.gomes@...el.com>, Arjan Van De Ven <arjan.van.de.ven@...el.com>
Subject: Re: [PATCH v4 1/2] sched: Create architecture specific sched domain
distances
On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
> [snip]
>
> > @@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
> > enum numa_topology_type sched_numa_topology_type;
> >
> > static int sched_domains_numa_levels;
> > +static int sched_numa_node_levels;
>
> I agree that the benefit of maintaining two NUMA distances - one for the
> sched_domain and another for the NUMA balancing/page allocation policy - is
> to avoid complicating the sched_domain hierarchy while preserving the
> advantages of NUMA locality.
>
> Meanwhile, I wonder if we could also add a "orig" prefix to the original
> NUMA distance. This way, we can quickly understand its meaning later.
> For example,
> sched_orig_node_levels
> sched_orig_node_distance
I am not sure adding orig will make the meaning any clearer.
I can add comments to note that
sched_numa_node_distance mean the node distance between numa nodes
sched_numa_nodel_levels mean the number of unique distances between numa nodes
>
> > static int sched_domains_curr_level;
> >
> > int sched_max_numa_distance;
> > static int *sched_domains_numa_distance;
> > +static int *sched_numa_node_distance;
> > static struct cpumask ***sched_domains_numa_masks;
> > #endif /* CONFIG_NUMA */
> >
> > @@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
> > return true;
> >
> > rcu_read_lock();
> > - distances = rcu_dereference(sched_domains_numa_distance);
> > + distances = rcu_dereference(sched_numa_node_distance);
> > if (!distances)
> > goto unlock;
> > - for (i = 0; i < sched_domains_numa_levels; i++) {
> > + for (i = 0; i < sched_numa_node_levels; i++) {
> > if (distances[i] == distance) {
> > found = true;
> > break;
> > @@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
> >
> > #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
> >
> > -void sched_init_numa(int offline_node)
> > +/*
> > + * An architecture could modify its NUMA distance, to change
> > + * grouping of NUMA nodes and number of NUMA levels when creating
> > + * NUMA level sched domains.
> > + *
> > + * A NUMA level is created for each unique
> > + * arch_sched_node_distance.
> > + */
> > +static bool __modified_sched_node_dist = true;
> > +
> > +int __weak arch_sched_node_distance(int from, int to)
> > {
> > - struct sched_domain_topology_level *tl;
> > - unsigned long *distance_map;
> > + if (__modified_sched_node_dist)
> > + __modified_sched_node_dist = false;
> > +
> > + return node_distance(from, to);
> > +}
> > +
> > +static bool modified_sched_node_distance(void)
> > +{
> > + /*
> > + * Call arch_sched_node_distance()
> > + * to determine if arch_sched_node_distance
> > + * has been modified from node_distance()
> > + * to arch specific distance.
> > + */
> > + arch_sched_node_distance(0, 0);
> > + return __modified_sched_node_dist;
> > +}
> > +
>
> If our goal is to figure out whether the arch_sched_node_distance()
> has been overridden, how about the following alias?
>
> int __weak arch_sched_node_distance(int from, int to)
> {
> return __node_distance(from, to);
> }
> int arch_sched_node_distance_original(int from, int to) __weak
> __alias(arch_sched_node_distance);
>
> static bool arch_sched_node_distance_is_overridden(void)
> {
> return arch_sched_node_distance != arch_sched_node_distance_original;
> }
>
> so arch_sched_node_distance_is_overridden() can replace
> modified_sched_node_distance()
>
I think that the alias version will still point to the replaced function and not
the originally defined one.
How about not using __weak and just explicitly define arch_sched_node_distance
as a function pointer. Change the code like below.
Tim
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d6b772990ec2..12db78af09d5 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -545,7 +545,7 @@ static int avg_remote_numa_distance(void)
return sched_avg_remote_distance;
}
-int arch_sched_node_distance(int from, int to)
+static int x86_arch_sched_node_distance(int from, int to)
{
int d = node_distance(from, to);
@@ -918,6 +918,9 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
if (apic->wakeup_secondary_cpu_64)
start_ip = real_mode_header->trampoline_start64;
+#endif
+#ifdef CONFIG_NUMA
+ arch_sched_node_distance = x86_arch_sched_node_distance;
#endif
idle->thread.sp = (unsigned long)task_pt_regs(idle);
initial_code = (unsigned long)start_secondary;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2d2d29553df8..3549c4a19816 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -56,7 +56,7 @@ static inline int cpu_numa_flags(void)
{
return SD_NUMA;
}
-extern int arch_sched_node_distance(int from, int to);
+extern int (*arch_sched_node_distance)(int, int);
#endif
extern int arch_asym_cpu_priority(int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f25e4402c63e..7cfb7422e9d4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1897,26 +1897,17 @@ static void init_numa_topology_type(int offline_node)
* A NUMA level is created for each unique
* arch_sched_node_distance.
*/
-static bool __modified_sched_node_dist = true;
-int __weak arch_sched_node_distance(int from, int to)
+static int default_sched_node_distance(int from, int to)
{
- if (__modified_sched_node_dist)
- __modified_sched_node_dist = false;
-
return node_distance(from, to);
}
+int (*arch_sched_node_distance)(int, int) = default_sched_node_distance;
+
static bool modified_sched_node_distance(void)
{
- /*
- * Call arch_sched_node_distance()
- * to determine if arch_sched_node_distance
- * has been modified from node_distance()
- * to arch specific distance.
- */
- arch_sched_node_distance(0, 0);
- return __modified_sched_node_dist;
+ return arch_sched_node_distance != default_sched_node_distance;
}
static int numa_node_dist(int i, int j)
Powered by blists - more mailing lists