[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <79db86a1-dc75-42ca-9cff-927539dc2104@intel.com>
Date: Tue, 30 Sep 2025 10:28:47 +0800
From: "Chen, Yu C" <yu.c.chen@...el.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>, Peter Zijlstra
<peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>
CC: Juri Lelli <juri.lelli@...hat.com>, Dietmar Eggemann
<dietmar.eggemann@....com>, Ben Segall <bsegall@...gle.com>, Mel Gorman
<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, Tim Chen
<tim.c.chen@...el.com>, Vincent Guittot <vincent.guittot@...aro.org>, "Len
Brown" <len.brown@...el.com>, <linux-kernel@...r.kernel.org>, K Prateek Nayak
<kprateek.nayak@....com>, "Gautham R . Shenoy" <gautham.shenoy@....com>,
"Zhao Liu" <zhao1.liu@...el.com>, Vinicius Costa Gomes
<vinicius.gomes@...el.com>, Arjan Van De Ven <arjan.van.de.ven@...el.com>
Subject: Re: [PATCH v4 1/2] sched: Create architecture specific sched domain
distances
On 9/30/2025 6:18 AM, Tim Chen wrote:
> On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
>> [snip]
>>
>>> @@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
>>> enum numa_topology_type sched_numa_topology_type;
>>>
>>> static int sched_domains_numa_levels;
>>> +static int sched_numa_node_levels;
>>
>> I agree that the benefit of maintaining two NUMA distances - one for the
>> sched_domain and another for the NUMA balancing/page allocation policy - is
>> to avoid complicating the sched_domain hierarchy while preserving the
>> advantages of NUMA locality.
>>
>> Meanwhile, I wonder if we could also add a "orig" prefix to the original
>> NUMA distance. This way, we can quickly understand its meaning later.
>> For example,
>> sched_orig_node_levels
>> sched_orig_node_distance
>
> I am not sure adding orig will make the meaning any clearer.
> I can add comments to note that
>
> sched_numa_node_distance mean the node distance between numa nodes
> sched_numa_nodel_levels mean the number of unique distances between numa nodes
>
OK, looks good to me.
>>
>>> static int sched_domains_curr_level;
>>>
>>> int sched_max_numa_distance;
>>> static int *sched_domains_numa_distance;
>>> +static int *sched_numa_node_distance;
>>> static struct cpumask ***sched_domains_numa_masks;
>>> #endif /* CONFIG_NUMA */
>>>
>>> @@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
>>> return true;
>>>
>>> rcu_read_lock();
>>> - distances = rcu_dereference(sched_domains_numa_distance);
>>> + distances = rcu_dereference(sched_numa_node_distance);
>>> if (!distances)
>>> goto unlock;
>>> - for (i = 0; i < sched_domains_numa_levels; i++) {
>>> + for (i = 0; i < sched_numa_node_levels; i++) {
>>> if (distances[i] == distance) {
>>> found = true;
>>> break;
>>> @@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
>>>
>>> #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
>>>
>>> -void sched_init_numa(int offline_node)
>>> +/*
>>> + * An architecture could modify its NUMA distance, to change
>>> + * grouping of NUMA nodes and number of NUMA levels when creating
>>> + * NUMA level sched domains.
>>> + *
>>> + * A NUMA level is created for each unique
>>> + * arch_sched_node_distance.
>>> + */
>>> +static bool __modified_sched_node_dist = true;
>>> +
>>> +int __weak arch_sched_node_distance(int from, int to)
>>> {
>>> - struct sched_domain_topology_level *tl;
>>> - unsigned long *distance_map;
>>> + if (__modified_sched_node_dist)
>>> + __modified_sched_node_dist = false;
>>> +
>>> + return node_distance(from, to);
>>> +}
>>> +
>>> +static bool modified_sched_node_distance(void)
>>> +{
>>> + /*
>>> + * Call arch_sched_node_distance()
>>> + * to determine if arch_sched_node_distance
>>> + * has been modified from node_distance()
>>> + * to arch specific distance.
>>> + */
>>> + arch_sched_node_distance(0, 0);
>>> + return __modified_sched_node_dist;
>>> +}
>>> +
>>
>> If our goal is to figure out whether the arch_sched_node_distance()
>> has been overridden, how about the following alias?
>>
>> int __weak arch_sched_node_distance(int from, int to)
>> {
>> return __node_distance(from, to);
>> }
>> int arch_sched_node_distance_original(int from, int to) __weak
>> __alias(arch_sched_node_distance);
>>
>> static bool arch_sched_node_distance_is_overridden(void)
>> {
>> return arch_sched_node_distance != arch_sched_node_distance_original;
>> }
>>
>> so arch_sched_node_distance_is_overridden() can replace
>> modified_sched_node_distance()
>>
>
> I think that the alias version will still point to the replaced function and not
> the originally defined one.
>
> How about not using __weak and just explicitly define arch_sched_node_distance
> as a function pointer. Change the code like below.
>
The arch_sched_node_distance_original is defined as __weak, so it
should point to the old function even if the function has been
overridden. I did a test on a X86 VM and it seems to be so.
But using the arch_sched_node_distance as a function point
should also be OK.
> Tim
>
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index d6b772990ec2..12db78af09d5 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -545,7 +545,7 @@ static int avg_remote_numa_distance(void)
> return sched_avg_remote_distance;
> }
>
> -int arch_sched_node_distance(int from, int to)
> +static int x86_arch_sched_node_distance(int from, int to)
> {
> int d = node_distance(from, to);
>
> @@ -918,6 +918,9 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
> /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
> if (apic->wakeup_secondary_cpu_64)
> start_ip = real_mode_header->trampoline_start64;
> +#endif
> +#ifdef CONFIG_NUMA
> + arch_sched_node_distance = x86_arch_sched_node_distance;
> #endif
Above might be called for several APs, maybe we can just call it
once in smp_prepare_cpus_common().
thanks,
Chenyu
> idle->thread.sp = (unsigned long)task_pt_regs(idle);
> initial_code = (unsigned long)start_secondary;
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 2d2d29553df8..3549c4a19816 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -56,7 +56,7 @@ static inline int cpu_numa_flags(void)
> {
> return SD_NUMA;
> }
> -extern int arch_sched_node_distance(int from, int to);
> +extern int (*arch_sched_node_distance)(int, int);
> #endif
>
> extern int arch_asym_cpu_priority(int cpu);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index f25e4402c63e..7cfb7422e9d4 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1897,26 +1897,17 @@ static void init_numa_topology_type(int offline_node)
> * A NUMA level is created for each unique
> * arch_sched_node_distance.
> */
> -static bool __modified_sched_node_dist = true;
>
> -int __weak arch_sched_node_distance(int from, int to)
> +static int default_sched_node_distance(int from, int to)
> {
> - if (__modified_sched_node_dist)
> - __modified_sched_node_dist = false;
> -
> return node_distance(from, to);
> }
>
> +int (*arch_sched_node_distance)(int, int) = default_sched_node_distance;
> +
> static bool modified_sched_node_distance(void)
> {
> - /*
> - * Call arch_sched_node_distance()
> - * to determine if arch_sched_node_distance
> - * has been modified from node_distance()
> - * to arch specific distance.
> - */
> - arch_sched_node_distance(0, 0);
> - return __modified_sched_node_dist;
> + return arch_sched_node_distance != default_sched_node_distance;
> }
>
> static int numa_node_dist(int i, int j)
Powered by blists - more mailing lists