lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <79db86a1-dc75-42ca-9cff-927539dc2104@intel.com>
Date: Tue, 30 Sep 2025 10:28:47 +0800
From: "Chen, Yu C" <yu.c.chen@...el.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>, Peter Zijlstra
	<peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>
CC: Juri Lelli <juri.lelli@...hat.com>, Dietmar Eggemann
	<dietmar.eggemann@....com>, Ben Segall <bsegall@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, Tim Chen
	<tim.c.chen@...el.com>, Vincent Guittot <vincent.guittot@...aro.org>, "Len
 Brown" <len.brown@...el.com>, <linux-kernel@...r.kernel.org>, K Prateek Nayak
	<kprateek.nayak@....com>, "Gautham R . Shenoy" <gautham.shenoy@....com>,
	"Zhao Liu" <zhao1.liu@...el.com>, Vinicius Costa Gomes
	<vinicius.gomes@...el.com>, Arjan Van De Ven <arjan.van.de.ven@...el.com>
Subject: Re: [PATCH v4 1/2] sched: Create architecture specific sched domain
 distances

On 9/30/2025 6:18 AM, Tim Chen wrote:
> On Sat, 2025-09-27 at 20:34 +0800, Chen, Yu C wrote:
>> [snip]
>>
>>> @@ -1591,10 +1591,12 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
>>>    enum numa_topology_type sched_numa_topology_type;
>>>    
>>>    static int			sched_domains_numa_levels;
>>> +static int			sched_numa_node_levels;
>>
>> I agree that the benefit of maintaining two NUMA distances - one for the
>> sched_domain and another for the NUMA balancing/page allocation policy - is
>> to avoid complicating the sched_domain hierarchy while preserving the
>> advantages of NUMA locality.
>>
>> Meanwhile, I wonder if we could also add a "orig" prefix to the original
>> NUMA distance. This way, we can quickly understand its meaning later.
>> For example,
>> sched_orig_node_levels
>> sched_orig_node_distance
> 
> I am not sure adding orig will make the meaning any clearer.
> I can add comments to note that
> 
> sched_numa_node_distance mean the node distance between numa nodes
> sched_numa_nodel_levels  mean the number of unique distances between numa nodes
> 

OK, looks good to me.

>>
>>>    static int			sched_domains_curr_level;
>>>    
>>>    int				sched_max_numa_distance;
>>>    static int			*sched_domains_numa_distance;
>>> +static int			*sched_numa_node_distance;
>>>    static struct cpumask		***sched_domains_numa_masks;
>>>    #endif /* CONFIG_NUMA */
>>>    
>>> @@ -1808,10 +1810,10 @@ bool find_numa_distance(int distance)
>>>    		return true;
>>>    
>>>    	rcu_read_lock();
>>> -	distances = rcu_dereference(sched_domains_numa_distance);
>>> +	distances = rcu_dereference(sched_numa_node_distance);
>>>    	if (!distances)
>>>    		goto unlock;
>>> -	for (i = 0; i < sched_domains_numa_levels; i++) {
>>> +	for (i = 0; i < sched_numa_node_levels; i++) {
>>>    		if (distances[i] == distance) {
>>>    			found = true;
>>>    			break;
>>> @@ -1887,14 +1889,48 @@ static void init_numa_topology_type(int offline_node)
>>>    
>>>    #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
>>>    
>>> -void sched_init_numa(int offline_node)
>>> +/*
>>> + * An architecture could modify its NUMA distance, to change
>>> + * grouping of NUMA nodes and number of NUMA levels when creating
>>> + * NUMA level sched domains.
>>> + *
>>> + * A NUMA level is created for each unique
>>> + * arch_sched_node_distance.
>>> + */
>>> +static bool __modified_sched_node_dist = true;
>>> +
>>> +int __weak arch_sched_node_distance(int from, int to)
>>>    {
>>> -	struct sched_domain_topology_level *tl;
>>> -	unsigned long *distance_map;
>>> +	if (__modified_sched_node_dist)
>>> +		__modified_sched_node_dist = false;
>>> +
>>> +	return node_distance(from, to);
>>> +}
>>> +
>>> +static bool modified_sched_node_distance(void)
>>> +{
>>> +	/*
>>> +	 * Call arch_sched_node_distance()
>>> +	 * to determine if arch_sched_node_distance
>>> +	 * has been modified from node_distance()
>>> +	 * to arch specific distance.
>>> +	 */
>>> +	arch_sched_node_distance(0, 0);
>>> +	return __modified_sched_node_dist;
>>> +}
>>> +
>>
>> If our goal is to figure out whether the arch_sched_node_distance()
>> has been overridden, how about the following alias?
>>
>> int __weak arch_sched_node_distance(int from, int to)
>> {
>> 	return __node_distance(from, to);
>> }
>> int arch_sched_node_distance_original(int from, int to) __weak
>> __alias(arch_sched_node_distance);
>>
>> static bool arch_sched_node_distance_is_overridden(void)
>> {
>> 	return arch_sched_node_distance != arch_sched_node_distance_original;
>> }
>>
>> so arch_sched_node_distance_is_overridden() can replace
>> modified_sched_node_distance()
>>
> 
> I think that the alias version will still point to the replaced function and not
> the originally defined one.
> 
> How about not using __weak and just explicitly define arch_sched_node_distance
> as a function pointer.  Change the code like below.
> 

The arch_sched_node_distance_original is defined as __weak, so it
should point to the old function even if the function has been
overridden. I did a test on a X86 VM and it seems to be so.
But using the arch_sched_node_distance as a function point
should also be OK.
> Tim
> 
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index d6b772990ec2..12db78af09d5 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -545,7 +545,7 @@ static int avg_remote_numa_distance(void)
>   	return sched_avg_remote_distance;
>   }
>   
> -int arch_sched_node_distance(int from, int to)
> +static int x86_arch_sched_node_distance(int from, int to)
>   {
>   	int d = node_distance(from, to);
>   
> @@ -918,6 +918,9 @@ static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
>   	/* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
>   	if (apic->wakeup_secondary_cpu_64)
>   		start_ip = real_mode_header->trampoline_start64;
> +#endif
> +#ifdef CONFIG_NUMA
> +	arch_sched_node_distance = x86_arch_sched_node_distance;
>   #endif

Above might be called for several APs, maybe we can just call it
once in smp_prepare_cpus_common().

thanks,
Chenyu

>   	idle->thread.sp = (unsigned long)task_pt_regs(idle);
>   	initial_code = (unsigned long)start_secondary;
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 2d2d29553df8..3549c4a19816 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -56,7 +56,7 @@ static inline int cpu_numa_flags(void)
>   {
>   	return SD_NUMA;
>   }
> -extern int arch_sched_node_distance(int from, int to);
> +extern int (*arch_sched_node_distance)(int, int);
>   #endif
>   
>   extern int arch_asym_cpu_priority(int cpu);
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index f25e4402c63e..7cfb7422e9d4 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1897,26 +1897,17 @@ static void init_numa_topology_type(int offline_node)
>    * A NUMA level is created for each unique
>    * arch_sched_node_distance.
>    */
> -static bool __modified_sched_node_dist = true;
>   
> -int __weak arch_sched_node_distance(int from, int to)
> +static int default_sched_node_distance(int from, int to)
>   {
> -	if (__modified_sched_node_dist)
> -		__modified_sched_node_dist = false;
> -
>   	return node_distance(from, to);
>   }
>   
> +int (*arch_sched_node_distance)(int, int) = default_sched_node_distance;
> +
>   static bool modified_sched_node_distance(void)
>   {
> -	/*
> -	 * Call arch_sched_node_distance()
> -	 * to determine if arch_sched_node_distance
> -	 * has been modified from node_distance()
> -	 * to arch specific distance.
> -	 */
> -	arch_sched_node_distance(0, 0);
> -	return __modified_sched_node_dist;
> +	return arch_sched_node_distance != default_sched_node_distance;
>   }
>   
>   static int numa_node_dist(int i, int j)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ