linux-kernel - Re: [PATCH v0] sched/topology: Add lock guard support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <02751a68-3981-4cc5-886f-a816ae39bf88@amd.com>
Date: Thu, 5 Jun 2025 09:11:55 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Jemmy Wong <jemmywong512@...il.com>, Ingo Molnar <mingo@...hat.com>,
 Peter Zijlstra <peterz@...radead.org>, Juri Lelli <juri.lelli@...hat.com>,
 Vincent Guittot <vincent.guittot@...aro.org>,
 Dietmar Eggemann <dietmar.eggemann@....com>,
 Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
 Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>
Cc: linux-kernel@...r.kernel.org
Subject: Re: [PATCH v0] sched/topology: Add lock guard support

Hello Jammy,

On 6/5/2025 12:20 AM, Jemmy Wong wrote:
> This change replaces manual lock acquisition and release with lock guards
> to improve code robustness and reduce the risk of lock mismanagement.
> No functional changes to the scheduler topology logic are introduced.
> 
> Signed-off-by: Jemmy Wong <jemmywong512@...il.com>
> 
> ---
>   include/linux/sched.h   |  11 +--
>   kernel/sched/core.c     |   6 +-
>   kernel/sched/debug.c    |  28 ++++---
>   kernel/sched/rt.c       |  46 ++++++------
>   kernel/sched/topology.c | 162 +++++++++++++++++++---------------------
>   5 files changed, 120 insertions(+), 133 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4f78a64beb52..10a9d6083b72 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -46,6 +46,7 @@
>   #include <linux/rv.h>
>   #include <linux/uidgid_types.h>
>   #include <linux/tracepoint-defs.h>
> +#include <linux/mutex.h>
>   #include <asm/kmap_size.h>
>   
>   /* task_struct member predeclarations (sorted alphabetically): */
> @@ -395,14 +396,14 @@ enum uclamp_id {
>   	UCLAMP_CNT
>   };
>   
> +extern struct mutex sched_domains_mutex;
>   #ifdef CONFIG_SMP
>   extern struct root_domain def_root_domain;
> -extern struct mutex sched_domains_mutex;
> -extern void sched_domains_mutex_lock(void);
> -extern void sched_domains_mutex_unlock(void);
> +DEFINE_LOCK_GUARD_0(sched_domains_mutex,
> +	mutex_lock(&sched_domains_mutex),
> +	mutex_unlock(&sched_domains_mutex))
>   #else
> -static inline void sched_domains_mutex_lock(void) { }
> -static inline void sched_domains_mutex_unlock(void) { }
> +DEFINE_LOCK_GUARD_0(sched_domains_mutex, ,)
>   #endif
>   
>   struct sched_param {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index dce50fa57471..b2b7a0cae95a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8457,9 +8457,9 @@ void __init sched_init_smp(void)
>   	 * CPU masks are stable and all blatant races in the below code cannot
>   	 * happen.
>   	 */
> -	sched_domains_mutex_lock();
> -	sched_init_domains(cpu_active_mask);
> -	sched_domains_mutex_unlock();
> +	scoped_guard(sched_domains_mutex) {
> +		sched_init_domains(cpu_active_mask);
> +	}
>   
>   	/* Move init over to a non-isolated CPU */
>   	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 9d71baf08075..f56401725ef6 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -294,19 +294,17 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
>   	bool orig;
>   
>   	cpus_read_lock();

cpus_read_{un}lock() have guards too. You can just have:

	guard(cpus_read_lock)();
	guard(sched_domains_mutex)();

no need for scoped guard. Compiler will take care of unlocking
ordering before return.

> -	sched_domains_mutex_lock();
> -
> -	orig = sched_debug_verbose;
> -	result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
> -
> -	if (sched_debug_verbose && !orig)
> -		update_sched_domain_debugfs();
> -	else if (!sched_debug_verbose && orig) {
> -		debugfs_remove(sd_dentry);
> -		sd_dentry = NULL;
> +	scoped_guard(sched_domains_mutex) {
> +		orig = sched_debug_verbose;
> +		result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
> +
> +		if (sched_debug_verbose && !orig)
> +			update_sched_domain_debugfs();
> +		else if (!sched_debug_verbose && orig) {
> +			debugfs_remove(sd_dentry);
> +			sd_dentry = NULL;
> +		}
>   	}
> -
> -	sched_domains_mutex_unlock();
>   	cpus_read_unlock();
>   
>   	return result;

General comment, it is okay to convert the folllowing pattern:

	func()
	{
		...
		lock();
		... /* critical section */
	unlock:
		unlock();

		return ret;
	}

to:
	func()
	{
		...
		guard();
		... /* critical section with s/goto unlock/return ret/ */
		
		return ret;
	}

You don't need a scoped_guard() if the critical section is at the end of
the funtion.

> @@ -517,9 +515,9 @@ static __init int sched_init_debug(void)
>   	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
>   	debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
>   
> -	sched_domains_mutex_lock();
> -	update_sched_domain_debugfs();
> -	sched_domains_mutex_unlock();
> +	scoped_guard(sched_domains_mutex) {
> +		update_sched_domain_debugfs();
> +	}
>   #endif
>   
>   #ifdef CONFIG_NUMA_BALANCING
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index e40422c37033..3f6f181de387 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -2920,36 +2920,36 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
>   	static DEFINE_MUTEX(mutex);
>   	int ret;
>   
> -	mutex_lock(&mutex);
> -	sched_domains_mutex_lock();
> -	old_period = sysctl_sched_rt_period;
> -	old_runtime = sysctl_sched_rt_runtime;
> +	guard(mutex)(&mutex);
>   
> -	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +	scoped_guard(sched_domains_mutex) {

No need for scoped guard, "guard(sched_domains_mutex)();" should be
enough.

> +		old_period = sysctl_sched_rt_period;
> +		old_runtime = sysctl_sched_rt_runtime;
>   
> -	if (!ret && write) {
> -		ret = sched_rt_global_validate();
> -		if (ret)
> -			goto undo;
> +		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
>   
> -		ret = sched_dl_global_validate();
> -		if (ret)
> -			goto undo;
> +		if (!ret && write) {
> +			ret = sched_rt_global_validate();
> +			if (ret)
> +				goto undo;
>   
> -		ret = sched_rt_global_constraints();
> -		if (ret)
> -			goto undo;
> +			ret = sched_dl_global_validate();
> +			if (ret)
> +				goto undo;
>   
> -		sched_rt_do_global();
> -		sched_dl_do_global();
> -	}
> -	if (0) {
> +			ret = sched_rt_global_constraints();
> +			if (ret)
> +				goto undo;
> +
> +			sched_rt_do_global();
> +			sched_dl_do_global();
> +		}
> +		if (0) {
>   undo:

On a sidenote, include/linux/cleanup.h has the following comment:

     Lastly, given that the benefit of cleanup helpers is removal of
     "goto", and that the "goto" statement can jump between scopes, the
     expectation is that usage of "goto" and cleanup helpers is never
     mixed in the same function. I.e. for a given routine, convert all
     resources that need a "goto" cleanup to scope-based cleanup, or
     convert none of them.

Although the compiler generates the correct code currently, I think
you should just replicate the undo chunk inplace of "goto undo" just
to be safe like:

	if (ret) {
		sysctl_sched_rt_period = old_period;
		sysctl_sched_rt_runtime = old_runtime;

		return ret;
	}

> -		sysctl_sched_rt_period = old_period;
> -		sysctl_sched_rt_runtime = old_runtime;
> +			sysctl_sched_rt_period = old_period;
> +			sysctl_sched_rt_runtime = old_runtime;
> +		}
>   	}
> -	sched_domains_mutex_unlock();
> -	mutex_unlock(&mutex);
>   
>   	return ret;
>   }
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index b958fe48e020..dac1dd5a6eca 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -6,14 +6,6 @@
>   #include <linux/bsearch.h>
>   
>   DEFINE_MUTEX(sched_domains_mutex);
> -void sched_domains_mutex_lock(void)
> -{
> -	mutex_lock(&sched_domains_mutex);
> -}
> -void sched_domains_mutex_unlock(void)
> -{
> -	mutex_unlock(&sched_domains_mutex);
> -}
>   
>   /* Protected by sched_domains_mutex: */
>   static cpumask_var_t sched_domains_tmpmask;
> @@ -470,44 +462,41 @@ static void free_rootdomain(struct rcu_head *rcu)
>   void rq_attach_root(struct rq *rq, struct root_domain *rd)
>   {
>   	struct root_domain *old_rd = NULL;
> -	struct rq_flags rf;
>   
> -	rq_lock_irqsave(rq, &rf);
> +	scoped_guard(rq_lock_irqsave, rq) {

I'm not a big fan of this added indentation. Perhaps you can move the
rq_lock guarded bit into a separate function?

> +		if (rq->rd) {
> +			old_rd = rq->rd;
>   
> -	if (rq->rd) {
> -		old_rd = rq->rd;
> +			if (cpumask_test_cpu(rq->cpu, old_rd->online))
> +				set_rq_offline(rq);
> +
> +			cpumask_clear_cpu(rq->cpu, old_rd->span);
> +
> +			/*
> +			 * If we don't want to free the old_rd yet then
> +			 * set old_rd to NULL to skip the freeing later
> +			 * in this function:
> +			 */
> +			if (!atomic_dec_and_test(&old_rd->refcount))
> +				old_rd = NULL;
> +		}
>   
> -		if (cpumask_test_cpu(rq->cpu, old_rd->online))
> -			set_rq_offline(rq);
> +		atomic_inc(&rd->refcount);
> +		rq->rd = rd;
>   
> -		cpumask_clear_cpu(rq->cpu, old_rd->span);
> +		cpumask_set_cpu(rq->cpu, rd->span);
> +		if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
> +			set_rq_online(rq);
>   
>   		/*
> -		 * If we don't want to free the old_rd yet then
> -		 * set old_rd to NULL to skip the freeing later
> -		 * in this function:
> +		 * Because the rq is not a task, dl_add_task_root_domain() did not
> +		 * move the fair server bw to the rd if it already started.
> +		 * Add it now.
>   		 */
> -		if (!atomic_dec_and_test(&old_rd->refcount))
> -			old_rd = NULL;
> +		if (rq->fair_server.dl_server)
> +			__dl_server_attach_root(&rq->fair_server, rq);
>   	}
>   
> -	atomic_inc(&rd->refcount);
> -	rq->rd = rd;
> -
> -	cpumask_set_cpu(rq->cpu, rd->span);
> -	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
> -		set_rq_online(rq);
> -
> -	/*
> -	 * Because the rq is not a task, dl_add_task_root_domain() did not
> -	 * move the fair server bw to the rd if it already started.
> -	 * Add it now.
> -	 */
> -	if (rq->fair_server.dl_server)
> -		__dl_server_attach_root(&rq->fair_server, rq);
> -
> -	rq_unlock_irqrestore(rq, &rf);
> -
>   	if (old_rd)
>   		call_rcu(&old_rd->rcu, free_rootdomain);
>   }
> @@ -1809,18 +1798,17 @@ bool find_numa_distance(int distance)
>   	if (distance == node_distance(0, 0))
>   		return true;
>   
> -	rcu_read_lock();
> -	distances = rcu_dereference(sched_domains_numa_distance);
> -	if (!distances)
> -		goto unlock;
> -	for (i = 0; i < sched_domains_numa_levels; i++) {
> -		if (distances[i] == distance) {
> -			found = true;
> +	scoped_guard(rcu) {

guard(rcu)() should be enough. No need for scoped guard. Instead
of breaks, you can "return found" directly ...

> +		distances = rcu_dereference(sched_domains_numa_distance);
> +		if (!distances)
>   			break;
> +		for (i = 0; i < sched_domains_numa_levels; i++) {
> +			if (distances[i] == distance) {
> +				found = true;
> +				break;
> +			}
>   		}
>   	}
> -unlock:
> -	rcu_read_unlock();
>   
>   	return found;
>   }
> @@ -2134,21 +2122,20 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
>   	int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
>   	struct cpumask ***masks;
>   
> -	rcu_read_lock();
> -	masks = rcu_dereference(sched_domains_numa_masks);
> -	if (!masks)
> -		goto unlock;
> -	for (i = 0; i < sched_domains_numa_levels; i++) {
> -		if (!masks[i][j])
> -			break;
> -		cpu = cpumask_any_and_distribute(cpus, masks[i][j]);
> -		if (cpu < nr_cpu_ids) {
> -			found = cpu;
> +	scoped_guard(rcu) {

Same as last comment, plain guard(rcu)(); should be fine ...

> +		masks = rcu_dereference(sched_domains_numa_masks);
> +		if (!masks)
>   			break;
> +		for (i = 0; i < sched_domains_numa_levels; i++) {
> +			if (!masks[i][j])
> +				break;
> +			cpu = cpumask_any_and_distribute(cpus, masks[i][j]);
> +			if (cpu < nr_cpu_ids) {
> +				found = cpu;
> +				break;
> +			}
>   		}
>   	}
> -unlock:
> -	rcu_read_unlock();
>   
>   	return found;
>   }
> @@ -2201,24 +2188,25 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
>   	if (node == NUMA_NO_NODE)
>   		return cpumask_nth_and(cpu, cpus, cpu_online_mask);
>   
> -	rcu_read_lock();
> +	scoped_guard(rcu) {

Same as last comment ...

> +		/* CPU-less node entries are uninitialized in sched_domains_numa_masks */
> +		node = numa_nearest_node(node, N_CPU);
> +		k.node = node;
>   
> -	/* CPU-less node entries are uninitialized in sched_domains_numa_masks */
> -	node = numa_nearest_node(node, N_CPU);
> -	k.node = node;
> +		k.masks = rcu_dereference(sched_domains_numa_masks);
> +		if (!k.masks)
> +			break;
>   
> -	k.masks = rcu_dereference(sched_domains_numa_masks);
> -	if (!k.masks)
> -		goto unlock;
> +		hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels,
> +			sizeof(k.masks[0]), hop_cmp);
> +		hop = hop_masks	- k.masks;
>   
> -	hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
> -	hop = hop_masks	- k.masks;
> +		ret = hop ?
> +			cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node],
> +				k.masks[hop-1][node]) :
> +			cpumask_nth_and(cpu, cpus, k.masks[0][node]);
> +	}
>   
> -	ret = hop ?
> -		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
> -		cpumask_nth_and(cpu, cpus, k.masks[0][node]);
> -unlock:
> -	rcu_read_unlock();
>   	return ret;
>   }
>   EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
> @@ -2570,17 +2558,17 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
>   	}
>   
>   	/* Attach the domains */
> -	rcu_read_lock();
> -	for_each_cpu(i, cpu_map) {
> -		rq = cpu_rq(i);
> -		sd = *per_cpu_ptr(d.sd, i);
> +	scoped_guard(rcu) {
> +		for_each_cpu(i, cpu_map) {
> +			rq = cpu_rq(i);
> +			sd = *per_cpu_ptr(d.sd, i);
>   
> -		cpu_attach_domain(sd, d.rd, i);
> +			cpu_attach_domain(sd, d.rd, i);
>   
> -		if (lowest_flag_domain(i, SD_CLUSTER))
> -			has_cluster = true;
> +			if (lowest_flag_domain(i, SD_CLUSTER))
> +				has_cluster = true;
> +		}
>   	}
> -	rcu_read_unlock();
>   
>   	if (has_asym)
>   		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
> @@ -2688,10 +2676,10 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
>   	if (static_branch_unlikely(&sched_cluster_active))
>   		static_branch_dec_cpuslocked(&sched_cluster_active);
>   
> -	rcu_read_lock();

Same as last comment ...

> -	for_each_cpu(i, cpu_map)
> -		cpu_attach_domain(NULL, &def_root_domain, i);
> -	rcu_read_unlock();
> +	scoped_guard(rcu) {
> +		for_each_cpu(i, cpu_map)
> +			cpu_attach_domain(NULL, &def_root_domain, i);
> +	}
>   }
>   
>   /* handle null as "default" */
> @@ -2836,7 +2824,7 @@ static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new
>   void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
>   			     struct sched_domain_attr *dattr_new)
>   {
> -	sched_domains_mutex_lock();
> -	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
> -	sched_domains_mutex_unlock();
> +	scoped_guard(sched_domains_mutex) {

Similar to lasr comment, plain guard(sched_domains_mutex)(); should be fine.

> +		partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
> +	}
>   }

-- 
Thanks and Regards,
Prateek