Message-ID: <4d156476-af35-4f0e-ab68-a3119b9a5a2e@amd.com>
Date: Thu, 12 Jun 2025 11:05:37 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Jemmy Wong <jemmywong512@...il.com>
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v1 1/1] sched/topology: Add lock guard support
Hello Jemmy,
Sorry for the delay!
On 6/5/2025 5:34 PM, Jemmy Wong wrote:
> This change replaces manual lock acquisition and release with lock guards
> to improve code robustness and reduce the risk of lock mismanagement.
> No functional changes to the scheduler topology logic are introduced.
>
> Signed-off-by: Jemmy Wong <jemmywong512@...il.com>
I have a few comments below, but other than that, I tested the patch with a
combination of hotplug and cpusets running concurrently and did not see
any issues. Feel free to add:
Tested-by: K Prateek Nayak <kprateek.nayak@....com>
>
> ---
> include/linux/sched.h | 11 ++--
> kernel/sched/core.c | 6 +--
> kernel/sched/debug.c | 13 ++---
> kernel/sched/rt.c | 35 ++++++------
> kernel/sched/topology.c | 115 +++++++++++++++++-----------------------
> 5 files changed, 81 insertions(+), 99 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 4f78a64beb52..10a9d6083b72 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -46,6 +46,7 @@
> #include <linux/rv.h>
> #include <linux/uidgid_types.h>
> #include <linux/tracepoint-defs.h>
> +#include <linux/mutex.h>
> #include <asm/kmap_size.h>
>
> /* task_struct member predeclarations (sorted alphabetically): */
> @@ -395,14 +396,14 @@ enum uclamp_id {
> UCLAMP_CNT
> };
>
> +extern struct mutex sched_domains_mutex;
> #ifdef CONFIG_SMP
> extern struct root_domain def_root_domain;
> -extern struct mutex sched_domains_mutex;
> -extern void sched_domains_mutex_lock(void);
> -extern void sched_domains_mutex_unlock(void);
> +DEFINE_LOCK_GUARD_0(sched_domains_mutex,
> + mutex_lock(&sched_domains_mutex),
> + mutex_unlock(&sched_domains_mutex))
> #else
> -static inline void sched_domains_mutex_lock(void) { }
> -static inline void sched_domains_mutex_unlock(void) { }
> +DEFINE_LOCK_GUARD_0(sched_domains_mutex, ,)
> #endif
>
> struct sched_param {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index dce50fa57471..b2b7a0cae95a 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8457,9 +8457,9 @@ void __init sched_init_smp(void)
> * CPU masks are stable and all blatant races in the below code cannot
> * happen.
> */
> - sched_domains_mutex_lock();
> - sched_init_domains(cpu_active_mask);
> - sched_domains_mutex_unlock();
> + scoped_guard(sched_domains_mutex) {
> + sched_init_domains(cpu_active_mask);
> + }
Since sched_init_domains() is only called once during init from here,
perhaps you can move the guard() into sched_init_domains() and avoid
the scoped_guard() here in the caller.
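Something like this, perhaps (untested sketch with the existing body
elided, assuming sched_init_domains() keeps its current signature):

int __init sched_init_domains(const struct cpumask *cpu_map)
{
	int err;

	/* Serialize the initial domain build against concurrent updates. */
	guard(sched_domains_mutex)();

	/* ... existing body building the default domains stays unchanged ... */

	return err;
}

The caller in sched_init_smp() then reduces to a plain
sched_init_domains(cpu_active_mask); call without the scoped_guard() block.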
>
> /* Move init over to a non-isolated CPU */
> if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 9d71baf08075..1c00016fd54c 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -293,8 +293,8 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
> ssize_t result;
> bool orig;
>
> - cpus_read_lock();
> - sched_domains_mutex_lock();
> + guard(cpus_read_lock)();
> + guard(sched_domains_mutex)();
>
> orig = sched_debug_verbose;
> result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
> @@ -306,9 +306,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
> sd_dentry = NULL;
> }
>
> - sched_domains_mutex_unlock();
> - cpus_read_unlock();
> -
> return result;
> }
> #else
> @@ -517,9 +514,9 @@ static __init int sched_init_debug(void)
> debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
> debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
>
> - sched_domains_mutex_lock();
> - update_sched_domain_debugfs();
> - sched_domains_mutex_unlock();
> + scoped_guard(sched_domains_mutex) {
> + update_sched_domain_debugfs();
> + }
> #endif
>
> #ifdef CONFIG_NUMA_BALANCING
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index e40422c37033..3533b471b015 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -2920,36 +2920,37 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
> static DEFINE_MUTEX(mutex);
> int ret;
>
> - mutex_lock(&mutex);
> - sched_domains_mutex_lock();
> + guard(mutex)(&mutex);
> + guard(sched_domains_mutex)();
> +
> old_period = sysctl_sched_rt_period;
> old_runtime = sysctl_sched_rt_runtime;
>
> ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
>
> if (!ret && write) {
> - ret = sched_rt_global_validate();
> - if (ret)
> - goto undo;
> + do {
Instead of this do-while loop that runs once, how about:
static int sched_validate_constraints(void)
{
	int ret;

	ret = sched_rt_global_validate();
	if (ret)
		return ret;

	... /* similarly for dl_global_validate, rt_global_constraints */

	sched_rt_do_global();
	sched_dl_do_global();

	return 0;
}

static int sched_rt_handler()
{
	...
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (ret || !write)
		return ret;

	ret = sched_validate_constraints();
	if (ret) {
		sysctl_sched_rt_period = old_period;
		sysctl_sched_rt_runtime = old_runtime;
	}

	return ret;
}
> + ret = sched_rt_global_validate();
> + if (ret)
> + break;
>
> - ret = sched_dl_global_validate();
> - if (ret)
> - goto undo;
> + ret = sched_dl_global_validate();
> + if (ret)
> + break;
>
> - ret = sched_rt_global_constraints();
> - if (ret)
> - goto undo;
> + ret = sched_rt_global_constraints();
> + if (ret)
> + break;
>
> - sched_rt_do_global();
> - sched_dl_do_global();
> + sched_rt_do_global();
> + sched_dl_do_global();
> + } while (0);
> }
> - if (0) {
> -undo:
> +
> + if (ret) {
Previously "!write" cases and early ret from proc_dointvec_minmax()
wouldn't do this. It doesn't hurt but it is wasteful and can be avoided
using the pattern from the above suggestion.
> sysctl_sched_rt_period = old_period;
> sysctl_sched_rt_runtime = old_runtime;
> }
> - sched_domains_mutex_unlock();
> - mutex_unlock(&mutex);
>
> return ret;
> }
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index b958fe48e020..3f89f969666c 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -6,14 +6,6 @@
> #include <linux/bsearch.h>
>
> DEFINE_MUTEX(sched_domains_mutex);
> -void sched_domains_mutex_lock(void)
> -{
> - mutex_lock(&sched_domains_mutex);
> -}
> -void sched_domains_mutex_unlock(void)
> -{
> - mutex_unlock(&sched_domains_mutex);
> -}
>
> /* Protected by sched_domains_mutex: */
> static cpumask_var_t sched_domains_tmpmask;
> @@ -470,44 +462,41 @@ static void free_rootdomain(struct rcu_head *rcu)
> void rq_attach_root(struct rq *rq, struct root_domain *rd)
> {
> struct root_domain *old_rd = NULL;
> - struct rq_flags rf;
>
> - rq_lock_irqsave(rq, &rf);
> + scoped_guard(rq_lock_irqsave, rq) {
> + if (rq->rd) {
> + old_rd = rq->rd;
>
> - if (rq->rd) {
> - old_rd = rq->rd;
> + if (cpumask_test_cpu(rq->cpu, old_rd->online))
> + set_rq_offline(rq);
> +
> + cpumask_clear_cpu(rq->cpu, old_rd->span);
> +
> + /*
> + * If we don't want to free the old_rd yet then
> + * set old_rd to NULL to skip the freeing later
> + * in this function:
> + */
> + if (!atomic_dec_and_test(&old_rd->refcount))
> + old_rd = NULL;
> + }
>
> - if (cpumask_test_cpu(rq->cpu, old_rd->online))
> - set_rq_offline(rq);
> + atomic_inc(&rd->refcount);
> + rq->rd = rd;
>
> - cpumask_clear_cpu(rq->cpu, old_rd->span);
> + cpumask_set_cpu(rq->cpu, rd->span);
> + if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
> + set_rq_online(rq);
>
> /*
> - * If we don't want to free the old_rd yet then
> - * set old_rd to NULL to skip the freeing later
> - * in this function:
> + * Because the rq is not a task, dl_add_task_root_domain() did not
nit: Perhaps that comment can now be re-wrapped at 80 characters. No strong
feelings; it just stood out as being pretty long when reviewing.
> + * move the fair server bw to the rd if it already started.
> + * Add it now.
> */
> - if (!atomic_dec_and_test(&old_rd->refcount))
> - old_rd = NULL;
> + if (rq->fair_server.dl_server)
> + __dl_server_attach_root(&rq->fair_server, rq);
> }
>
--
Thanks and Regards,
Prateek