Message-ID: <8d0ef5fc-f392-40f8-9803-50807c172800@redhat.com>
Date: Wed, 17 Dec 2025 12:48:53 -0500
From: Waiman Long <llong@...hat.com>
To: Chen Ridong <chenridong@...weicloud.com>, tj@...nel.org,
hannes@...xchg.org, mkoutny@...e.com
Cc: cgroups@...r.kernel.org, linux-kernel@...r.kernel.org,
lujialin4@...wei.com
Subject: Re: [PATCH -next 5/6] cpuset: separate generate_sched_domains for v1
and v2
On 12/17/25 3:49 AM, Chen Ridong wrote:
> From: Chen Ridong <chenridong@...wei.com>
>
> The generate_sched_domains() function currently handles both v1 and v2
> logic. However, the underlying mechanisms for building scheduler domains
> differ significantly between the two versions. For cpuset v2, scheduler
> domains are straightforwardly derived from valid partitions, whereas
> cpuset v1 employs a more complex union-find algorithm to merge overlapping
> cpusets. Co-locating these implementations complicates maintenance.
>
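(For anyone following along: the v1 union-find merge described above is
conceptually something like the sketch below, built on the
uf_node_init()/uf_union()/uf_find() helpers from <linux/union_find.h>.
This is only an illustration of the idea, not the code being moved, and
it assumes the uf_node embedded in struct cpuset is named "node".)

	/* Every load-balanced cpuset collected in csa[] starts as its own set. */
	for (i = 0; i < csn; i++)
		uf_node_init(&csa[i]->node);

	/* Merge any two cpusets whose effective_cpus masks intersect. */
	for (i = 0; i < csn; i++)
		for (j = i + 1; j < csn; j++)
			if (cpusets_overlap(csa[i], csa[j]))
				uf_union(&csa[i]->node, &csa[j]->node);

	/* Each remaining union-find root then becomes one sched domain. */
	for (i = 0; i < csn; i++)
		if (uf_find(&csa[i]->node) == &csa[i]->node)
			ndoms++;

The union of the masks within each resulting set then forms one element
of the partition passed to partition_sched_domains().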
> This patch, along with subsequent ones, aims to separate the v1 and v2
> logic. For ease of review, this patch first copies the
> generate_sched_domains() function into cpuset-v1.c as
> cpuset1_generate_sched_domains() and removes v2-specific code. Common
> helpers and top_cpuset are declared in cpuset-internal.h. When operating
> in v1 mode, the code now calls cpuset1_generate_sched_domains().
>
> Currently there is some code duplication, which will be largely eliminated
> once v1-specific code is removed from v2 in the following patch.
>
> Signed-off-by: Chen Ridong <chenridong@...wei.com>
> ---
> kernel/cgroup/cpuset-internal.h | 24 +++++
> kernel/cgroup/cpuset-v1.c | 167 ++++++++++++++++++++++++++++++++
> kernel/cgroup/cpuset.c | 31 +-----
> 3 files changed, 195 insertions(+), 27 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
> index 677053ffb913..bd767f8cb0ed 100644
> --- a/kernel/cgroup/cpuset-internal.h
> +++ b/kernel/cgroup/cpuset-internal.h
> @@ -9,6 +9,7 @@
> #include <linux/cpuset.h>
> #include <linux/spinlock.h>
> #include <linux/union_find.h>
> +#include <linux/sched/isolation.h>
>
> /* See "Frequency meter" comments, below. */
>
> @@ -185,6 +186,8 @@ struct cpuset {
> #endif
> };
>
> +extern struct cpuset top_cpuset;
> +
> static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
> {
> return css ? container_of(css, struct cpuset, css) : NULL;
> @@ -242,6 +245,22 @@ static inline int is_spread_slab(const struct cpuset *cs)
> return test_bit(CS_SPREAD_SLAB, &cs->flags);
> }
>
> +/*
> + * Helper routine for generate_sched_domains().
> + * Do cpusets a, b have overlapping effective cpus_allowed masks?
> + */
> +static inline int cpusets_overlap(struct cpuset *a, struct cpuset *b)
> +{
> + return cpumask_intersects(a->effective_cpus, b->effective_cpus);
> +}
> +
> +static inline int nr_cpusets(void)
> +{
> + assert_cpuset_lock_held();
For a simple helper like this one, which only does an atomic_read(), I
don't think you need to assert that cpuset_mutex is held; see the
sketch below.
> + /* jump label reference count + the top-level cpuset */
> + return static_key_count(&cpusets_enabled_key.key) + 1;
> +}
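IOW, the whole helper could probably just be (untested):

static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}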
> +
> /**
> * cpuset_for_each_child - traverse online children of a cpuset
> * @child_cs: loop cursor pointing to the current child
> @@ -298,6 +317,9 @@ void cpuset1_init(struct cpuset *cs);
> void cpuset1_online_css(struct cgroup_subsys_state *css);
> void update_domain_attr_tree(struct sched_domain_attr *dattr,
> struct cpuset *root_cs);
> +int cpuset1_generate_sched_domains(cpumask_var_t **domains,
> + struct sched_domain_attr **attributes);
> +
> #else
> static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
> struct task_struct *tsk) {}
> @@ -311,6 +333,8 @@ static inline void cpuset1_init(struct cpuset *cs) {}
> static inline void cpuset1_online_css(struct cgroup_subsys_state *css) {}
> static inline void update_domain_attr_tree(struct sched_domain_attr *dattr,
> struct cpuset *root_cs) {}
> +static inline int cpuset1_generate_sched_domains(cpumask_var_t **domains,
> + struct sched_domain_attr **attributes) { return 0; };
>
> #endif /* CONFIG_CPUSETS_V1 */
>
> diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
> index 95de6f2a4cc5..5c0bded46a7c 100644
> --- a/kernel/cgroup/cpuset-v1.c
> +++ b/kernel/cgroup/cpuset-v1.c
> @@ -580,6 +580,173 @@ void update_domain_attr_tree(struct sched_domain_attr *dattr,
> rcu_read_unlock();
> }
>
> +/*
> + * cpuset1_generate_sched_domains()
> + *
> + * Finding the best partition (set of domains):
> + * The double nested loops below over i, j scan over the load
> + * balanced cpusets (using the array of cpuset pointers in csa[])
> + * looking for pairs of cpusets that have overlapping cpus_allowed
> + * and merging them using a union-find algorithm.
> + *
> + * The union of the cpus_allowed masks from the set of all cpusets
> + * having the same root then form the one element of the partition
> + * (one sched domain) to be passed to partition_sched_domains().
> + */
> +int cpuset1_generate_sched_domains(cpumask_var_t **domains,
> + struct sched_domain_attr **attributes)
> +{
> + struct cpuset *cp; /* top-down scan of cpusets */
> + struct cpuset **csa; /* array of all cpuset ptrs */
> + int csn; /* how many cpuset ptrs in csa so far */
> + int i, j; /* indices for partition finding loops */
> + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
> + struct sched_domain_attr *dattr; /* attributes for custom domains */
> + int ndoms = 0; /* number of sched domains in result */
> + int nslot; /* next empty doms[] struct cpumask slot */
> + struct cgroup_subsys_state *pos_css;
> + bool root_load_balance = is_sched_load_balance(&top_cpuset);
> + int nslot_update;
> +
> + assert_cpuset_lock_held();
> +
> + doms = NULL;
> + dattr = NULL;
> + csa = NULL;
> +
> + /* Special case for the 99% of systems with one, full, sched domain */
> + if (root_load_balance) {
> +single_root_domain:
> + ndoms = 1;
> + doms = alloc_sched_domains(ndoms);
> + if (!doms)
> + goto done;
> +
> + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
> + if (dattr) {
> + *dattr = SD_ATTR_INIT;
> + update_domain_attr_tree(dattr, &top_cpuset);
> + }
> + cpumask_and(doms[0], top_cpuset.effective_cpus,
> + housekeeping_cpumask(HK_TYPE_DOMAIN));
> +
> + goto done;
> + }
> +
> + csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
> + if (!csa)
> + goto done;
> + csn = 0;
> +
> + rcu_read_lock();
> + if (root_load_balance)
> + csa[csn++] = &top_cpuset;
> + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
> + if (cp == &top_cpuset)
> + continue;
> +
> + /*
> + * v1:
Remove this "v1:" line from the comment.
> + * Continue traversing beyond @cp iff @cp has some CPUs and
> + * isn't load balancing. The former is obvious. The
> + * latter: All child cpusets contain a subset of the
> + * parent's cpus, so just skip them, and then we call
> + * update_domain_attr_tree() to calc relax_domain_level of
> + * the corresponding sched domain.
> + */
> + if (!cpumask_empty(cp->cpus_allowed) &&
> + !(is_sched_load_balance(cp) &&
> + cpumask_intersects(cp->cpus_allowed,
> + housekeeping_cpumask(HK_TYPE_DOMAIN))))
> + continue;
> +
> + if (is_sched_load_balance(cp) &&
> + !cpumask_empty(cp->effective_cpus))
> + csa[csn++] = cp;
> +
> + /* skip @cp's subtree */
> + pos_css = css_rightmost_descendant(pos_css);
> + continue;
> + }
> + rcu_read_unlock();
> +
> + /*
> + * If there are only isolated partitions underneath the cgroup root,
> + * we can optimize out unneeded sched domains scanning.
> + */
> + if (root_load_balance && (csn == 1))
> + goto single_root_domain;
This check is v2-specific, so you can remove it, together with the
"single_root_domain" label above.
Cheers,
Longman