Message-ID: <0e380555-5cd3-41f6-8cc9-5f8ca6472a6e@redhat.com>
Date: Tue, 6 May 2025 15:57:51 -0400
From: Waiman Long <llong@...hat.com>
To: Xi Wang <xii@...gle.com>, linux-kernel@...r.kernel.org,
 cgroups@...r.kernel.org
Cc: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
 Juri Lelli <juri.lelli@...hat.com>,
 Vincent Guittot <vincent.guittot@...aro.org>,
 Dietmar Eggemann <dietmar.eggemann@....com>,
 Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
 David Rientjes <rientjes@...gle.com>, Mel Gorman <mgorman@...e.de>,
 Valentin Schneider <vschneid@...hat.com>, Tejun Heo <tj@...nel.org>,
 Johannes Weiner <hannes@...xchg.org>, Michal Koutný
 <mkoutny@...e.com>, Lai Jiangshan <jiangshanlai@...il.com1>,
 Frederic Weisbecker <frederic@...nel.org>, Vlastimil Babka <vbabka@...e.cz>,
 Dan Carpenter <dan.carpenter@...aro.org>, Chen Yu <yu.c.chen@...el.com>,
 Kees Cook <kees@...nel.org>, Yu-Chun Lin <eleanor15x@...il.com>,
 Thomas Gleixner <tglx@...utronix.de>, Mickaël Salaün
 <mic@...ikod.net>
Subject: Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups

On 5/6/25 2:35 PM, Xi Wang wrote:
> In theory we should be able to manage kernel tasks with cpuset
> cgroups just like user tasks, which would be a flexible way to limit
> interference with real-time and other sensitive workloads. This is,
> however, not supported today: when setting cpu affinity for kthreads,
> kernel code uses a simpler control path that leads directly to
> __set_cpus_allowed_ptr or __kthread_bind_mask. Neither honors cpuset
> restrictions.
>
> This patch adds cpuset support for kernel tasks by merging userspace
> and kernel cpu affinity control paths and applying the same
> restrictions to kthreads.
>
> The PF_NO_SETAFFINITY flag is still supported for tasks that have to
> run with certain cpu affinities. The kernel ensures that kthreads with
> this flag have their affinities locked and that they stay in the root
> cpuset:
>
> If userspace moves kthreadd out of the root cpuset (see example
> below), a newly forked kthread will be in a non-root cgroup as well.
> If PF_NO_SETAFFINITY is detected for the kthread, it will move itself
> into the root cpuset before threadfn is called. This depends on the
> kthread create -> kthread bind -> wake up sequence.
>
> Since kthreads are clones of kthreadd, the typical usage pattern is:
>
> Create a cpuset cgroup for kernel threads.
>
> Move kthreadd to that cgroup - all newly created kthreads are
> automatically enrolled in that cgroup.
>
> Move all remaining unlocked (!PF_NO_SETAFFINITY) kthreads into that
> group.
>
> After these steps, all unlocked kthreads are managed by the cgroup,
> including current and future kthreads.
>
> Command line example:
>
> mkdir /sys/fs/cgroup/kernel
> echo "+cpuset" > /sys/fs/cgroup/cgroup.subtree_control
> echo "+cpuset" > /sys/fs/cgroup/kernel/cgroup.subtree_control
>
> ktd=`pgrep -x kthreadd`; echo "move kthreadd/$ktd first"; echo $ktd > /sys/fs/cgroup/kernel/cgroup.procs
> kthreads=`ps -e -o pgrp= -o pid=  | sed -ne 's/^ *0 *// p'`
> for p in $kthreads; do echo "moving $p (ok to fail for locked kthreads)"; echo $p > /sys/fs/cgroup/kernel/cgroup.procs; done
> echo 4-7 > /sys/fs/cgroup/kernel/cpuset.cpus
>
> Signed-off-by: Xi Wang <xii@...gle.com>
> ---
>   include/linux/kthread.h | 10 ++++-
>   include/linux/sched.h   | 11 +++++
>   kernel/cgroup/cpuset.c  | 31 ++++++++++++--
>   kernel/kthread.c        | 89 +++++++++++++++++++++++++++++++++++---
>   kernel/sched/core.c     | 95 ++++++++++++++++++++++++++++++++++++++---
>   kernel/sched/sched.h    |  6 ---
>   kernel/sched/syscalls.c | 63 +--------------------------
>   kernel/workqueue.c      |  7 ++-
>   8 files changed, 226 insertions(+), 86 deletions(-)
>
> diff --git a/include/linux/kthread.h b/include/linux/kthread.h
> index 8d27403888ce..36215a30d7f7 100644
> --- a/include/linux/kthread.h
> +++ b/include/linux/kthread.h
> @@ -13,6 +13,14 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
>   					   int node,
>   					   const char namefmt[], ...);
>   
> +__printf(4, 5)
> +struct task_struct *kthread_create_on_node_root_cpuset(
> +					   int (*threadfn)(void *data),
> +					   void *data,
> +					   int node,
> +					   const char namefmt[], ...);
> +
> +
>   /**
>    * kthread_create - create a kthread on the current node
>    * @threadfn: the function to run in the thread
> @@ -27,7 +35,6 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
>   #define kthread_create(threadfn, data, namefmt, arg...) \
>   	kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
>   
> -
>   struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
>   					  void *data,
>   					  unsigned int cpu,
> @@ -85,6 +92,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
>   void free_kthread_struct(struct task_struct *k);
>   void kthread_bind(struct task_struct *k, unsigned int cpu);
>   void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
> +void kthread_bind_mask_cpuset(struct task_struct *k, const struct cpumask *mask);
>   int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask);
>   int kthread_stop(struct task_struct *k);
>   int kthread_stop_put(struct task_struct *k);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0782de6b20d5..45b912e21239 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1855,6 +1855,13 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
>   extern int task_can_attach(struct task_struct *p);
>   extern int dl_bw_alloc(int cpu, u64 dl_bw);
>   extern void dl_bw_free(int cpu, u64 dl_bw);
> +
> +#define SCA_CHECK		0x01
> +#define SCA_MIGRATE_DISABLE	0x02
> +#define SCA_MIGRATE_ENABLE	0x04
> +#define SCA_USER		0x08
> +#define SCA_NO_CPUSET	0x10
> +
>   #ifdef CONFIG_SMP
>   
>   /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
> @@ -1868,6 +1875,9 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new
>    * Return: zero if successful, or a negative error code
>    */
>   extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
> +extern int set_cpus_allowed_ptr_no_cpuset(struct task_struct *p, const struct cpumask *new_mask);
> +extern int set_cpus_allowed_ptr_flags(
> +	struct task_struct *p, const struct cpumask *new_mask, u32 flags);
>   extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
>   extern void release_user_cpus_ptr(struct task_struct *p);
>   extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
> @@ -1884,6 +1894,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
>   		return -EINVAL;
>   	return 0;
>   }
> +
>   static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
>   {
>   	if (src->user_cpus_ptr)
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index d0143b3dce47..ef929b349da8 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1128,6 +1128,13 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
>   	while ((task = css_task_iter_next(&it))) {
>   		const struct cpumask *possible_mask = task_cpu_possible_mask(task);
>   
> +		/*
> +		 * See also cpuset_can_attach. A thread with the flag could temporarily
> +		 * reside in a non-root cpuset. Don't change its affinity.
> +		 */
> +		if (task->flags & PF_NO_SETAFFINITY)
> +			continue;
> +
>   		if (top_cs) {
>   			/*
>   			 * Percpu kthreads in top_cpuset are ignored
> @@ -3034,7 +3041,14 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
>   	mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
>   
>   	cgroup_taskset_for_each(task, css, tset) {
> -		ret = task_can_attach(task);
> +		/*
> +		 * With the kthreads in cpuset feature, kthreadd can be moved to a
> +		 * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
> +		 * spawned and then moved to root, which needs to be allowed here.
> +		 */
> +		ret = !(cs == &top_cpuset && task->flags & PF_NO_SETAFFINITY);
> +		/* Check regular threads */
> +		ret = ret && task_can_attach(task);
>   		if (ret)
>   			goto out_unlock;
>   
> @@ -3127,7 +3141,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
>   	 * can_attach beforehand should guarantee that this doesn't
>   	 * fail.  TODO: have a better way to handle failure here
>   	 */
> -	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
> +	WARN_ON_ONCE(set_cpus_allowed_ptr_flags(task, cpus_attach, SCA_NO_CPUSET));
>   
>   	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
>   	cpuset1_update_task_spread_flags(cs, task);
> @@ -3164,8 +3178,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
>   
>   	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
>   
> -	cgroup_taskset_for_each(task, css, tset)
> +	cgroup_taskset_for_each(task, css, tset) {
> +		/*
> +		 * See cpuset_can_attach.
> +		 * With the kthreads in cpuset feature, kthreadd can be moved to a
> +		 * non root cpuset. We want to allow a PF_NO_SETAFFINITY task to be
> +		 * spawned and then moved to root as it starts running. Don't reset the
> +		 * cpu affinity in this case because the thread could have already been
> +		 * pinned to a cpu with kthread_bind and we want to preserve that.
> +		 */
> +		if (task->flags & PF_NO_SETAFFINITY)
> +			continue;
>   		cpuset_attach_task(cs, task);
> +	}
>   
>   	/*
>   	 * Change mm for all threadgroup leaders. This is expensive and may
> diff --git a/kernel/kthread.c b/kernel/kthread.c
> index 77c44924cf54..2689eb67846e 100644
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -45,6 +45,7 @@ struct kthread_create_info
>   	int (*threadfn)(void *data);
>   	void *data;
>   	int node;
> +	bool move_to_root;
>   
>   	/* Result passed back to kthread_create() from kthreadd. */
>   	struct task_struct *result;
> @@ -409,6 +410,9 @@ static void kthread_affine_node(void)
>   	}
>   }
>   
> +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
> +		       bool threadgroup);
> +
>   static int kthread(void *_create)
>   {
>   	static const struct sched_param param = { .sched_priority = 0 };
> @@ -418,6 +422,7 @@ static int kthread(void *_create)
>   	void *data = create->data;
>   	struct completion *done;
>   	struct kthread *self;
> +	bool move_to_root = create->move_to_root;
>   	int ret;
>   
>   	self = to_kthread(current);
> @@ -454,6 +459,42 @@ static int kthread(void *_create)
>   
>   	self->started = 1;
>   
> +#ifdef CONFIG_CPUSETS
> +	/*
> +	 * With the kthreads in cgroup feature, kthreadd can be optionally put
> +	 * into a non-root cpuset (such that newly created kernel threads are
> +	 * automatically restricted). Certain kernel threads that must be in
> +	 * the root cpuset are moved to root here.
> +	 *
> +	 * This code is called after the schedule() above, thus kthread_bind
> +	 * or kthread_bind_mask should have already been called if present.
> +	 * PF_NO_SETAFFINITY set by these functions implicitly triggers the
> +	 * move to root action. It can also be explicitly triggered with the
> +	 * move_to_root flag.
> +	 *
> +	 * Potential races between the conditional and cgroup mutex lock:
> +	 *
> +	 * current can be out of root then moved into root before mutex lock,
> +	 * which is ok because cgroup_attach_task should be able to handle
> +	 * src == dst. There are checks in cgroup_migrate_prepare_dst etc.
> +	 *
> +	 * current can be in root then moved out of root before mutex lock,
> +	 * which is also ok: For threads with PF_NO_SETAFFINITY the move is
> +	 * disallowed so we can't have this race. For other threads, we allow
> +	 * users to move them out of the root cgroup and there is no guarantee
> +	 * on the order of actions.
> +	 */
> +	if ((current->flags & PF_NO_SETAFFINITY || move_to_root) &&
> +	  !task_css_is_root(current, cpuset_cgrp_id)) {
> +		mutex_lock(&cgroup_mutex);
> +		percpu_down_write(&cgroup_threadgroup_rwsem);
> +		if (cgroup_attach_task(&cpuset_cgrp_subsys.root->cgrp, current, false))
> +			WARN_ONCE(1, "Cannot move newly created kernel thread to root cpuset");
> +		percpu_up_write(&cgroup_threadgroup_rwsem);
> +		mutex_unlock(&cgroup_mutex);
> +	}
> +#endif
> +
>   	if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
>   		kthread_affine_node();
>   
> @@ -504,7 +545,8 @@ static __printf(4, 0)
>   struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
>   						    void *data, int node,
>   						    const char namefmt[],
> -						    va_list args)
> +						    va_list args,
> +						    bool move_to_root)
>   {
>   	DECLARE_COMPLETION_ONSTACK(done);
>   	struct task_struct *task;
> @@ -516,6 +558,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
>   	create->threadfn = threadfn;
>   	create->data = data;
>   	create->node = node;
> +	create->move_to_root = move_to_root;
>   	create->done = &done;
>   	create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
>   	if (!create->full_name) {
> @@ -585,14 +628,40 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
>   	va_list args;
>   
>   	va_start(args, namefmt);
> -	task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
> +	task = __kthread_create_on_node(threadfn, data, node, namefmt, args, false);
>   	va_end(args);
>   
>   	return task;
>   }
>   EXPORT_SYMBOL(kthread_create_on_node);
>   
> -static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
> +/*
> + * Move the newly created kthread to root cpuset if it is not already there.
> + * This happens if kthreadd is moved out of the root cpuset by the user.
> + * Otherwise this behaves the same as kthread_create_on_node().
> + */
> +struct task_struct *kthread_create_on_node_root_cpuset(
> +					   int (*threadfn)(void *data),
> +					   void *data, int node,
> +					   const char namefmt[],
> +					   ...)
> +
> +{
> +	struct task_struct *task;
> +	va_list args;
> +
> +	va_start(args, namefmt);
> +	task = __kthread_create_on_node(threadfn, data, node, namefmt, args, true);
> +	va_end(args);
> +
> +	return task;
> +}
> +EXPORT_SYMBOL(kthread_create_on_node_root_cpuset);
> +
> +
> +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask,
> +  unsigned int state, bool no_setaffinity)
> +
>   {
>   	unsigned long flags;
>   
> @@ -604,22 +673,28 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
>   	/* It's safe because the task is inactive. */
>   	raw_spin_lock_irqsave(&p->pi_lock, flags);
>   	do_set_cpus_allowed(p, mask);
> -	p->flags |= PF_NO_SETAFFINITY;
> +	if (no_setaffinity)
> +		p->flags |= PF_NO_SETAFFINITY;
>   	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
>   }
>   
>   static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
>   {
> -	__kthread_bind_mask(p, cpumask_of(cpu), state);
> +	__kthread_bind_mask(p, cpumask_of(cpu), state, true);
>   }
>   
>   void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
>   {
>   	struct kthread *kthread = to_kthread(p);
> -	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
> +	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE, true);
>   	WARN_ON_ONCE(kthread->started);
>   }
>   
> +void kthread_bind_mask_cpuset(struct task_struct *p, const struct cpumask *mask)
> +{
> +	set_cpus_allowed_ptr(p, mask);
> +}
> +
>   /**
>    * kthread_bind - bind a just-created kthread to a cpu.
>    * @p: thread created by kthread_create().
> @@ -1044,7 +1119,7 @@ __kthread_create_worker_on_node(unsigned int flags, int node,
>   	kthread_init_worker(worker);
>   
>   	task = __kthread_create_on_node(kthread_worker_fn, worker,
> -					node, namefmt, args);
> +					node, namefmt, args, true);
>   	if (IS_ERR(task))
>   		goto fail_task;
>   
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 54e7d63f7785..b604a8451ba3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2393,7 +2393,7 @@ void migrate_enable(void)
>   	struct task_struct *p = current;
>   	struct affinity_context ac = {
>   		.new_mask  = &p->cpus_mask,
> -		.flags     = SCA_MIGRATE_ENABLE,
> +		.flags     = SCA_MIGRATE_ENABLE | SCA_NO_CPUSET,
>   	};
>   
>   #ifdef CONFIG_DEBUG_PREEMPT
> @@ -3153,7 +3153,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
>    * task must not exit() & deallocate itself prematurely. The
>    * call is not atomic; no spinlocks may be held.
>    */
> -int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> +static int do_set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
>   {
>   	struct rq_flags rf;
>   	struct rq *rq;
> @@ -3171,6 +3171,79 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
>   	return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
>   }
>   
> +int __set_cpus_allowed_ptr(struct task_struct *p,
> +				  struct affinity_context *ctx)
The __set_cpus_allowed_ptr() function is almost the same as 
__sched_setaffinity(). Please break the moving and renaming parts out 
into a separate patch to make it easier to review.
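
To make the overlap concrete, both functions reduce to roughly the
skeleton below. This is a simplified sketch, not code from the patch:
the SCA_USER/user_cpus_ptr handling and the retry after a raced cpuset
update are left out, sketch_affinity_within_cpuset() is a made-up name,
and do_set_cpus_allowed_ptr() refers to the renamed low-level setter
introduced above.

static int sketch_affinity_within_cpuset(struct task_struct *p,
					 struct affinity_context *ctx)
{
	cpumask_var_t cpus_allowed, new_mask;
	int retval = -ENOMEM;

	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
		return -ENOMEM;
	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		goto out_free_cpus_allowed;

	/* Restrict the requested mask to what the task's cpuset permits. */
	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
	ctx->new_mask = new_mask;
	ctx->flags |= SCA_CHECK;

	retval = dl_task_check_affinity(p, new_mask);
	if (!retval)
		retval = do_set_cpus_allowed_ptr(p, ctx);

	/* Detect a cpuset update that raced with the change. */
	cpuset_cpus_allowed(p, cpus_allowed);
	if (!retval && !cpumask_subset(new_mask, cpus_allowed))
		retval = -EINVAL;

	free_cpumask_var(new_mask);
out_free_cpus_allowed:
	free_cpumask_var(cpus_allowed);
	return retval;
}

If the move/rename lands first as a no-op patch, the cpuset-aware
behaviour added on top becomes much easier to see in the diff.
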
> +{
> +	int retval;
> +	cpumask_var_t cpus_allowed, new_mask;
> +
> +	/*
> +	 * Don't restrict the thread to cpuset if explicitly specified or if locked.
> +	 */
> +	if ((ctx->flags & SCA_NO_CPUSET) || (p->flags & PF_NO_SETAFFINITY))
> +		return do_set_cpus_allowed_ptr(p, ctx);

Why would you allow a PF_NO_SETAFFINITY task to change its affinity? 
What exactly is the purpose of the SCA_NO_CPUSET flag?
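
For reference, the gate being questioned (from the hunk quoted just
above) is:

	if ((ctx->flags & SCA_NO_CPUSET) || (p->flags & PF_NO_SETAFFINITY))
		return do_set_cpus_allowed_ptr(p, ctx);

and elsewhere in the patch SCA_NO_CPUSET is passed in from
migrate_enable(), cpuset_attach_task() and
relax_compatible_cpus_allowed_ptr().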

Cheers,
Longman

> +
> +	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
> +		WARN_ONCE(!(ctx->flags & SCA_USER),
> +		  "Unable to restrict kernel thread to cpuset due to low memory");
> +		return -ENOMEM;
> +	}
> +
> +	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> +		WARN_ONCE(!(ctx->flags & SCA_USER),
> +		  "Unable to restrict kernel thread to cpuset due to low memory");
> +		retval = -ENOMEM;
> +		goto out_free_cpus_allowed;
> +	}
> +
> +	cpuset_cpus_allowed(p, cpus_allowed);
> +	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> +
> +	ctx->new_mask = new_mask;
> +	ctx->flags |= SCA_CHECK;
> +
> +	retval = dl_task_check_affinity(p, new_mask);
> +	if (retval)
> +		goto out_free_new_mask;
> +
> +	retval = do_set_cpus_allowed_ptr(p, ctx);
> +	if (retval)
> +		goto out_free_new_mask;
> +
> +	cpuset_cpus_allowed(p, cpus_allowed);
> +	if (!cpumask_subset(new_mask, cpus_allowed)) {
> +		/*
> +		 * We must have raced with a concurrent cpuset update.
> +		 * Just reset the cpumask to the cpuset's cpus_allowed.
> +		 */
> +		cpumask_copy(new_mask, cpus_allowed);
> +
> +		/*
> +		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> +		 * will restore the previous user_cpus_ptr value.
> +		 *
> +		 * In the unlikely event a previous user_cpus_ptr exists,
> +		 * we need to further restrict the mask to what is allowed
> +		 * by that old user_cpus_ptr.
> +		 */
> +		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> +			bool empty = !cpumask_and(new_mask, new_mask,
> +						  ctx->user_mask);
> +
> +			if (empty)
> +				cpumask_copy(new_mask, cpus_allowed);
> +		}
> +		__set_cpus_allowed_ptr(p, ctx);
> +		retval = -EINVAL;
> +	}
> +
> +out_free_new_mask:
> +	free_cpumask_var(new_mask);
> +out_free_cpus_allowed:
> +	free_cpumask_var(cpus_allowed);
> +	return retval;
> +}
> +
>   int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
>   {
>   	struct affinity_context ac = {
> @@ -3182,6 +3255,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
>   }
>   EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
>   
> +int set_cpus_allowed_ptr_flags(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
> +{
> +	struct affinity_context ac = {
> +		.new_mask  = new_mask,
> +		.flags     = flags,
> +	};
> +
> +	return __set_cpus_allowed_ptr(p, &ac);
> +}
> +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr_flags);
> +
>   /*
>    * Change a given task's CPU affinity to the intersection of its current
>    * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
> @@ -3283,15 +3367,15 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
>   {
>   	struct affinity_context ac = {
>   		.new_mask  = task_user_cpus(p),
> -		.flags     = 0,
> +		.flags     = SCA_NO_CPUSET,
>   	};
>   	int ret;
>   
>   	/*
> -	 * Try to restore the old affinity mask with __sched_setaffinity().
> +	 * Try to restore the old affinity mask with __set_cpus_allowed_ptr().
>   	 * Cpuset masking will be done there too.
>   	 */
> -	ret = __sched_setaffinity(p, &ac);
> +	ret = __set_cpus_allowed_ptr(p, &ac);
>   	WARN_ON_ONCE(ret);
>   }
>   
> @@ -7292,6 +7376,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
>   }
>   #endif
>   
> +
>   #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
>   int __sched __cond_resched(void)
>   {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 91bea8d0a90b..9833432c9a75 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2576,11 +2576,6 @@ static inline bool sched_fair_runnable(struct rq *rq)
>   extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
>   extern struct task_struct *pick_task_idle(struct rq *rq);
>   
> -#define SCA_CHECK		0x01
> -#define SCA_MIGRATE_DISABLE	0x02
> -#define SCA_MIGRATE_ENABLE	0x04
> -#define SCA_USER		0x08
> -
>   #ifdef CONFIG_SMP
>   
>   extern void update_group_capacity(struct sched_domain *sd, int cpu);
> @@ -3939,7 +3934,6 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
>   #endif /* !CONFIG_RT_MUTEXES */
>   
>   extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
> -extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
>   extern const struct sched_class *__setscheduler_class(int policy, int prio);
>   extern void set_load_weight(struct task_struct *p, bool update_load);
>   extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
> diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> index 547c1f05b667..6528153c1297 100644
> --- a/kernel/sched/syscalls.c
> +++ b/kernel/sched/syscalls.c
> @@ -1151,67 +1151,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
>   }
>   #endif /* CONFIG_SMP */
>   
> -int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
> -{
> -	int retval;
> -	cpumask_var_t cpus_allowed, new_mask;
> -
> -	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
> -		return -ENOMEM;
> -
> -	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> -		retval = -ENOMEM;
> -		goto out_free_cpus_allowed;
> -	}
> -
> -	cpuset_cpus_allowed(p, cpus_allowed);
> -	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> -
> -	ctx->new_mask = new_mask;
> -	ctx->flags |= SCA_CHECK;
> -
> -	retval = dl_task_check_affinity(p, new_mask);
> -	if (retval)
> -		goto out_free_new_mask;
> -
> -	retval = __set_cpus_allowed_ptr(p, ctx);
> -	if (retval)
> -		goto out_free_new_mask;
> -
> -	cpuset_cpus_allowed(p, cpus_allowed);
> -	if (!cpumask_subset(new_mask, cpus_allowed)) {
> -		/*
> -		 * We must have raced with a concurrent cpuset update.
> -		 * Just reset the cpumask to the cpuset's cpus_allowed.
> -		 */
> -		cpumask_copy(new_mask, cpus_allowed);
> -
> -		/*
> -		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> -		 * will restore the previous user_cpus_ptr value.
> -		 *
> -		 * In the unlikely event a previous user_cpus_ptr exists,
> -		 * we need to further restrict the mask to what is allowed
> -		 * by that old user_cpus_ptr.
> -		 */
> -		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> -			bool empty = !cpumask_and(new_mask, new_mask,
> -						  ctx->user_mask);
> -
> -			if (empty)
> -				cpumask_copy(new_mask, cpus_allowed);
> -		}
> -		__set_cpus_allowed_ptr(p, ctx);
> -		retval = -EINVAL;
> -	}
> -
> -out_free_new_mask:
> -	free_cpumask_var(new_mask);
> -out_free_cpus_allowed:
> -	free_cpumask_var(cpus_allowed);
> -	return retval;
> -}
> -
>   long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
>   {
>   	struct affinity_context ac;
> @@ -1252,7 +1191,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
>   		.flags     = SCA_USER,
>   	};
>   
> -	retval = __sched_setaffinity(p, &ac);
> +	retval = __set_cpus_allowed_ptr(p, &ac);
>   	kfree(ac.user_mask);
>   
>   	return retval;
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index f9ef467020cf..d51c0716674e 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -2813,7 +2813,10 @@ static struct worker *create_worker(struct worker_pool *pool)
>   		}
>   
>   		set_user_nice(worker->task, pool->attrs->nice);
> -		kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> +		if (!pool || (!worker->rescue_wq && pool->cpu >= 0))
> +			kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> +		else
> +			kthread_bind_mask_cpuset(worker->task, pool_allowed_cpus(pool));
>   	}
>   
>   	/* successful, attach the worker to the pool */
> @@ -5587,7 +5590,7 @@ static int init_rescuer(struct workqueue_struct *wq)
>   	if (wq->flags & WQ_UNBOUND)
>   		kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
>   	else
> -		kthread_bind_mask(rescuer->task, cpu_possible_mask);
> +		kthread_bind_mask_cpuset(rescuer->task, cpu_possible_mask);
>   	wake_up_process(rescuer->task);
>   
>   	return 0;
>

