Message-ID: <CAOBoifguyy0zh==gN5XN7mHP28N3Lv1GSf5_KB49Ce101QTMGA@mail.gmail.com>
Date: Tue, 6 May 2025 16:15:16 -0700
From: Xi Wang <xii@...gle.com>
To: Waiman Long <llong@...hat.com>
Cc: linux-kernel@...r.kernel.org, cgroups@...r.kernel.org,
Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, David Rientjes <rientjes@...gle.com>, Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>, Tejun Heo <tj@...nel.org>,
Johannes Weiner <hannes@...xchg.org>, Michal Koutný <mkoutny@...e.com>,
Lai Jiangshan <jiangshanlai@...il.com>, Frederic Weisbecker <frederic@...nel.org>,
Vlastimil Babka <vbabka@...e.cz>, Dan Carpenter <dan.carpenter@...aro.org>, Chen Yu <yu.c.chen@...el.com>,
Kees Cook <kees@...nel.org>, Yu-Chun Lin <eleanor15x@...il.com>,
Thomas Gleixner <tglx@...utronix.de>, Mickaël Salaün <mic@...ikod.net>
Subject: Re: [RFC/PATCH] sched: Support moving kthreads into cpuset cgroups
On Tue, May 6, 2025 at 12:58 PM Waiman Long <llong@...hat.com> wrote:
>
> On 5/6/25 2:35 PM, Xi Wang wrote:
> > In theory we should be able to manage kernel tasks with cpuset
> > cgroups just like user tasks, which would be a flexible way to limit
> > interference with real-time and other sensitive workloads. This is,
> > however, not supported today: when setting cpu affinity for kthreads,
> > kernel code uses a simpler control path that leads directly to
> > __set_cpus_allowed_ptr or __kthread_bind_mask. Neither honors cpuset
> > restrictions.
> >
> > This patch adds cpuset support for kernel tasks by merging userspace
> > and kernel cpu affinity control paths and applying the same
> > restrictions to kthreads.
> >
> > The PF_NO_SETAFFINITY flag is still supported for tasks that have to
> > run with certain cpu affinities. The kernel ensures that kthreads with
> > this flag have their affinities locked and stay in the root cpuset:
> >
> > If userspace moves kthreadd out of the root cpuset (see example
> > below), a newly forked kthread will start in a non-root cgroup as
> > well. If PF_NO_SETAFFINITY is detected for the kthread, it moves
> > itself into the root cpuset before threadfn is called. This depends
> > on the kthread create -> kthread bind -> wake up sequence.
> >
> > Since kthreads are clones of kthreadd, the typical usage pattern is:
> >
> > Create a cpuset cgroup for kernel threads.
> >
> > Move kthreadd to that cgroup - all newly created kthreads are
> > automatically enrolled in that cgroup.
> >
> > Move all remaining unlocked (!PF_NO_SETAFFINITY) kthreads into that
> > group.
> >
> > After these steps, all unlocked kthreads are managed by the cgroup,
> > including current and future kthreads.
> >
> > Command line example:
> >
> > mkdir /sys/fs/cgroup/kernel
> > echo "+cpuset" > /sys/fs/cgroup/cgroup.subtree_control
> > echo "+cpuset" > /sys/fs/cgroup/kernel/cgroup.subtree_control
> >
> > ktd=`pgrep -x kthreadd`; echo "move kthreadd/$ktd first"; echo $ktd > /sys/fs/cgroup/kernel/cgroup.procs
> > kthreads=`ps -e -o pgrp= -o pid= | sed -ne 's/^ *0 *// p'`
> > for p in $kthreads; do echo "moving $p (ok to fail for locked kthreads)"; echo $p > /sys/fs/cgroup/kernel/cgroup.procs; done
> > echo 4-7 > /sys/fs/cgroup/kernel/cpuset.cpus
> >
> > Signed-off-by: Xi Wang <xii@...gle.com>
> > ---
> > include/linux/kthread.h | 10 ++++-
> > include/linux/sched.h | 11 +++++
> > kernel/cgroup/cpuset.c | 31 ++++++++++++--
> > kernel/kthread.c | 89 +++++++++++++++++++++++++++++++++++---
> > kernel/sched/core.c | 95 ++++++++++++++++++++++++++++++++++++++---
> > kernel/sched/sched.h | 6 ---
> > kernel/sched/syscalls.c | 63 +--------------------------
> > kernel/workqueue.c | 7 ++-
> > 8 files changed, 226 insertions(+), 86 deletions(-)
> >
> > diff --git a/include/linux/kthread.h b/include/linux/kthread.h
> > index 8d27403888ce..36215a30d7f7 100644
> > --- a/include/linux/kthread.h
> > +++ b/include/linux/kthread.h
> > @@ -13,6 +13,14 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
> > int node,
> > const char namefmt[], ...);
> >
> > +__printf(4, 5)
> > +struct task_struct *kthread_create_on_node_root_cpuset(
> > + int (*threadfn)(void *data),
> > + void *data,
> > + int node,
> > + const char namefmt[], ...);
> > +
> > +
> > /**
> > * kthread_create - create a kthread on the current node
> > * @threadfn: the function to run in the thread
> > @@ -27,7 +35,6 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
> > #define kthread_create(threadfn, data, namefmt, arg...) \
> > kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
> >
> > -
> > struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
> > void *data,
> > unsigned int cpu,
> > @@ -85,6 +92,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data,
> > void free_kthread_struct(struct task_struct *k);
> > void kthread_bind(struct task_struct *k, unsigned int cpu);
> > void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
> > +void kthread_bind_mask_cpuset(struct task_struct *k, const struct cpumask *mask);
> > int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask);
> > int kthread_stop(struct task_struct *k);
> > int kthread_stop_put(struct task_struct *k);
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 0782de6b20d5..45b912e21239 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1855,6 +1855,13 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
> > extern int task_can_attach(struct task_struct *p);
> > extern int dl_bw_alloc(int cpu, u64 dl_bw);
> > extern void dl_bw_free(int cpu, u64 dl_bw);
> > +
> > +#define SCA_CHECK 0x01
> > +#define SCA_MIGRATE_DISABLE 0x02
> > +#define SCA_MIGRATE_ENABLE 0x04
> > +#define SCA_USER 0x08
> > +#define SCA_NO_CPUSET 0x10
> > +
> > #ifdef CONFIG_SMP
> >
> > /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
> > @@ -1868,6 +1875,9 @@ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new
> > * Return: zero if successful, or a negative error code
> > */
> > extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
> > +extern int set_cpus_allowed_ptr_no_cpuset(struct task_struct *p, const struct cpumask *new_mask);
> > +extern int set_cpus_allowed_ptr_flags(
> > + struct task_struct *p, const struct cpumask *new_mask, u32 flags);
> > extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
> > extern void release_user_cpus_ptr(struct task_struct *p);
> > extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
> > @@ -1884,6 +1894,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
> > return -EINVAL;
> > return 0;
> > }
> > +
> > static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
> > {
> > if (src->user_cpus_ptr)
> > diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> > index d0143b3dce47..ef929b349da8 100644
> > --- a/kernel/cgroup/cpuset.c
> > +++ b/kernel/cgroup/cpuset.c
> > @@ -1128,6 +1128,13 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
> > while ((task = css_task_iter_next(&it))) {
> > const struct cpumask *possible_mask = task_cpu_possible_mask(task);
> >
> > + /*
> > + * See also cpuset_can_attach. A thread with the flag could temporarily
> > + * reside in a non-root cpuset. Don't change its affinity.
> > + */
> > + if (task->flags & PF_NO_SETAFFINITY)
> > + continue;
> > +
> > if (top_cs) {
> > /*
> > * Percpu kthreads in top_cpuset are ignored
> > @@ -3034,7 +3041,14 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
> > mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
> >
> > cgroup_taskset_for_each(task, css, tset) {
> > - ret = task_can_attach(task);
> > + /*
> > + * With the kthreads-in-cpuset feature, kthreadd can be moved to a
> > + * non-root cpuset. A PF_NO_SETAFFINITY kthread spawned there must
> > + * still be allowed to attach back to the root cpuset, so permit that here.
> > + */
> > + /* Locked (PF_NO_SETAFFINITY) tasks may only attach to the root cpuset */
> > + ret = (cs == &top_cpuset && (task->flags & PF_NO_SETAFFINITY)) ?
> > + 0 : task_can_attach(task);
> > if (ret)
> > goto out_unlock;
> >
> > @@ -3127,7 +3141,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
> > * can_attach beforehand should guarantee that this doesn't
> > * fail. TODO: have a better way to handle failure here
> > */
> > - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
> > + WARN_ON_ONCE(set_cpus_allowed_ptr_flags(task, cpus_attach, SCA_NO_CPUSET));
> >
> > cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
> > cpuset1_update_task_spread_flags(cs, task);
> > @@ -3164,8 +3178,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
> >
> > guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
> >
> > - cgroup_taskset_for_each(task, css, tset)
> > + cgroup_taskset_for_each(task, css, tset) {
> > + /*
> > + * See cpuset_can_attach.
> > + * With the kthreads-in-cpuset feature, kthreadd can be moved to a
> > + * non-root cpuset. A PF_NO_SETAFFINITY task may be spawned there and
> > + * then moved to root as it starts running. Don't reset the cpu
> > + * affinity in that case: the thread may already have been pinned to a
> > + * cpu with kthread_bind and that binding must be preserved.
> > + */
> > + if (task->flags & PF_NO_SETAFFINITY)
> > + continue;
> > cpuset_attach_task(cs, task);
> > + }
> >
> > /*
> > * Change mm for all threadgroup leaders. This is expensive and may
> > diff --git a/kernel/kthread.c b/kernel/kthread.c
> > index 77c44924cf54..2689eb67846e 100644
> > --- a/kernel/kthread.c
> > +++ b/kernel/kthread.c
> > @@ -45,6 +45,7 @@ struct kthread_create_info
> > int (*threadfn)(void *data);
> > void *data;
> > int node;
> > + bool move_to_root;
> >
> > /* Result passed back to kthread_create() from kthreadd. */
> > struct task_struct *result;
> > @@ -409,6 +410,9 @@ static void kthread_affine_node(void)
> > }
> > }
> >
> > +int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
> > + bool threadgroup);
> > +
> > static int kthread(void *_create)
> > {
> > static const struct sched_param param = { .sched_priority = 0 };
> > @@ -418,6 +422,7 @@ static int kthread(void *_create)
> > void *data = create->data;
> > struct completion *done;
> > struct kthread *self;
> > + bool move_to_root = create->move_to_root;
> > int ret;
> >
> > self = to_kthread(current);
> > @@ -454,6 +459,42 @@ static int kthread(void *_create)
> >
> > self->started = 1;
> >
> > +#ifdef CONFIG_CPUSETS
> > + /*
> > + * With the kthreads-in-cpuset feature, kthreadd can optionally be put
> > + * into a non-root cpuset (so that newly created kernel threads are
> > + * automatically restricted). Kernel threads that must be in the root
> > + * cpuset are moved back to root here.
> > + *
> > + * This code is called after the schedule() above, thus kthread_bind
> > + * or kthread_bind_mask should have already been called if present.
> > + * PF_NO_SETAFFINITY set by these functions implicitly triggers the
> > + * move to root action. It can also be explicitly triggered with the
> > + * move_to_root flag.
> > + *
> > + * Potential races between the conditional and cgroup mutex lock:
> > + *
> > + * current can be out of root then moved into root before mutex lock,
> > + * which is ok because cgroup_attach_task should be able to handle
> > + * src == dst. There are checks in cgroup_migrate_prepare_dst etc.
> > + *
> > + * current can be in root then moved out of root before mutex lock,
> > + * which is also ok: For threads with PF_NO_SETAFFINITY the move is
> > + * disallowed so we can't have this race. For other threads, we allow
> > + * users to move them out of the root cgroup and there is no guarantee
> > + * on the order of actions.
> > + */
> > + if ((current->flags & PF_NO_SETAFFINITY || move_to_root) &&
> > + !task_css_is_root(current, cpuset_cgrp_id)) {
> > + mutex_lock(&cgroup_mutex);
> > + percpu_down_write(&cgroup_threadgroup_rwsem);
> > + if (cgroup_attach_task(&cpuset_cgrp_subsys.root->cgrp, current, false))
> > + WARN_ONCE(1, "Cannot move newly created kernel thread to root cpuset");
> > + percpu_up_write(&cgroup_threadgroup_rwsem);
> > + mutex_unlock(&cgroup_mutex);
> > + }
> > +#endif
> > +
> > if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
> > kthread_affine_node();
> >
> > @@ -504,7 +545,8 @@ static __printf(4, 0)
> > struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
> > void *data, int node,
> > const char namefmt[],
> > - va_list args)
> > + va_list args,
> > + bool move_to_root)
> > {
> > DECLARE_COMPLETION_ONSTACK(done);
> > struct task_struct *task;
> > @@ -516,6 +558,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
> > create->threadfn = threadfn;
> > create->data = data;
> > create->node = node;
> > + create->move_to_root = move_to_root;
> > create->done = &done;
> > create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
> > if (!create->full_name) {
> > @@ -585,14 +628,40 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
> > va_list args;
> >
> > va_start(args, namefmt);
> > - task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
> > + task = __kthread_create_on_node(threadfn, data, node, namefmt, args, false);
> > va_end(args);
> >
> > return task;
> > }
> > EXPORT_SYMBOL(kthread_create_on_node);
> >
> > -static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
> > +/*
> > + * Move the newly created kthread to the root cpuset if it is not already
> > + * there. This happens if kthreadd has been moved out of the root cpuset by
> > + * userspace. Otherwise this behaves like the regular version.
> > + */
> > +struct task_struct *kthread_create_on_node_root_cpuset(
> > + int (*threadfn)(void *data),
> > + void *data, int node,
> > + const char namefmt[],
> > + ...)
> > +
> > +{
> > + struct task_struct *task;
> > + va_list args;
> > +
> > + va_start(args, namefmt);
> > + task = __kthread_create_on_node(threadfn, data, node, namefmt, args, true);
> > + va_end(args);
> > +
> > + return task;
> > +}
> > +EXPORT_SYMBOL(kthread_create_on_node_root_cpuset);
> > +
> > +
> > +static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask,
> > + unsigned int state, bool no_setaffinity)
> > +
> > {
> > unsigned long flags;
> >
> > @@ -604,22 +673,28 @@ static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mas
> > /* It's safe because the task is inactive. */
> > raw_spin_lock_irqsave(&p->pi_lock, flags);
> > do_set_cpus_allowed(p, mask);
> > - p->flags |= PF_NO_SETAFFINITY;
> > + if (no_setaffinity)
> > + p->flags |= PF_NO_SETAFFINITY;
> > raw_spin_unlock_irqrestore(&p->pi_lock, flags);
> > }
> >
> > static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
> > {
> > - __kthread_bind_mask(p, cpumask_of(cpu), state);
> > + __kthread_bind_mask(p, cpumask_of(cpu), state, true);
> > }
> >
> > void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
> > {
> > struct kthread *kthread = to_kthread(p);
> > - __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
> > + __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE, true);
> > WARN_ON_ONCE(kthread->started);
> > }
> >
> > +void kthread_bind_mask_cpuset(struct task_struct *p, const struct cpumask *mask)
> > +{
> > + set_cpus_allowed_ptr(p, mask);
> > +}
> > +
> > /**
> > * kthread_bind - bind a just-created kthread to a cpu.
> > * @p: thread created by kthread_create().
> > @@ -1044,7 +1119,7 @@ __kthread_create_worker_on_node(unsigned int flags, int node,
> > kthread_init_worker(worker);
> >
> > task = __kthread_create_on_node(kthread_worker_fn, worker,
> > - node, namefmt, args);
> > + node, namefmt, args, true);
> > if (IS_ERR(task))
> > goto fail_task;
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 54e7d63f7785..b604a8451ba3 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -2393,7 +2393,7 @@ void migrate_enable(void)
> > struct task_struct *p = current;
> > struct affinity_context ac = {
> > .new_mask = &p->cpus_mask,
> > - .flags = SCA_MIGRATE_ENABLE,
> > + .flags = SCA_MIGRATE_ENABLE | SCA_NO_CPUSET,
> > };
> >
> > #ifdef CONFIG_DEBUG_PREEMPT
> > @@ -3153,7 +3153,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
> > * task must not exit() & deallocate itself prematurely. The
> > * call is not atomic; no spinlocks may be held.
> > */
> > -int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> > +static int do_set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> > {
> > struct rq_flags rf;
> > struct rq *rq;
> > @@ -3171,6 +3171,79 @@ int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
> > return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
> > }
> >
> > +int __set_cpus_allowed_ptr(struct task_struct *p,
> > + struct affinity_context *ctx)
> The __set_cpus_allowed_ptr() function is almost the same as
> __sched_setaffinity(). Please break the moving and renaming parts out
> into a separate patch to make it easier to review.
> > +{
> > + int retval;
> > + cpumask_var_t cpus_allowed, new_mask;
> > +
> > + /*
> > + * Don't apply the cpuset restriction if explicitly requested or if the task is locked.
> > + */
> > + if ((ctx->flags & SCA_NO_CPUSET) || (p->flags & PF_NO_SETAFFINITY))
> > + return do_set_cpus_allowed_ptr(p, ctx);
>
> Why will you allow a PF_NO_SETAFFINITY task to change its affinity?
> What exactly is the purpose of the SCA_NO_CPUSET flag?
A PF_NO_SETAFFINITY task still needs to have its affinity set once after
creation: kthread_create doesn't take an affinity parameter, so these tasks go
through the kthread_create -> kthread_bind -> wake up sequence.
SCA_NO_CPUSET means skip the cpuset-based restriction, i.e. behave exactly as
the code does without this patch - a passthrough mode for __set_cpus_allowed_ptr.
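
To make that sequence concrete, here is a minimal sketch of the existing
bring-up pattern for a pinned kthread (worker_fn and start_pinned_worker are
placeholder names for illustration, not code from this patch):

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Placeholder threadfn for illustration only. */
static int worker_fn(void *data)
{
	while (!kthread_should_stop())
		msleep_interruptible(100);
	return 0;
}

/* Hypothetical helper showing the create -> bind -> wake up order. */
static void start_pinned_worker(unsigned int cpu)
{
	struct task_struct *t;

	t = kthread_create(worker_fn, NULL, "pinned_worker/%u", cpu);
	if (IS_ERR(t))
		return;
	kthread_bind(t, cpu);	/* sets the cpu mask and PF_NO_SETAFFINITY */
	wake_up_process(t);	/* the thread enters threadfn only after this */
}

The one-shot bind between create and wake up is the affinity change I mean
above; SCA_NO_CPUSET simply marks the internal callers that should keep the
pre-patch passthrough behavior.
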
> > +
> > + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
> > + WARN_ONCE(!(ctx->flags & SCA_USER),
> > + "Unable to restrict kernel thread to cpuset due to low memory");
> > + return -ENOMEM;
> > + }
> > +
> > + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> > + WARN_ONCE(!(ctx->flags & SCA_USER),
> > + "Unable to restrict kernel thread to cpuset due to low memory");
> > + retval = -ENOMEM;
> > + goto out_free_cpus_allowed;
> > + }
> > +
> > + cpuset_cpus_allowed(p, cpus_allowed);
> > + cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> > +
> > + ctx->new_mask = new_mask;
> > + ctx->flags |= SCA_CHECK;
> > +
> > + retval = dl_task_check_affinity(p, new_mask);
> > + if (retval)
> > + goto out_free_new_mask;
> > +
> > + retval = do_set_cpus_allowed_ptr(p, ctx);
> > + if (retval)
> > + goto out_free_new_mask;
> > +
> > + cpuset_cpus_allowed(p, cpus_allowed);
> > + if (!cpumask_subset(new_mask, cpus_allowed)) {
> > + /*
> > + * We must have raced with a concurrent cpuset update.
> > + * Just reset the cpumask to the cpuset's cpus_allowed.
> > + */
> > + cpumask_copy(new_mask, cpus_allowed);
> > +
> > + /*
> > + * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> > + * will restore the previous user_cpus_ptr value.
> > + *
> > + * In the unlikely event a previous user_cpus_ptr exists,
> > + * we need to further restrict the mask to what is allowed
> > + * by that old user_cpus_ptr.
> > + */
> > + if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> > + bool empty = !cpumask_and(new_mask, new_mask,
> > + ctx->user_mask);
> > +
> > + if (empty)
> > + cpumask_copy(new_mask, cpus_allowed);
> > + }
> > + __set_cpus_allowed_ptr(p, ctx);
> > + retval = -EINVAL;
> > + }
> > +
> > +out_free_new_mask:
> > + free_cpumask_var(new_mask);
> > +out_free_cpus_allowed:
> > + free_cpumask_var(cpus_allowed);
> > + return retval;
> > +}
> > +
> > int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> > {
> > struct affinity_context ac = {
> > @@ -3182,6 +3255,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> > }
> > EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
> >
> > +int set_cpus_allowed_ptr_flags(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
> > +{
> > + struct affinity_context ac = {
> > + .new_mask = new_mask,
> > + .flags = flags,
> > + };
> > +
> > + return __set_cpus_allowed_ptr(p, &ac);
> > +}
> > +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr_flags);
> > +
> > /*
> > * Change a given task's CPU affinity to the intersection of its current
> > * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
> > @@ -3283,15 +3367,15 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
> > {
> > struct affinity_context ac = {
> > .new_mask = task_user_cpus(p),
> > - .flags = 0,
> > + .flags = SCA_NO_CPUSET,
> > };
> > int ret;
> >
> > /*
> > - * Try to restore the old affinity mask with __sched_setaffinity().
> > + * Try to restore the old affinity mask with __set_cpus_allowed_ptr().
> > * Cpuset masking will be done there too.
> > */
> > - ret = __sched_setaffinity(p, &ac);
> > + ret = __set_cpus_allowed_ptr(p, &ac);
> > WARN_ON_ONCE(ret);
> > }
> >
> > @@ -7292,6 +7376,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
> > }
> > #endif
> >
> > +
> > #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
> > int __sched __cond_resched(void)
> > {
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 91bea8d0a90b..9833432c9a75 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2576,11 +2576,6 @@ static inline bool sched_fair_runnable(struct rq *rq)
> > extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
> > extern struct task_struct *pick_task_idle(struct rq *rq);
> >
> > -#define SCA_CHECK 0x01
> > -#define SCA_MIGRATE_DISABLE 0x02
> > -#define SCA_MIGRATE_ENABLE 0x04
> > -#define SCA_USER 0x08
> > -
> > #ifdef CONFIG_SMP
> >
> > extern void update_group_capacity(struct sched_domain *sd, int cpu);
> > @@ -3939,7 +3934,6 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
> > #endif /* !CONFIG_RT_MUTEXES */
> >
> > extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
> > -extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
> > extern const struct sched_class *__setscheduler_class(int policy, int prio);
> > extern void set_load_weight(struct task_struct *p, bool update_load);
> > extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
> > diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
> > index 547c1f05b667..6528153c1297 100644
> > --- a/kernel/sched/syscalls.c
> > +++ b/kernel/sched/syscalls.c
> > @@ -1151,67 +1151,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
> > }
> > #endif /* CONFIG_SMP */
> >
> > -int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
> > -{
> > - int retval;
> > - cpumask_var_t cpus_allowed, new_mask;
> > -
> > - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
> > - return -ENOMEM;
> > -
> > - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
> > - retval = -ENOMEM;
> > - goto out_free_cpus_allowed;
> > - }
> > -
> > - cpuset_cpus_allowed(p, cpus_allowed);
> > - cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> > -
> > - ctx->new_mask = new_mask;
> > - ctx->flags |= SCA_CHECK;
> > -
> > - retval = dl_task_check_affinity(p, new_mask);
> > - if (retval)
> > - goto out_free_new_mask;
> > -
> > - retval = __set_cpus_allowed_ptr(p, ctx);
> > - if (retval)
> > - goto out_free_new_mask;
> > -
> > - cpuset_cpus_allowed(p, cpus_allowed);
> > - if (!cpumask_subset(new_mask, cpus_allowed)) {
> > - /*
> > - * We must have raced with a concurrent cpuset update.
> > - * Just reset the cpumask to the cpuset's cpus_allowed.
> > - */
> > - cpumask_copy(new_mask, cpus_allowed);
> > -
> > - /*
> > - * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
> > - * will restore the previous user_cpus_ptr value.
> > - *
> > - * In the unlikely event a previous user_cpus_ptr exists,
> > - * we need to further restrict the mask to what is allowed
> > - * by that old user_cpus_ptr.
> > - */
> > - if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
> > - bool empty = !cpumask_and(new_mask, new_mask,
> > - ctx->user_mask);
> > -
> > - if (empty)
> > - cpumask_copy(new_mask, cpus_allowed);
> > - }
> > - __set_cpus_allowed_ptr(p, ctx);
> > - retval = -EINVAL;
> > - }
> > -
> > -out_free_new_mask:
> > - free_cpumask_var(new_mask);
> > -out_free_cpus_allowed:
> > - free_cpumask_var(cpus_allowed);
> > - return retval;
> > -}
> > -
> > long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
> > {
> > struct affinity_context ac;
> > @@ -1252,7 +1191,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
> > .flags = SCA_USER,
> > };
> >
> > - retval = __sched_setaffinity(p, &ac);
> > + retval = __set_cpus_allowed_ptr(p, &ac);
> > kfree(ac.user_mask);
> >
> > return retval;
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index f9ef467020cf..d51c0716674e 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -2813,7 +2813,10 @@ static struct worker *create_worker(struct worker_pool *pool)
> > }
> >
> > set_user_nice(worker->task, pool->attrs->nice);
> > - kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> > + if (!pool || (!worker->rescue_wq && pool->cpu >= 0))
> > + kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
> > + else
> > + kthread_bind_mask_cpuset(worker->task, pool_allowed_cpus(pool));
> > }
> >
> > /* successful, attach the worker to the pool */
> > @@ -5587,7 +5590,7 @@ static int init_rescuer(struct workqueue_struct *wq)
> > if (wq->flags & WQ_UNBOUND)
> > kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
> > else
> > - kthread_bind_mask(rescuer->task, cpu_possible_mask);
> > + kthread_bind_mask_cpuset(rescuer->task, cpu_possible_mask);
> > wake_up_process(rescuer->task);
> >
> > return 0;
> >
>