Because sched_setaffinity() checks p->flags & PF_NO_SETAFFINITY without
locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it:

	__kthread_bind()
	  do_set_cpus_allowed()
						<SYSCALL>
						  sched_setaffinity()
						    if (p->flags & PF_NO_SETAFFINITY)
						    set_cpus_allowed_ptr()
	  p->flags |= PF_NO_SETAFFINITY

Fix the issue by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of
task_struct::{nr_,}cpus_allowed.

Cc: Tejun Heo
Cc: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
---
 include/linux/kthread.h |    1 +
 include/linux/sched.h   |    7 -------
 kernel/kthread.c        |   21 ++++++++++++++++++---
 kernel/sched/core.c     |   30 ++++++++++++++++++++++++++----
 kernel/workqueue.c      |   11 ++---------
 5 files changed, 47 insertions(+), 23 deletions(-)

--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cp
 })
 
 void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2215,13 +2215,6 @@ static inline void calc_load_enter_idle(
 static inline void calc_load_exit_idle(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
-	return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
 /*
  * Do not use outside of architecture code which knows its limitations.
  *
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_no
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
-	/* Must have done schedule() in kthread() before we set_task_cpu */
+	unsigned long flags;
+
 	if (!wait_task_inactive(p, state)) {
 		WARN_ON(1);
 		return;
 	}
+
 	/* It's safe because the task is inactive. */
-	do_set_cpus_allowed(p, cpumask_of(cpu));
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	do_set_cpus_allowed(p, mask);
 	p->flags |= PF_NO_SETAFFINITY;
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+	__kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+	__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
 }
 
 /**
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4044,6 +4044,9 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pi
 	return retval;
 }
 
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check);
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -4110,7 +4113,7 @@ long sched_setaffinity(pid_t pid, const
 	}
 #endif
 again:
-	retval = set_cpus_allowed_ptr(p, new_mask);
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
 
 	if (!retval) {
 		cpuset_cpus_allowed(p, cpus_allowed);
@@ -4638,7 +4641,8 @@ void init_idle(struct task_struct *idle,
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&rq->lock, flags);
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
 
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
@@ -4664,7 +4668,8 @@ void init_idle(struct task_struct *idle,
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
@@ -4788,6 +4793,8 @@ static struct rq *move_queued_task(struc
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
@@ -4818,7 +4825,8 @@ void do_set_cpus_allowed(struct task_str
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
 {
 	unsigned long flags;
 	struct rq *rq;
@@ -4827,6 +4835,15 @@ int set_cpus_allowed_ptr(struct task_str
 
 	rq = task_rq_lock(p, &flags);
 
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
@@ -4856,6 +4873,11 @@ int set_cpus_allowed_ptr(struct task_str
 
 	return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
 /*
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1619,11 +1619,7 @@ static void worker_attach_to_pool(struct
 {
 	mutex_lock(&pool->attach_mutex);
 
-	/*
-	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-	 * online CPUs.  It'll be re-applied when any of the CPUs come up.
-	 */
-	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+	kthread_bind_mask(worker->task, pool->attrs->cpumask);
 
 	/*
 	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -1708,9 +1704,6 @@ static struct worker *create_worker(stru
 
 	set_user_nice(worker->task, pool->attrs->nice);
 
-	/* prevent userland from meddling with cpumask of workqueue workers */
-	worker->task->flags |= PF_NO_SETAFFINITY;
-
 	/* successful, attach the worker to the pool */
 	worker_attach_to_pool(worker, pool);
 
@@ -3832,7 +3825,7 @@ struct workqueue_struct *__alloc_workque
 		}
 
 		wq->rescuer = rescuer;
-		rescuer->task->flags |= PF_NO_SETAFFINITY;
+		kthread_bind_mask(rescuer->task, cpu_possible_mask);
 		wake_up_process(rescuer->task);
 	}
 
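For readers who want to poke at the ordering outside the kernel, below is a
minimal userspace sketch of the same check-then-act race and of the fix
(re-checking the flag under the same lock that publishes it). This is not
part of the patch and is not kernel code: fake_task, bind_mask() and
set_affinity_*() are invented names, and the pthread mutex merely stands in
for p->pi_lock / the rq lock.

/*
 * Userspace sketch only -- NOT kernel code and not part of the patch.
 * fake_task, bind_mask() and set_affinity_*() are invented names; the
 * mutex stands in for p->pi_lock / the rq lock.
 */
#include <pthread.h>
#include <stdio.h>

#define PF_NO_SETAFFINITY	0x1	/* stand-in for the real flag */

struct fake_task {
	pthread_mutex_t lock;		/* "scheduler lock" */
	unsigned int flags;
	unsigned long cpus_allowed;	/* stand-in for the cpumask */
};

/* Pre-patch pattern: the flag is tested with no lock held, so a concurrent
 * bind_mask() can install mask+flag between the test and the store below,
 * and the store silently undoes the bind. */
int set_affinity_racy(struct fake_task *p, unsigned long mask)
{
	if (p->flags & PF_NO_SETAFFINITY)
		return -1;
	p->cpus_allowed = mask;		/* may clobber a fresh bind */
	return 0;
}

/* Post-patch pattern: re-check the flag under the same lock the binder
 * holds while publishing it, mirroring the new check=true path in
 * __set_cpus_allowed_ptr(). */
int set_affinity_checked(struct fake_task *p, unsigned long mask)
{
	int ret = 0;

	pthread_mutex_lock(&p->lock);
	if (p->flags & PF_NO_SETAFFINITY)
		ret = -1;
	else
		p->cpus_allowed = mask;
	pthread_mutex_unlock(&p->lock);
	return ret;
}

/* Binder side, mirroring __kthread_bind_mask(): mask and flag become
 * visible atomically with respect to set_affinity_checked(). */
void bind_mask(struct fake_task *p, unsigned long mask)
{
	pthread_mutex_lock(&p->lock);
	p->cpus_allowed = mask;
	p->flags |= PF_NO_SETAFFINITY;
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct fake_task t = { .lock = PTHREAD_MUTEX_INITIALIZER };

	bind_mask(&t, 0x1);			/* "kthread_bind_mask()" */
	if (set_affinity_checked(&t, 0xf))	/* "sched_setaffinity()" */
		printf("rejected, mask stays %#lx\n", t.cpus_allowed);
	return 0;
}

Build with "gcc -pthread"; the checked variant rejects the affinity change
once the bind has been published, which is exactly the property the lockless
flag test could not guarantee.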