linux-kernel - [PATCH v6 4/5] sched: Handle set_cpus_allowed_ptr() & sched

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220826010119.1265764-5-longman@redhat.com>
Date:   Thu, 25 Aug 2022 21:01:18 -0400
From:   Waiman Long <longman@...hat.com>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Valentin Schneider <vschneid@...hat.com>,
        Tejun Heo <tj@...nel.org>, Zefan Li <lizefan.x@...edance.com>,
        Johannes Weiner <hannes@...xchg.org>,
        Will Deacon <will@...nel.org>
Cc:     linux-kernel@...r.kernel.org,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Lai Jiangshan <jiangshanlai@...il.com>,
        Waiman Long <longman@...hat.com>
Subject: [PATCH v6 4/5] sched: Handle set_cpus_allowed_ptr() & sched_setaffinity() race

Racing is possible between set_cpus_allowed_ptr() and sched_setaffinity()
or between multiple sched_setaffinity() calls from different CPUs. To
resolve these race conditions, we need to update both user_cpus_ptr
and cpus_mask within a single lock critical section instead of in
separate ones. This requires moving the user_cpus_ptr update into
affine_move_task(), before it calls task_rq_unlock().

A new argument puser_mask is added to affine_move_task(),
__set_cpus_allowed_ptr_locked() and __set_cpus_allowed_ptr() to do that.

Ideally, user_cpus_ptr should only be updated if sched_setaffinity()
as a whole is successful. This patch, however, updates user_cpus_ptr as
soon as the first call to __set_cpus_allowed_ptr() succeeds. If
sched_setaffinity() races with a cpuset update, the subsequent calls
to __set_cpus_allowed_ptr() may fail, yet user_cpus_ptr will still have
been updated in this corner case.

Signed-off-by: Waiman Long <longman@...hat.com>
---
 kernel/sched/core.c | 66 ++++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1c2f548e5369..6cd1177fbcea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2199,7 +2199,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
 
 static int __set_cpus_allowed_ptr(struct task_struct *p,
 				  const struct cpumask *new_mask,
-				  u32 flags);
+				  u32 flags, struct cpumask **puser_mask);
 
 static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
 {
@@ -2249,7 +2249,7 @@ void migrate_enable(void)
 	 */
 	preempt_disable();
 	if (p->cpus_ptr != &p->cpus_mask)
-		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE, NULL);
 	/*
 	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
 	 * regular cpus_mask, otherwise things that race (eg.
@@ -2618,6 +2618,15 @@ void release_user_cpus_ptr(struct task_struct *p)
 	kfree(clear_user_cpus_ptr(p));
 }
 
+static inline void swap_user_cpus_ptr(struct task_struct *p,
+				      struct cpumask **puser_mask)
+{
+	if (!puser_mask)
+		return;
+
+	swap(p->user_cpus_ptr, *puser_mask);
+}
+
 /*
  * This function is wildly self concurrent; here be dragons.
  *
@@ -2693,9 +2702,12 @@ void release_user_cpus_ptr(struct task_struct *p)
  * Note that the above is safe vs a concurrent migrate_enable(), as any
  * pending affinity completion is preceded by an uninstallation of
  * p->migration_pending done with p->pi_lock held.
+ *
+ * If puser_mask is non-NULL, *puser_mask is swapped with the current
+ * user_cpus_ptr value if the operation succeeds.
  */
 static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
-			    int dest_cpu, unsigned int flags)
+			    int dest_cpu, unsigned int flags, struct cpumask **puser_mask)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
@@ -2722,6 +2734,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 			complete = true;
 		}
 
+		swap_user_cpus_ptr(p, puser_mask);
 		task_rq_unlock(rq, p, rf);
 
 		if (push_task) {
@@ -2793,6 +2806,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 		if (flags & SCA_MIGRATE_ENABLE)
 			p->migration_flags &= ~MDF_PUSH;
 
+		swap_user_cpus_ptr(p, puser_mask);
 		task_rq_unlock(rq, p, rf);
 
 		if (!stop_pending) {
@@ -2813,6 +2827,8 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 				complete = true;
 			}
 		}
+
+		swap_user_cpus_ptr(p, puser_mask);
 		task_rq_unlock(rq, p, rf);
 
 		if (complete)
@@ -2843,7 +2859,8 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
 					 const struct cpumask *new_mask,
 					 u32 flags,
 					 struct rq *rq,
-					 struct rq_flags *rf)
+					 struct rq_flags *rf,
+					 struct cpumask **puser_mask)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
@@ -2908,7 +2925,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
 
 	__do_set_cpus_allowed(p, new_mask, flags);
 
-	return affine_move_task(rq, p, rf, dest_cpu, flags);
+	return affine_move_task(rq, p, rf, dest_cpu, flags, puser_mask);
 
 out:
 	task_rq_unlock(rq, p, rf);
@@ -2926,7 +2943,8 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-				  const struct cpumask *new_mask, u32 flags)
+				  const struct cpumask *new_mask, u32 flags,
+				  struct cpumask **puser_mask)
 {
 	struct cpumask *alloc_mask = NULL;
 	struct rq_flags rf;
@@ -2934,8 +2952,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	int ret;
 
 	rq = task_rq_lock(p, &rf);
-	if (p->user_cpus_ptr) {
 
+	/*
+	 * user_cpus_ptr masking is skipped if puser_mask is defined.
+	 */
+	if (p->user_cpus_ptr && !puser_mask) {
 		/*
 		 * A scratch cpumask is allocated on the percpu runqueues
 		 * to enable additional masking with user_cpus_ptr. This
@@ -2958,7 +2979,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	}
 
 
-	ret = __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+	ret = __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf,
+					    puser_mask);
 	if (unlikely(alloc_mask))
 		kfree(alloc_mask);
 	return ret;
@@ -2966,7 +2988,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	return __set_cpus_allowed_ptr(p, new_mask, 0);
+	return __set_cpus_allowed_ptr(p, new_mask, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 
@@ -3004,7 +3026,7 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
 		goto err_unlock;
 	}
 
-	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+	return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf, NULL);
 
 err_unlock:
 	task_rq_unlock(rq, p, &rf);
@@ -3551,7 +3573,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 					 const struct cpumask *new_mask,
-					 u32 flags)
+					 u32 flags, struct cpumask **puser_mask)
 {
 	return set_cpus_allowed_ptr(p, new_mask);
 }
@@ -8109,29 +8131,25 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask, bool save
 		}
 		cpumask_copy(user_mask, mask);
 	}
-again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+
+	retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK,
+					user_mask ? &user_mask : NULL);
 	if (retval)
 		goto out_free_new_mask;
 
-	cpuset_cpus_allowed(p, cpus_allowed);
-	if (!cpumask_subset(new_mask, cpus_allowed)) {
+	for (;;) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (cpumask_subset(new_mask, cpus_allowed))
+			break;
+
 		/*
 		 * We must have raced with a concurrent cpuset update.
 		 * Just reset the cpumask to the cpuset's cpus_allowed.
 		 */
 		cpumask_copy(new_mask, cpus_allowed);
-		goto again;
+		retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK, NULL);
 	}
 
-	if (save_mask) {
-		unsigned long flags;
-
-		/* Use pi_lock to synchronize changes to user_cpus_ptr */
-		raw_spin_lock_irqsave(&p->pi_lock, flags);
-		swap(p->user_cpus_ptr, user_mask);
-		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-	}
 out_free_new_mask:
 	kfree(user_mask);
 	free_cpumask_var(new_mask);
-- 
2.31.1