linux-kernel - [PATCH 04/27] cpu: Protect against concurrent isolated cpuset change

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <20250620152308.27492-5-frederic@kernel.org>
Date: Fri, 20 Jun 2025 17:22:45 +0200
From: Frederic Weisbecker <frederic@...nel.org>
To: LKML <linux-kernel@...r.kernel.org>
Cc: Frederic Weisbecker <frederic@...nel.org>,
	Ingo Molnar <mingo@...hat.com>,
	Marco Crivellari <marco.crivellari@...e.com>,
	Michal Hocko <mhocko@...e.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Tejun Heo <tj@...nel.org>,
	Thomas Gleixner <tglx@...utronix.de>,
	Vlastimil Babka <vbabka@...e.cz>,
	Waiman Long <longman@...hat.com>
Subject: [PATCH 04/27] cpu: Protect against concurrent isolated cpuset change

_cpu_down() is called through work_on_cpu() on a target contained
within the HK_TYPE_DOMAIN cpumask.

But that cpumask will soon also include the cpuset isolated
partitions, so some synchronization is needed to make sure that
the work_on_cpu() callback doesn't start or keep running on an isolated CPU.

Unfortunately housekeeping_lock() can't be taken before the call to
work_on_cpu() because _cpu_down() subsequently acquires cpu_hotplug_lock.
This would be a lock inversion:

   cpu_down()                                         cpuset
   ---------                                          ------
   percpu_down_read(&housekeeping_pcpu_lock);         percpu_down_read(&cpu_hotplug_lock);
   percpu_down_write(&cpu_hotplug_lock);              percpu_down_write(&housekeeping_pcpu_lock);

To solve this situation, write-lock the cpu_hotplug_lock around the call
to work_on_cpu(). This prevents cpuset from modifying the
housekeeping cpumask and therefore synchronizes against HK_TYPE_DOMAIN
cpumask changes.

Signed-off-by: Frederic Weisbecker <frederic@...nel.org>
---
 kernel/cpu.c | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a59e009e0be4..069fce6c7eae 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1398,8 +1398,8 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 }
 
 /* Requires cpu_add_remove_lock to be held */
-static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
-			   enum cpuhp_state target)
+static int __ref _cpu_down_locked(unsigned int cpu, int tasks_frozen,
+				  enum cpuhp_state target)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	int prev_state, ret = 0;
@@ -1410,8 +1410,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	if (!cpu_present(cpu))
 		return -EINVAL;
 
-	cpus_write_lock();
-
 	cpuhp_tasks_frozen = tasks_frozen;
 
 	prev_state = cpuhp_set_state(cpu, st, target);
@@ -1427,14 +1425,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 		 * return the error code..
 		 */
 		if (ret)
-			goto out;
+			return ret;
 
 		/*
 		 * We might have stopped still in the range of the AP hotplug
 		 * thread. Nothing to do anymore.
 		 */
 		if (st->state > CPUHP_TEARDOWN_CPU)
-			goto out;
+			return ret;
 
 		st->target = target;
 	}
@@ -1452,9 +1450,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 		}
 	}
 
-out:
-	cpus_write_unlock();
-	arch_smt_update();
 	return ret;
 }
 
@@ -1463,16 +1458,17 @@ struct cpu_down_work {
 	enum cpuhp_state	target;
 };
 
-static long __cpu_down_maps_locked(void *arg)
+static long __cpu_down_locked_work(void *arg)
 {
 	struct cpu_down_work *work = arg;
 
-	return _cpu_down(work->cpu, 0, work->target);
+	return _cpu_down_locked(work->cpu, 0, work->target);
 }
 
 static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 {
 	struct cpu_down_work work = { .cpu = cpu, .target = target, };
+	int err;
 
 	/*
 	 * If the platform does not support hotplug, report it explicitly to
@@ -1483,17 +1479,24 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
 	if (cpu_hotplug_disabled)
 		return -EBUSY;
 
+	err = -EBUSY;
+
 	/*
 	 * Ensure that the control task does not run on the to be offlined
 	 * CPU to prevent a deadlock against cfs_b->period_timer.
 	 * Also keep at least one housekeeping cpu onlined to avoid generating
-	 * an empty sched_domain span.
+	 * an empty sched_domain span. Hotplug must be locked already to prevent
+	 * cpusets from concurrently changing the housekeeping mask.
 	 */
+	cpus_write_lock();
 	for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
 		if (cpu != work.cpu)
-			return work_on_cpu(cpu, __cpu_down_maps_locked, &work);
+			err = work_on_cpu(cpu, __cpu_down_locked_work, &work);
 	}
-	return -EBUSY;
+	cpus_write_unlock();
+	arch_smt_update();
+
+	return err;
 }
 
 static int cpu_down(unsigned int cpu, enum cpuhp_state target)
@@ -1896,6 +1899,19 @@ void __init bringup_nonboot_cpus(unsigned int max_cpus)
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
 
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
+			    enum cpuhp_state target)
+{
+	int err;
+
+	cpus_write_lock();
+	err = _cpu_down_locked(cpu, tasks_frozen, target);
+	cpus_write_unlock();
+	arch_smt_update();
+
+	return err;
+}
+
 int freeze_secondary_cpus(int primary)
 {
 	int cpu, error = 0;
-- 
2.48.1