lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251210155805.752523-1-hannes@cmpxchg.org>
Date: Wed, 10 Dec 2025 10:58:04 -0500
From: Johannes Weiner <hannes@...xchg.org>
To: Peter Zijlstra <peterz@...radead.org>,
	Suren Baghdasaryan <surenb@...gle.com>,
	Ingo Molnar <mingo@...nel.org>
Cc: Chengming Zhou <chengming.zhou@...ux.dev>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	John Stultz <jstultz@...gle.com>,
	linux-kernel@...r.kernel.org
Subject: [PATCH 1/2] sched: psi: loosen clock sync between scheduler and aggregator

In the aggregator, explicitly catch races between snooping ahead on a
still-active state and that state concluding on the CPU by checking
for sample underflows; then move the clock reads out of the reader's
seqcount protection.

This shrinks the critical section and allows switching the scheduler
side to looser (cheaper) clock sourcing in the next patch.

Suggested-by: Chengming Zhou <chengming.zhou@...ux.dev>
Signed-off-by: Johannes Weiner <hannes@...xchg.org>
---
 kernel/sched/psi.c | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 59fdb7ebbf22..4b7bf8eb46c2 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -285,7 +285,6 @@ static void get_recent_times(struct psi_group *group, int cpu,
 	/* Snapshot a coherent view of the CPU state */
 	do {
 		seq = psi_read_begin(cpu);
-		now = cpu_clock(cpu);
 		memcpy(times, groupc->times, sizeof(groupc->times));
 		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
@@ -293,6 +292,9 @@ static void get_recent_times(struct psi_group *group, int cpu,
 			memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
 	} while (psi_read_retry(cpu, seq));
 
+	if (state_mask)
+		now = cpu_clock(cpu);
+
 	/* Calculate state time deltas against the previous snapshot */
 	for (s = 0; s < NR_PSI_STATES; s++) {
 		u32 delta;
@@ -308,7 +310,22 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		if (state_mask & (1 << s))
 			times[s] += now - state_start;
 
+		/*
+		 * This snooping ahead can obviously race with the
+		 * state concluding on the cpu. If we previously
+		 * snooped to a time past where the state concludes,
+		 * times[s] can now be behind times_prev[s].
+		 *
+		 * time_after32() would be the obvious choice, but
+		 * S32_MAX is right around two seconds, which is the
+		 * aggregation interval; if the aggregator gets
+		 * delayed, there would be a risk of dismissing
+		 * genuinely large samples. Use a larger margin.
+		 */
 		delta = times[s] - groupc->times_prev[aggregator][s];
+		if (delta > psi_period + (psi_period >> 1))
+			delta = 0;
+
 		groupc->times_prev[aggregator][s] = times[s];
 
 		times[s] = delta;
@@ -908,16 +925,18 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
 
 void psi_task_change(struct task_struct *task, int clear, int set)
 {
-	int cpu = task_cpu(task);
+	int cpu;
 	u64 now;
 
 	if (!task->pid)
 		return;
 
+	cpu = task_cpu(task);
+	now = cpu_clock(cpu);
+
 	psi_flags_change(task, clear, set);
 
 	psi_write_begin(cpu);
-	now = cpu_clock(cpu);
 	for_each_group(group, task_psi_group(task))
 		psi_group_change(group, cpu, clear, set, now, true);
 	psi_write_end(cpu);
@@ -928,10 +947,9 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 {
 	struct psi_group *common = NULL;
 	int cpu = task_cpu(prev);
-	u64 now;
+	u64 now = cpu_clock(cpu);
 
 	psi_write_begin(cpu);
-	now = cpu_clock(cpu);
 
 	if (next->pid) {
 		psi_flags_change(next, 0, TSK_ONCPU);
@@ -999,6 +1017,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 				psi_group_change(group, cpu, clear, set, now, wake_clock);
 		}
 	}
+
 	psi_write_end(cpu);
 }
 
@@ -1027,9 +1046,9 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
 		return;
 	rq->psi_irq_time = irq;
 
-	psi_write_begin(cpu);
 	now = cpu_clock(cpu);
 
+	psi_write_begin(cpu);
 	for_each_group(group, task_psi_group(curr)) {
 		if (!group->enabled)
 			continue;
@@ -1234,8 +1253,9 @@ void psi_cgroup_restart(struct psi_group *group)
 
 		guard(rq_lock_irq)(cpu_rq(cpu));
 
-		psi_write_begin(cpu);
 		now = cpu_clock(cpu);
+
+		psi_write_begin(cpu);
 		psi_group_change(group, cpu, 0, 0, now, true);
 		psi_write_end(cpu);
 	}
-- 
2.52.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ