[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251210155805.752523-1-hannes@cmpxchg.org>
Date: Wed, 10 Dec 2025 10:58:04 -0500
From: Johannes Weiner <hannes@...xchg.org>
To: Peter Zijlstra <peterz@...radead.org>,
Suren Baghdasaryan <surenb@...gle.com>,
Ingo Molnar <mingo@...nel.org>
Cc: Chengming Zhou <chengming.zhou@...ux.dev>,
Dietmar Eggemann <dietmar.eggemann@....com>,
John Stultz <jstultz@...gle.com>,
linux-kernel@...r.kernel.org
Subject: [PATCH 1/2] sched: psi: loosen clock sync between scheduler and aggregator
In the aggregator, explicitly catch races between state snooping and the
state concluding on the CPU by checking for sample underflows; then move
the clock reads out of the reader's seqcount protection.
This shrinks the critical section and allows switching the scheduler
side to looser (cheaper) clock sourcing in the next patch.
Suggested-by: Chengming Zhou <chengming.zhou@...ux.dev>
Signed-off-by: Johannes Weiner <hannes@...xchg.org>
---
kernel/sched/psi.c | 34 +++++++++++++++++++++++++++-------
1 file changed, 27 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 59fdb7ebbf22..4b7bf8eb46c2 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -285,7 +285,6 @@ static void get_recent_times(struct psi_group *group, int cpu,
/* Snapshot a coherent view of the CPU state */
do {
seq = psi_read_begin(cpu);
- now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
@@ -293,6 +292,9 @@ static void get_recent_times(struct psi_group *group, int cpu,
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
} while (psi_read_retry(cpu, seq));
+ if (state_mask)
+ now = cpu_clock(cpu);
+
/* Calculate state time deltas against the previous snapshot */
for (s = 0; s < NR_PSI_STATES; s++) {
u32 delta;
@@ -308,7 +310,22 @@ static void get_recent_times(struct psi_group *group, int cpu,
if (state_mask & (1 << s))
times[s] += now - state_start;
+ /*
+ * This snooping ahead can obviously race with the
+ * state concluding on the cpu. If we previously
+ * snooped to a time past where the state concludes,
+ * times[s] can now be behind times_prev[s].
+ *
+ * time_after32() would be the obvious choice, but
+ * S32_MAX is right around two seconds, which is the
+ * aggregation interval; if the aggregator gets
+ * delayed, there would be a risk of dismissing
+ * genuinely large samples. Use a larger margin.
+ */
delta = times[s] - groupc->times_prev[aggregator][s];
+ if (delta > psi_period + (psi_period >> 1))
+ delta = 0;
+
groupc->times_prev[aggregator][s] = times[s];
times[s] = delta;
@@ -908,16 +925,18 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
void psi_task_change(struct task_struct *task, int clear, int set)
{
- int cpu = task_cpu(task);
+ int cpu;
u64 now;
if (!task->pid)
return;
+ cpu = task_cpu(task);
+ now = cpu_clock(cpu);
+
psi_flags_change(task, clear, set);
psi_write_begin(cpu);
- now = cpu_clock(cpu);
for_each_group(group, task_psi_group(task))
psi_group_change(group, cpu, clear, set, now, true);
psi_write_end(cpu);
@@ -928,10 +947,9 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *common = NULL;
int cpu = task_cpu(prev);
- u64 now;
+ u64 now = cpu_clock(cpu);
psi_write_begin(cpu);
- now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
@@ -999,6 +1017,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
+
psi_write_end(cpu);
}
@@ -1027,9 +1046,9 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
rq->psi_irq_time = irq;
- psi_write_begin(cpu);
now = cpu_clock(cpu);
+ psi_write_begin(cpu);
for_each_group(group, task_psi_group(curr)) {
if (!group->enabled)
continue;
@@ -1234,8 +1253,9 @@ void psi_cgroup_restart(struct psi_group *group)
guard(rq_lock_irq)(cpu_rq(cpu));
- psi_write_begin(cpu);
now = cpu_clock(cpu);
+
+ psi_write_begin(cpu);
psi_group_change(group, cpu, 0, 0, now, true);
psi_write_end(cpu);
}
--
2.52.0
Powered by blists - more mailing lists