[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260128103514.GX171111@noisy.programming.kicks-ass.net>
Date: Wed, 28 Jan 2026 11:35:14 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Johannes Weiner <hannes@...xchg.org>
Cc: Suren Baghdasaryan <surenb@...gle.com>, Ingo Molnar <mingo@...nel.org>,
Chengming Zhou <chengming.zhou@...ux.dev>,
Dietmar Eggemann <dietmar.eggemann@....com>,
John Stultz <jstultz@...gle.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH resend 1/2] sched: psi: loosen clock sync between
scheduler and aggregator
On Wed, Jan 14, 2026 at 10:43:16AM -0500, Johannes Weiner wrote:
> In the aggregator, catch races between state snooping and task state
> conclusions explicitly by checking for sample underflows; then move
> the clock reads out of the reader's seqcount protection.
I'm utterly failing to make sense of this -- what?!
> This shrinks the critical section and allows switching the scheduler
> side to looser (cheaper) clock sourcing in the next patch.
>
> Suggested-by: Chengming Zhou <chengming.zhou@...ux.dev>
> Reviewed-by: Chengming Zhou <chengming.zhou@...ux.dev>
> Signed-off-by: Johannes Weiner <hannes@...xchg.org>
> ---
> kernel/sched/psi.c | 34 +++++++++++++++++++++++++++-------
> 1 file changed, 27 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 59fdb7ebbf22..4b7bf8eb46c2 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -285,7 +285,6 @@ static void get_recent_times(struct psi_group *group, int cpu,
> /* Snapshot a coherent view of the CPU state */
> do {
> seq = psi_read_begin(cpu);
> - now = cpu_clock(cpu);
> memcpy(times, groupc->times, sizeof(groupc->times));
> state_mask = groupc->state_mask;
> state_start = groupc->state_start;
> @@ -293,6 +292,9 @@ static void get_recent_times(struct psi_group *group, int cpu,
> memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
> } while (psi_read_retry(cpu, seq));
>
> + if (state_mask)
> + now = cpu_clock(cpu);
> +
So this clock read can now happen later... which results in a larger value of `now`
> /* Calculate state time deltas against the previous snapshot */
> for (s = 0; s < NR_PSI_STATES; s++) {
> u32 delta;
> @@ -308,7 +310,22 @@ static void get_recent_times(struct psi_group *group, int cpu,
> if (state_mask & (1 << s))
> times[s] += now - state_start;
>
which makes times[s] larger than it should be
> + /*
> + * This snooping ahead can obviously race with the
> + * state concluding on the cpu. If we previously
> + * snooped to a time past where the state concludes,
> + * times[s] can now be behind times_prev[s].
> + *
> + * time_after32() would be the obvious choice, but
> + * S32_MAX is right around two seconds, which is the
> + * aggregation interval; if the aggregator gets
> + * delayed, there would be a risk of dismissing
> + * genuinely large samples. Use a larger margin.
> + */
> delta = times[s] - groupc->times_prev[aggregator][s];
> + if (delta > psi_period + (psi_period >> 1))
> + delta = 0;
> +
This seems to check if times_prev is larger than times; confused again.
> groupc->times_prev[aggregator][s] = times[s];
It updates times_prev unconditionally, storing a potentially larger
value.
>
> times[s] = delta;
And stores the delta, which can be larger than it should be?
> @@ -908,16 +925,18 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
>
> void psi_task_change(struct task_struct *task, int clear, int set)
> {
> - int cpu = task_cpu(task);
> + int cpu;
> u64 now;
>
> if (!task->pid)
> return;
>
> + cpu = task_cpu(task);
> + now = cpu_clock(cpu);
> +
So this clock read happens earlier, yielding a smaller value.
> psi_flags_change(task, clear, set);
>
> psi_write_begin(cpu);
> - now = cpu_clock(cpu);
> for_each_group(group, task_psi_group(task))
> psi_group_change(group, cpu, clear, set, now, true);
> psi_write_end(cpu);
> @@ -928,10 +947,9 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> {
> struct psi_group *common = NULL;
> int cpu = task_cpu(prev);
> - u64 now;
> + u64 now = cpu_clock(cpu);
Same.
> psi_write_begin(cpu);
> - now = cpu_clock(cpu);
>
> if (next->pid) {
> psi_flags_change(next, 0, TSK_ONCPU);
> @@ -999,6 +1017,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> psi_group_change(group, cpu, clear, set, now, wake_clock);
> }
> }
> +
> psi_write_end(cpu);
> }
>
> @@ -1027,9 +1046,9 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
> return;
> rq->psi_irq_time = irq;
>
> - psi_write_begin(cpu);
> now = cpu_clock(cpu);
Same.
> + psi_write_begin(cpu);
> for_each_group(group, task_psi_group(curr)) {
> if (!group->enabled)
> continue;
> @@ -1234,8 +1253,9 @@ void psi_cgroup_restart(struct psi_group *group)
>
> guard(rq_lock_irq)(cpu_rq(cpu));
>
> - psi_write_begin(cpu);
> now = cpu_clock(cpu);
And same again.
> +
> + psi_write_begin(cpu);
> psi_group_change(group, cpu, 0, 0, now, true);
> psi_write_end(cpu);
> }
For all these we call psi_group_change(), which calls record_times()
which then sets ->state_start to a smaller value. Resulting in times
above to be larger still.
So this inflates delta and leaves me utterly confused.
Not only does the Changelog here not explain anything, this also very
much needs comments in the code, because the next time someone is going
to be reading this, they'll break their WTF'o'meter and probably the
next one they get too.
/me stomps off searching for where the heck he left his pile of spare
WTF'o'meters..
Powered by blists - more mailing lists