From 2e15180e18b51e9a2bc0d7050e915a70d2673a06 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak
Date: Fri, 4 Oct 2024 15:24:35 +0000
Subject: [RFC PATCH] sched/psi: Fixup PSI accounting with DELAY_DEQUEUE

After the merge of DELAY_DEQUEUE, "psi: inconsistent task state!"
warnings were seen early during boot.

The crux of the matter is that when a task is delayed, and the delayed
task is then migrated, the wakeup context may have no idea that the
task was moved off its previous runqueue. This is the same reason
psi_enqueue() considers only

	(flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)

as a wakeup. In case of a wakeup with migration, PSI forgoes clearing
the TSK_IOWAIT flag, which seems to be the issue behind the splat I
encountered previously.

With that said, the below diff, based on Peter's original approach,
currently seems to work for me: I have not seen the inconsistent state
warning for a while now with my stress test.

Two key points of the approach are:

o It uses "p->migration_flags" to indicate that a delayed entity has
  migrated to another runqueue, and to convey the same during
  psi_enqueue().

o It adds the ENQUEUE_WAKEUP flag alongside ENQUEUE_DELAYED for
  enqueue_task() in ttwu_runnable(), since psi_enqueue() needs to know
  of a wakeup without migration to clear the TSK_IOWAIT flag it would
  have set during psi_task_switch() for the blocking task. Going down
  the stack from enqueue_task_fair(), there seems to be no other
  observer of the ENQUEUE_WAKEUP flag than psi_enqueue() in the
  requeue path.

Suggested-by: Peter Zijlstra
Signed-off-by: K Prateek Nayak
---
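As a quick illustration of the intended flag flow (not part of the
patch itself), below is a small userspace sketch of the enqueue-side
decision. All names here (struct toy_task, toy_wakee_not_migrated(),
the on_rq_migrating field) are hypothetical stand-ins for task_struct,
task_on_rq_migrating() and the enqueue_task() logic in the diff; they
only mirror the intent, not the real kernel API:

	#include <stdbool.h>
	#include <stdio.h>

	#define ENQUEUE_WAKEUP		0x01
	#define ENQUEUE_MIGRATED	0x02
	#define DELAYED_MIGRATED	0x04	/* mirrors the new migration_flags bit */

	struct toy_task {
		unsigned int migration_flags;
		bool on_rq_migrating;	/* stand-in for task_on_rq_migrating() */
	};

	/* Mirrors the wakee_not_migrated computation in enqueue_task(). */
	static bool toy_wakee_not_migrated(struct toy_task *p, int flags)
	{
		bool wakee_not_migrated = (flags & ENQUEUE_WAKEUP) &&
					  !(flags & ENQUEUE_MIGRATED);

		/*
		 * A task that changed runqueues while delayed carries
		 * DELAYED_MIGRATED: treat its eventual wakeup as a
		 * migration so PSI does not clear TSK_IOWAIT against
		 * the wrong runqueue's state.
		 */
		if ((p->migration_flags & DELAYED_MIGRATED) && !p->on_rq_migrating) {
			wakee_not_migrated = false;
			p->migration_flags &= ~DELAYED_MIGRATED;
		}
		return wakee_not_migrated;
	}

	int main(void)
	{
		struct toy_task plain = { 0, false };
		struct toy_task moved = { DELAYED_MIGRATED, false };

		/* Delayed wakeup on the same runqueue: prints 1 (pure wakeup) */
		printf("%d\n", toy_wakee_not_migrated(&plain, ENQUEUE_WAKEUP));
		/* Wakeup of a task that migrated while delayed: prints 0 */
		printf("%d\n", toy_wakee_not_migrated(&moved, ENQUEUE_WAKEUP));
		return 0;
	}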
 kernel/sched/core.c  | 20 +++++++++++++++++---
 kernel/sched/sched.h |  1 +
 kernel/sched/stats.h | 10 ++++++++++
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 43e453ab7e20..885801432e9a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2009,12 +2009,19 @@ unsigned long get_wchan(struct task_struct *p)
 
 void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	bool wakee_not_migrated = (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED);
+
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
 	if (!(flags & ENQUEUE_RESTORE)) {
 		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
+
+		/* Notify PSI that the task was migrated in a delayed state before wakeup. */
+		if ((p->migration_flags & DELAYED_MIGRATED) && !task_on_rq_migrating(p)) {
+			wakee_not_migrated = false;
+			p->migration_flags &= ~DELAYED_MIGRATED;
+		}
 	}
 
 	p->sched_class->enqueue_task(rq, p, flags);
@@ -2023,6 +2030,8 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * ->sched_delayed.
 	 */
 	uclamp_rq_inc(rq, p);
+	if (!(flags & ENQUEUE_RESTORE))
+		psi_enqueue(p, wakee_not_migrated);
 
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
@@ -2042,6 +2051,9 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & DEQUEUE_SAVE)) {
 		sched_info_dequeue(rq, p);
 		psi_dequeue(p, flags & DEQUEUE_SLEEP);
+
+		if (p->se.sched_delayed && task_on_rq_migrating(p))
+			p->migration_flags |= DELAYED_MIGRATED;
 	}
 
 	/*
@@ -3733,7 +3745,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
 	if (task_on_rq_queued(p)) {
 		update_rq_clock(rq);
 		if (p->se.sched_delayed)
-			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
+			enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_WAKEUP | ENQUEUE_DELAYED);
 		if (!task_on_cpu(rq, p)) {
 			/*
 			 * When on_rq && !on_cpu the task is preempted, see if
@@ -6537,6 +6549,7 @@ static void __sched notrace __schedule(int sched_mode)
 	 * as a preemption by schedule_debug() and RCU.
 	 */
 	bool preempt = sched_mode > SM_NONE;
+	bool block = false;
 	unsigned long *switch_count;
 	unsigned long prev_state;
 	struct rq_flags rf;
@@ -6622,6 +6635,7 @@ static void __sched notrace __schedule(int sched_mode)
 			 * After this, schedule() must not care about p->state any more.
 			 */
 			block_task(rq, prev, flags);
+			block = true;
 		}
 		switch_count = &prev->nvcsw;
 	}
@@ -6667,7 +6681,7 @@ static void __sched notrace __schedule(int sched_mode)
 
 		migrate_disable_switch(rq, prev);
 		psi_account_irqtime(rq, prev, next);
-		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+		psi_sched_switch(prev, next, block);
 
 		trace_sched_switch(preempt, prev, next, prev_state);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..2dc2c4cb4f5f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1326,6 +1326,7 @@ static inline int cpu_of(struct rq *rq)
 }
 
 #define MDF_PUSH		0x01
+#define DELAYED_MIGRATED	0x02	/* Task was migrated when in DELAYED_DEQUEUE state */
 
 static inline bool is_migration_disabled(struct task_struct *p)
 {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 237780aa3c53..06a2c6d3ec1e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -129,6 +129,13 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/*
+	 * Delayed task is not ready to run yet!
+	 * Wait for a requeue before accounting.
+	 */
+	if (p->se.sched_delayed)
+		return;
+
 	if (p->in_memstall)
 		set |= TSK_MEMSTALL_RUNNING;
 
@@ -148,6 +155,9 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
+	/* Delayed task can only be dequeued for migration. */
+	WARN_ON_ONCE(p->se.sched_delayed && sleep);
+
 	/*
 	 * A voluntary sleep is a dequeue followed by a task switch. To
 	 * avoid walking all ancestors twice, psi_task_switch() handles
--
2.34.1