[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241014144358.GB1021@cmpxchg.org>
Date: Mon, 14 Oct 2024 10:43:58 -0400
From: Johannes Weiner <hannes@...xchg.org>
To: Peter Zijlstra <peterz@...radead.org>
Cc: K Prateek Nayak <kprateek.nayak@....com>,
Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Suren Baghdasaryan <surenb@...gle.com>,
linux-kernel@...r.kernel.org,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Klaus Kudielka <klaus.kudielka@...il.com>,
Chris Bainbridge <chris.bainbridge@...il.com>,
"Linux regression tracking (Thorsten Leemhuis)" <regressions@...mhuis.info>,
"Gautham R. Shenoy" <gautham.shenoy@....com>,
Youssef Esmat <youssefesmat@...gle.com>,
Paul Menzel <pmenzel@...gen.mpg.de>,
Bert Karwatzki <spasswolf@....de>, regressions@...ts.linux.dev
Subject: Re: [PATCH 3/3] sched/core: Indicate a sched_delayed task was
migrated before wakeup
On Fri, Oct 11, 2024 at 12:39:58PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 11, 2024 at 06:08:03AM -0400, Johannes Weiner wrote:
> > Completely untested. But if it looks good, I'll send a proper patch.
>
> Sure. Thanks for doing this.
Ok here goes. Built, booted and runtime-tested.
---
>From 91f230caa0119877cb861047e1af1371bf00d908 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@...xchg.org>
Date: Fri, 11 Oct 2024 06:18:07 -0400
Subject: [PATCH] sched: psi: pass enqueue/dequeue flags to psi callbacks
directly
What psi needs to do on each enqueue and dequeue has gotten more
subtle, and the generic sched code trying to distill this into a bool
for the callbacks is awkward.
Pass the flags directly and let psi parse them. For that to work, the
#include "stats.h" (which has the psi callback implementations) needs
to be below the flag definitions in "sched.h". Move that section
further down, next to some of the other accounting stuff.
This also puts the ENQUEUE_SAVE/RESTORE branch behind the psi jump
label, slightly reducing overhead when PSI=y but runtime disabled.
Link: https://lore.kernel.org/lkml/20241011083323.GL17263@noisy.programming.kicks-ass.net/
Suggested-by: Peter Zijlstra <peterz@...radead.org>
Signed-off-by: Johannes Weiner <hannes@...xchg.org>
---
kernel/sched/core.c | 12 +++++-----
kernel/sched/sched.h | 56 ++++++++++++++++++++++----------------------
kernel/sched/stats.h | 29 +++++++++++++++--------
3 files changed, 53 insertions(+), 44 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 527502a86ff9..42cf181bf3ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2019,10 +2019,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p);
- if (!(flags & ENQUEUE_RESTORE)) {
+ psi_enqueue(p, flags);
+
+ if (!(flags & ENQUEUE_RESTORE))
sched_info_enqueue(rq, p);
- psi_enqueue(p, flags & ENQUEUE_MIGRATED);
- }
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
@@ -2039,10 +2039,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE)) {
+ if (!(flags & DEQUEUE_SAVE))
sched_info_dequeue(rq, p);
- psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
- }
+
+ psi_dequeue(p, flags);
/*
* Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..bbda03f17126 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2085,34 +2085,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
#endif /* CONFIG_SMP */
-#include "stats.h"
-
-#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
-
-extern void __sched_core_account_forceidle(struct rq *rq);
-
-static inline void sched_core_account_forceidle(struct rq *rq)
-{
- if (schedstat_enabled())
- __sched_core_account_forceidle(rq);
-}
-
-extern void __sched_core_tick(struct rq *rq);
-
-static inline void sched_core_tick(struct rq *rq)
-{
- if (sched_core_enabled(rq) && schedstat_enabled())
- __sched_core_tick(rq);
-}
-
-#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
-
-static inline void sched_core_account_forceidle(struct rq *rq) { }
-
-static inline void sched_core_tick(struct rq *rq) { }
-
-#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
-
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -3166,6 +3138,34 @@ extern void nohz_run_idle_balance(int cpu);
static inline void nohz_run_idle_balance(int cpu) { }
#endif
+#include "stats.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 767e098a3bd1..8ee0add5a48a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
* go through migration requeues. In this case, *sleeping* states need
* to be transferred.
*/
-static inline void psi_enqueue(struct task_struct *p, bool migrate)
+static inline void psi_enqueue(struct task_struct *p, int flags)
{
int clear = 0, set = 0;
if (static_branch_likely(&psi_disabled))
return;
+ /* Same runqueue, nothing changed for psi */
+ if (flags & ENQUEUE_RESTORE)
+ return;
+
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
- SCHED_WARN_ON(!migrate);
+ SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->in_iowait)
set |= TSK_IOWAIT;
- } else if (migrate) {
+ } else if (flags & ENQUEUE_MIGRATED) {
/* CPU migration of runnable task */
set = TSK_RUNNING;
if (p->in_memstall)
@@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate)
psi_task_change(p, clear, set);
}
-static inline void psi_dequeue(struct task_struct *p, bool migrate)
+static inline void psi_dequeue(struct task_struct *p, int flags)
{
if (static_branch_likely(&psi_disabled))
return;
- /*
- * When migrating a task to another CPU, clear all psi
- * state. The enqueue callback above will work it out.
- */
- if (migrate)
- psi_task_change(p, p->psi_flags, 0);
+ /* Same runqueue, nothing changed for psi */
+ if (flags & DEQUEUE_SAVE)
+ return;
/*
* A voluntary sleep is a dequeue followed by a task switch. To
@@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate)
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*/
+ if (flags & DEQUEUE_SLEEP)
+ return;
+
+ /*
+ * When migrating a task to another CPU, clear all psi
+ * state. The enqueue callback above will work it out.
+ */
+ psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
--
2.47.0
Powered by blists - more mailing lists