Message-ID: <20240613015837.4132703-1-jstultz@google.com>
Date: Wed, 12 Jun 2024 18:58:26 -0700
From: John Stultz <jstultz@...gle.com>
To: LKML <linux-kernel@...r.kernel.org>
Cc: John Stultz <jstultz@...gle.com>, Thomas Gleixner <tglx@...utronix.de>,
Frederic Weisbecker <frederic@...nel.org>, Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>, Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>, Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>, Valentin Schneider <vschneid@...hat.com>,
Qais Yousef <qyousef@...alina.io>, Joel Fernandes <joel@...lfernandes.org>, kernel-team@...roid.com
Subject: [PATCH] RFC: sched: Rework task_sched_runtime to avoid calling update_rq_clock
I recently got a bug report that
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) had regressed between
5.10 and 6.1. It's not a huge regression in absolute time
(~30-40ns), but it is a >10% change.
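
For reference, this sort of regression can be seen with a trivial
userspace loop along these lines (a rough sketch of the measurement,
not the exact reproducer from the bug report; the iteration count and
the use of CLOCK_MONOTONIC_RAW for the outer timing are arbitrary
choices):

  /* measure the average cost of clock_gettime(CLOCK_THREAD_CPUTIME_ID) */
  #include <stdio.h>
  #include <time.h>

  #define ITERS 1000000

  int main(void)
  {
          struct timespec start, end, ts;
          long long ns;
          int i;

          clock_gettime(CLOCK_MONOTONIC_RAW, &start);
          for (i = 0; i < ITERS; i++)
                  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
          clock_gettime(CLOCK_MONOTONIC_RAW, &end);

          ns = (end.tv_sec - start.tv_sec) * 1000000000LL +
               (end.tv_nsec - start.tv_nsec);
          printf("avg: %lld ns per call\n", ns / ITERS);
          return 0;
  }
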
I narrowed the cause down to the addition of
psi_account_irqtime() in update_rq_clock_task(), in commit
52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ
pressure")
So that explains the behavior change, but it also seems odd that
we're doing psi irq accounting from a syscall that is just
trying to read the thread's cputime.
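
For context, the call path in question is roughly the following (from
my reading of the 6.1 code; the intermediate posix-cpu-timers helpers
are from memory, so the exact names there may be slightly off):

  clock_gettime(CLOCK_THREAD_CPUTIME_ID)
    -> posix-cpu-timers code (cpu_clock_sample())
      -> task_sched_runtime()
        -> update_rq_clock()
          -> update_rq_clock_task()
            -> psi_account_irqtime()
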
Thinking about it more, it seems the re-use of update_rq_clock()
to handle accounting for any in-progress time for the current
task has the potential for side effects and unnecessary work.
So instead, rework the logic so we calculate the current task's
in-progress cpu runtime in a read-only fashion.
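
That is, rather than advancing rq->clock and folding the pending
irq/steal time into the rq's accounting just to read one task's
runtime, the in-progress delta is computed without writing any rq
state, roughly (see the patch below for the details):

  /* read-only: nothing in the rq is updated */
  delta_exec  = sched_clock_cpu(cpu_of(rq)) - p->se.exec_start;
  delta_exec -= get_irq_delta(rq, delta_exec);    /* pending irq time */
  delta_exec -= get_steal_time(rq, delta_exec);   /* paravirt steal time */
  ns = p->se.sum_exec_runtime + delta_exec;
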
This has the side benefit of improving
clock_gettime(CLOCK_THREAD_CPUTIME_ID,...) performance by ~12%
over the behavior in 5.10, and ~21% over the 6.1 behavior.
NOTE: I'm not 100% sure this is correct yet. There may be some
edge cases I've overlooked, so I'd greatly appreciate any
review or feedback.
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Frederic Weisbecker <frederic@...nel.org>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Juri Lelli <juri.lelli@...hat.com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Ben Segall <bsegall@...gle.com>
Cc: Mel Gorman <mgorman@...e.de>
Cc: Daniel Bristot de Oliveira <bristot@...hat.com>
Cc: Valentin Schneider <vschneid@...hat.com>
Cc: Qais Yousef <qyousef@...alina.io>
Cc: Joel Fernandes <joel@...lfernandes.org>
Cc: kernel-team@...roid.com
Signed-off-by: John Stultz <jstultz@...gle.com>
---
kernel/sched/core.c | 82 ++++++++++++++++++++++++++-------------------
1 file changed, 47 insertions(+), 35 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..b29cde5ded84 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -692,16 +692,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* RQ-clock updating methods:
*/
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
- s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
@@ -720,7 +715,45 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
*/
if (irq_delta > delta)
irq_delta = delta;
+ return irq_delta;
+}
+#else
+static inline s64 get_irq_delta(struct rq *rq, s64 delta)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+ s64 steal;
+ if (!static_key_false(¶virt_steal_rq_enabled))
+ return 0;
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+ if (unlikely(steal > delta))
+ steal = delta;
+ return steal;
+}
+#else
+static inline s64 get_steal_time(struct rq *rq, s64 delta)
+{
+ return 0;
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+ s64 __maybe_unused steal = 0, irq_delta = 0;
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = get_irq_delta(rq, delta);
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
@@ -728,12 +761,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((¶virt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
- steal -= rq->prev_steal_time_rq;
-
- if (unlikely(steal > delta))
- steal = delta;
-
+ steal = get_steal_time(rq, delta);
rq->prev_steal_time_rq += steal;
delta -= steal;
}
@@ -5547,23 +5575,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
-/*
- * The function fair_sched_class.update_curr accesses the struct curr
- * and its field curr->exec_start; when called from task_sched_runtime(),
- * we observe a high rate of cache misses in practice.
- * Prefetching this data results in improved performance.
- */
-static inline void prefetch_curr_exec_start(struct task_struct *p)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct sched_entity *curr = (&p->se)->cfs_rq->curr;
-#else
- struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
-#endif
- prefetch(curr);
- prefetch(&curr->exec_start);
-}
-
/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
@@ -5573,6 +5584,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
+ s64 delta_exec = 0;
u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
@@ -5598,11 +5610,11 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
- prefetch_curr_exec_start(p);
- update_rq_clock(rq);
- p->sched_class->update_curr(rq);
+ delta_exec = sched_clock_cpu(cpu_of(rq)) - p->se.exec_start;
+ delta_exec -= get_irq_delta(rq, delta_exec);
+ delta_exec -= get_steal_time(rq, delta_exec);
}
- ns = p->se.sum_exec_runtime;
+ ns = p->se.sum_exec_runtime + delta_exec;
task_rq_unlock(rq, p, &rf);
return ns;
--
2.45.2.505.gda0bf45e8d-goog