[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250125052521.19487-4-wuyun.abel@bytedance.com>
Date: Sat, 25 Jan 2025 13:25:12 +0800
From: Abel Wu <wuyun.abel@...edance.com>
To: Tejun Heo <tj@...nel.org>,
Johannes Weiner <hannes@...xchg.org>,
Michal Koutný <mkoutny@...e.com>,
Jonathan Corbet <corbet@....net>,
Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Yury Norov <yury.norov@...il.com>,
Abel Wu <wuyun.abel@...edance.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Bitao Hu <yaoma@...ux.alibaba.com>,
Chen Ridong <chenridong@...wei.com>
Cc: cgroups@...r.kernel.org (open list:CONTROL GROUP (CGROUP)),
linux-doc@...r.kernel.org (open list:DOCUMENTATION),
linux-kernel@...r.kernel.org (open list)
Subject: [PATCH v2 3/3] cgroup/rstat: Add run_delay accounting for cgroups
Per-task and per-CPU run delay is already tracked by
t->sched_info.run_delay and rq->rq_sched_info.run_delay respectively.
Extend this accounting to cover cgroups as well.
The PSI indicator, "some" of cpu.pressure, gives no insight into how
severely a cgroup is stalled. Whether 100 tasks or just 1 task is
stalled at a certain point makes no difference to the "some" pressure.
IOW "some" is a flat value that is not weighted by the severity (e.g.
the number of stalled tasks). The cgroup's run_delay, accumulated from
each task's own delay, reflects that severity.
Only cgroup v2 is supported. Similar to the task accounting, the cgroup
accounting requires that CONFIG_SCHED_INFO is enabled.
Signed-off-by: Abel Wu <wuyun.abel@...edance.com>
---
Documentation/admin-guide/cgroup-v2.rst | 1 +
include/linux/cgroup-defs.h | 3 +++
include/linux/kernel_stat.h | 14 ++++++++++++++
kernel/cgroup/rstat.c | 17 +++++++++++++++++
kernel/sched/cputime.c | 12 ++++++++++++
kernel/sched/stats.h | 2 ++
6 files changed, 49 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 315ede811c9d..440c3800c49c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1100,6 +1100,7 @@ All time durations are in microseconds.
- usage_usec
- user_usec
- system_usec
+ - run_delay_usec (requires CONFIG_SCHED_INFO)
and the following five when the controller is enabled:
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..287366e60414 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -328,6 +328,9 @@ struct cgroup_base_stat {
u64 forceidle_sum;
#endif
u64 ntime;
+#ifdef CONFIG_SCHED_INFO
+ u64 run_delay;
+#endif
};
/*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..256b1a55de62 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -31,6 +31,15 @@ enum cpu_usage_stat {
CPUTIME_FORCEIDLE,
#endif
NR_STATS,
+
+#ifdef CONFIG_SCHED_INFO
+ /*
+ * Instead of cputime, run_delay is tracked through
+ * sched_info by task and rq, so there is no need to
+ * enlarge the cpustat[] array.
+ */
+ CPUTIME_RUN_DELAY,
+#endif
};
struct kernel_cpustat {
@@ -141,4 +150,9 @@ extern void account_idle_ticks(unsigned long ticks);
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif
+#ifdef CONFIG_SCHED_INFO
+extern void account_run_delay_time(struct task_struct *tsk, u64 delta);
+extern u64 get_cpu_run_delay(int cpu);
+#endif
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index dc6acab00d69..c7f9397a714e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
dst_bstat->ntime += src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay += src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
dst_bstat->ntime -= src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay -= src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_FORCEIDLE:
rstatc->bstat.forceidle_sum += delta_exec;
break;
+#endif
+#ifdef CONFIG_SCHED_INFO
+ case CPUTIME_RUN_DELAY:
+ rstatc->bstat.run_delay += delta_exec;
+ break;
#endif
default:
break;
@@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
bstat->ntime += cpustat[CPUTIME_NICE];
+#ifdef CONFIG_SCHED_INFO
+ bstat->run_delay += get_cpu_run_delay(i);
+#endif
}
}
@@ -611,6 +625,9 @@ static struct bstat_entry {
BSTAT_ENTRY("nice_usec", ntime),
#ifdef CONFIG_SCHED_CORE
BSTAT_ENTRY("core_sched.force_idle_usec", forceidle_sum),
+#endif
+#ifdef CONFIG_SCHED_INFO
+ BSTAT_ENTRY("run_delay_usec", run_delay),
#endif
{ NULL } /* must be at end */
#undef BSTAT_ENTRY
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5d9143dd0879..e6be57cdb54e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -243,6 +243,18 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
}
#endif
+#ifdef CONFIG_SCHED_INFO
+void account_run_delay_time(struct task_struct *p, u64 delta)
+{
+ cgroup_account_cputime_field(p, CPUTIME_RUN_DELAY, delta);
+}
+
+u64 get_cpu_run_delay(int cpu)
+{
+ return cpu_rq(cpu)->rq_sched_info.run_delay;
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 6ade91bce63e..b21a2c4b9c54 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -249,6 +249,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
+ account_run_delay_time(t, delta);
rq_sched_info_dequeue(rq, delta);
}
@@ -271,6 +272,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
+ account_run_delay_time(t, delta);
rq_sched_info_arrive(rq, delta);
}
--
2.37.3
Powered by blists - more mailing lists