[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20250209061322.15260-3-wuyun.abel@bytedance.com>
Date: Sun, 9 Feb 2025 14:13:12 +0800
From: Abel Wu <wuyun.abel@...edance.com>
To: Johannes Weiner <hannes@...xchg.org>,
Tejun Heo <tj@...nel.org>,
Michal Koutný <mkoutny@...e.com>,
Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Bitao Hu <yaoma@...ux.alibaba.com>,
Thomas Gleixner <tglx@...utronix.de>,
Yury Norov <yury.norov@...il.com>,
Abel Wu <wuyun.abel@...edance.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Chen Ridong <chenridong@...wei.com>
Cc: cgroups@...r.kernel.org (open list:CONTROL GROUP (CGROUP)),
linux-kernel@...r.kernel.org (open list)
Subject: [PATCH v4 2/2] cgroup/rstat: Add run_delay accounting for cgroups
The "some" field of the cpu.pressure indicator may lose insight into
how severely one cgroup is stalled on a certain cpu, because PSI tracks
stall time for each cpu through:
tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
which turns nr_delayed_tasks[cpu] into a boolean value. So together with
this cgroup-level run_delay accounting, the scheduling info of cgroups
will be better illustrated.
Currently the task- and cpu-level accounting is already tracked
through the following two holders respectively:
struct task_struct::sched_info if SCHED_INFO
struct rq::rq_sched_info if SCHEDSTATS
When extending this to cgroups, the minimal requirement would be:
root: relies on rq::rq_sched_info, hence SCHEDSTATS
non-root: relies on task's, hence SCHED_INFO
It might be too demanding to require both, while collecting data for
root cgroup from different holders according to different configs would
also be confusing and error-prone. In order to keep things simple, let
us rely on the cputime infrastructure to do the accounting as the other
cputimes do.
Only cgroup v2 is supported and CONFIG_SCHED_INFO is required.
Signed-off-by: Abel Wu <wuyun.abel@...edance.com>
---
include/linux/cgroup-defs.h | 3 +++
include/linux/kernel_stat.h | 7 +++++++
kernel/cgroup/rstat.c | 25 +++++++++++++++++++++++++
kernel/sched/cputime.c | 10 ++++++++++
kernel/sched/stats.h | 3 +++
5 files changed, 48 insertions(+)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..287366e60414 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -328,6 +328,9 @@ struct cgroup_base_stat {
u64 forceidle_sum;
#endif
u64 ntime;
+#ifdef CONFIG_SCHED_INFO
+ u64 run_delay;
+#endif
};
/*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b97ce2df376f..ddd59fea10ad 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -29,6 +29,9 @@ enum cpu_usage_stat {
CPUTIME_GUEST_NICE,
#ifdef CONFIG_SCHED_CORE
CPUTIME_FORCEIDLE,
+#endif
+#ifdef CONFIG_SCHED_INFO
+ CPUTIME_RUN_DELAY,
#endif
NR_STATS,
};
@@ -141,4 +144,8 @@ extern void account_idle_ticks(unsigned long ticks);
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif
+#ifdef CONFIG_SCHED_INFO
+extern void account_run_delay_time(struct task_struct *tsk, u64 delta);
+#endif
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c2784c317cdd..53984cdf7f9b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -445,6 +445,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
dst_bstat->ntime += src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay += src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -457,6 +460,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
dst_bstat->ntime -= src_bstat->ntime;
+#ifdef CONFIG_SCHED_INFO
+ dst_bstat->run_delay -= src_bstat->run_delay;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -551,6 +557,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_FORCEIDLE:
rstatc->bstat.forceidle_sum += delta_exec;
break;
+#endif
+#ifdef CONFIG_SCHED_INFO
+ case CPUTIME_RUN_DELAY:
+ rstatc->bstat.run_delay += delta_exec;
+ break;
#endif
default:
break;
@@ -596,6 +607,9 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
bstat->ntime += cpustat[CPUTIME_NICE];
+#ifdef CONFIG_SCHED_INFO
+ bstat->run_delay += cpustat[CPUTIME_RUN_DELAY];
+#endif
}
}
@@ -610,6 +624,16 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
#endif
}
+static void cgroup_run_delay_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_INFO
+ u64 run_delay = bstat->run_delay;
+
+ do_div(run_delay, NSEC_PER_USEC);
+ seq_printf(seq, "run_delay_usec %llu\n", run_delay);
+#endif
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
@@ -640,6 +664,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
bstat.ntime);
cgroup_force_idle_show(seq, &bstat);
+ cgroup_run_delay_show(seq, &bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5d9143dd0879..42af602c10a6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -243,6 +243,16 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
}
#endif
+#ifdef CONFIG_SCHED_INFO
+/*
+ * Account for run_delay time spent waiting in rq.
+ */
+void account_run_delay_time(struct task_struct *p, u64 delta)
+{
+ task_group_account_field(p, CPUTIME_RUN_DELAY, delta);
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 19cdbe96f93d..fdfd04a89b05 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -252,7 +252,9 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
t->sched_info.max_run_delay = delta;
if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
t->sched_info.min_run_delay = delta;
+
rq_sched_info_dequeue(rq, delta);
+ account_run_delay_time(t, delta);
}
/*
@@ -279,6 +281,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.min_run_delay = delta;
rq_sched_info_arrive(rq, delta);
+ account_run_delay_time(t, delta);
}
/*
--
2.37.3
Powered by blists - more mailing lists