linux-kernel - [PATCH] sched/core: add forced idle accounting for cgroups

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220513005427.2507335-1-joshdon@google.com>
Date:   Thu, 12 May 2022 17:54:27 -0700
From:   Josh Don <joshdon@...gle.com>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Valentin Schneider <vschneid@...hat.com>,
        linux-kernel@...r.kernel.org,
        Cruz Zhao <CruzZhao@...ux.alibaba.com>,
        Tejun Heo <tj@...nel.org>, Josh Don <joshdon@...gle.com>
Subject: [PATCH] sched/core: add forced idle accounting for cgroups

Commit 4feee7d1260 previously added per-task forced idle accounting.
This patch extends that accounting to cgroups as well.

rstat is used for cgroup accounting, except for the root, which uses
kcpustat in order to bypass the need for doing an rstat flush when
reading root stats.

Only cgroup v2 is supported. Similar to the task accounting, the cgroup
accounting requires that schedstats is enabled.

Signed-off-by: Josh Don <joshdon@...gle.com>
---
 include/linux/kernel_stat.h |  1 +
 kernel/sched/core.c         | 15 ++++++++-
 kernel/sched/core_sched.c   | 62 +++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h        | 18 +++++++++++
 4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 69ae6b278464..2e9b3c7d2f18 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,6 +28,7 @@ enum cpu_usage_stat {
 	CPUTIME_STEAL,
 	CPUTIME_GUEST,
 	CPUTIME_GUEST_NICE,
+	CPUTIME_FORCEIDLE,
 	NR_STATS,
 };
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 48cfad152b86..a29cb4029818 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10828,12 +10828,18 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* Terminate */
 };
 
+static void cpu_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	sched_core_rstat_flush(css_tg(css), cpu);
+}
+
 static int cpu_extra_stat_show(struct seq_file *sf,
 			       struct cgroup_subsys_state *css)
 {
+	struct task_group __maybe_unused *tg = css_tg(css);
+
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
-		struct task_group *tg = css_tg(css);
 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 		u64 throttled_usec, burst_usec;
 
@@ -10851,6 +10857,12 @@ static int cpu_extra_stat_show(struct seq_file *sf,
 			   throttled_usec, cfs_b->nr_burst, burst_usec);
 	}
 #endif
+
+#ifdef CONFIG_SCHED_CORE
+	/* already updated stats via rstat flush */
+	seq_printf(sf, "forceidle_usec %llu\n",
+			sched_core_forceidle_sum(tg) / NSEC_PER_USEC);
+#endif
 	return 0;
 }
 
@@ -11031,6 +11043,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_online	= cpu_cgroup_css_online,
 	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
+	.css_rstat_flush = cpu_cgroup_css_rstat_flush,
 	.css_extra_stat_show = cpu_extra_stat_show,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 38a2cec21014..ccfeef6542dc 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -277,7 +277,16 @@ void __sched_core_account_forceidle(struct rq *rq)
 		if (p == rq_i->idle)
 			continue;
 
+		/* thread accounting */
 		__schedstat_add(p->stats.core_forceidle_sum, delta);
+
+		/* root accounting */
+		kcpustat_cpu(i).cpustat[CPUTIME_FORCEIDLE] += delta;
+
+		/* cgroup accounting */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		task_group(p)->cfs_rq[i]->forceidle_sum += delta;
+#endif
 	}
 }
 
@@ -292,4 +301,57 @@ void __sched_core_tick(struct rq *rq)
 	__sched_core_account_forceidle(rq);
 }
 
+void sched_core_rstat_flush(struct task_group *tg, int cpu)
+{
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+	struct task_group *parent = tg->parent;
+	u64 delta, curr_sum;
+
+	/* root uses cpustat */
+	if (!parent)
+		return;
+
+	/*
+	 * Note: cgroup_rstat_lock protects cfs_rq->forceidle_sum_prev and
+	 * tg->{forceidle_sum, forceidle_sum_pending}.
+	 */
+
+	delta = tg->forceidle_sum_pending;
+	if (delta)
+		tg->forceidle_sum_pending = 0;
+
+	/* rq lock not held; value may change concurrently */
+	curr_sum = READ_ONCE(cfs_rq->forceidle_sum);
+	if (curr_sum != cfs_rq->forceidle_sum_prev) {
+		delta += curr_sum - cfs_rq->forceidle_sum_prev;
+		cfs_rq->forceidle_sum_prev = curr_sum;
+	}
+
+	if (!delta)
+		return;
+
+	tg->forceidle_sum += delta;
+	parent->forceidle_sum_pending += delta;
+}
+
+/* REQUIRES: If tg is not root, an rstat flush was recently done. */
+u64 sched_core_forceidle_sum(struct task_group *tg)
+{
+	if (!tg->parent) {
+		u64 sum = 0;
+		int i;
+
+		for_each_possible_cpu(i) {
+			struct kernel_cpustat kcpustat;
+
+			kcpustat_cpu_fetch(&kcpustat, i);
+			sum += kcpustat.cpustat[CPUTIME_FORCEIDLE];
+		}
+
+		return sum;
+	} else {
+		return tg->forceidle_sum;
+	}
+}
+
 #endif /* CONFIG_SCHEDSTATS */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f338c53ce42..36bef97b9e2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -425,6 +425,12 @@ struct task_group {
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_SCHED_CORE
+	/* used with rstat */
+	u64			forceidle_sum;
+	u64			forceidle_sum_pending;
+#endif
+
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -526,6 +532,10 @@ struct cfs_rq {
 #ifdef CONFIG_SCHED_CORE
 	unsigned int		forceidle_seq;
 	u64			min_vruntime_fi;
+
+	/* for accounting with rstat */
+	u64			forceidle_sum;
+	u64			forceidle_sum_prev;
 #endif
 
 #ifndef CONFIG_64BIT
@@ -1849,12 +1859,20 @@ static inline void sched_core_tick(struct rq *rq)
 		__sched_core_tick(rq);
 }
 
+extern void sched_core_rstat_flush(struct task_group *tg, int cpu);
+
+extern u64 sched_core_forceidle_sum(struct task_group *tg);
+
 #else
 
 static inline void sched_core_account_forceidle(struct rq *rq) {}
 
 static inline void sched_core_tick(struct rq *rq) {}
 
+static inline void sched_core_rstat_flush(struct task_group *tg, int cpu) {}
+
+static inline u64 sched_core_forceidle_sum(struct task_group *tg) { return 0; }
+
 #endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
 
 #ifdef CONFIG_CGROUP_SCHED
-- 
2.36.0.512.ge40c2bad7a-goog