[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220107234138.1765668-1-joshdon@google.com>
Date: Fri, 7 Jan 2022 15:41:37 -0800
From: Josh Don <joshdon@...gle.com>
To: Tejun Heo <tj@...nel.org>, Zefan Li <lizefan.x@...edance.com>,
Johannes Weiner <hannes@...xchg.org>,
Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>,
cgroups@...r.kernel.org, linux-kernel@...r.kernel.org,
Josh Don <joshdon@...gle.com>
Subject: [PATCH 1/2] cgroup: add cpu.stat_percpu
cpu.stat displays global metrics, such as cgroup usage. It would also be
useful to be able to break these down per-cpu; to that end, this patch
adds a new interface, 'cpu.stat_percpu', to display the per-cpu values
of these stats.
Each line of the output corresponds to a particular metric. The format
of each line is the name of the metric, followed by space-delimited
per-cpu values. The reason for this approach (vs. having each line
correspond to a particular cpu) is to make it easier to display extra
subsystem-specific per-cpu fields.
Signed-off-by: Josh Don <joshdon@...gle.com>
---
include/linux/cgroup-defs.h | 5 +
kernel/cgroup/cgroup-internal.h | 1 +
kernel/cgroup/cgroup.c | 10 ++
kernel/cgroup/rstat.c | 159 ++++++++++++++++++++++++++++----
4 files changed, 155 insertions(+), 20 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index db2e147e069f..7778a011f457 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -461,6 +461,11 @@ struct cgroup {
struct cgroup_base_stat bstat;
struct prev_cputime prev_cputime; /* for printing out cputime */
+ /* Per-cpu basic resource statistics. These are NULL on root. */
+ struct cgroup_base_stat __percpu *bstat_cpu;
+ struct cgroup_base_stat __percpu *last_bstat_cpu;
+ struct prev_cputime __percpu *prev_cputime_cpu;
+
/*
* list of pidlists, up to two for each namespace (one for procs, one
* for tasks); created on demand.
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..07e932c4f875 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -254,6 +254,7 @@ int cgroup_rstat_init(struct cgroup *cgrp);
void cgroup_rstat_exit(struct cgroup *cgrp);
void cgroup_rstat_boot(void);
void cgroup_base_stat_cputime_show(struct seq_file *seq);
+void cgroup_base_stat_percpu_cputime_show(struct seq_file *seq);
/*
* namespace.c
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 919194de39c8..4f5ddce529eb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3604,6 +3604,12 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
return ret;
}
+static int cpu_stat_percpu_show(struct seq_file *seq, void *v)
+{
+ cgroup_base_stat_percpu_cputime_show(seq);
+ return 0;
+}
+
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
@@ -5014,6 +5020,10 @@ static struct cftype cgroup_base_files[] = {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ {
+ .name = "cpu.stat_percpu",
+ .seq_show = cpu_stat_percpu_show,
+ },
#ifdef CONFIG_PSI
{
.name = "io.pressure",
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 1486768f2318..1af37333e5bf 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -253,7 +253,19 @@ int cgroup_rstat_init(struct cgroup *cgrp)
if (!cgrp->rstat_cpu) {
cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
if (!cgrp->rstat_cpu)
- return -ENOMEM;
+ goto error_nomem;
+
+ cgrp->last_bstat_cpu = alloc_percpu(struct cgroup_base_stat);
+ if (!cgrp->last_bstat_cpu)
+ goto error_nomem;
+
+ cgrp->bstat_cpu = alloc_percpu(struct cgroup_base_stat);
+ if (!cgrp->bstat_cpu)
+ goto error_nomem;
+
+ cgrp->prev_cputime_cpu = alloc_percpu(struct prev_cputime);
+ if (!cgrp->prev_cputime_cpu)
+ goto error_nomem;
}
/* ->updated_children list is self terminated */
@@ -265,6 +277,21 @@ int cgroup_rstat_init(struct cgroup *cgrp)
}
return 0;
+
+error_nomem:
+ free_percpu(cgrp->rstat_cpu);
+ cgrp->rstat_cpu = NULL;
+
+ free_percpu(cgrp->last_bstat_cpu);
+ cgrp->last_bstat_cpu = NULL;
+
+ free_percpu(cgrp->bstat_cpu);
+ cgrp->bstat_cpu = NULL;
+
+ free_percpu(cgrp->prev_cputime_cpu);
+ cgrp->prev_cputime_cpu = NULL;
+
+ return -ENOMEM;
}
void cgroup_rstat_exit(struct cgroup *cgrp)
@@ -284,6 +311,12 @@ void cgroup_rstat_exit(struct cgroup *cgrp)
free_percpu(cgrp->rstat_cpu);
cgrp->rstat_cpu = NULL;
+ free_percpu(cgrp->last_bstat_cpu);
+ cgrp->last_bstat_cpu = NULL;
+ free_percpu(cgrp->bstat_cpu);
+ cgrp->bstat_cpu = NULL;
+ free_percpu(cgrp->prev_cputime_cpu);
+ cgrp->prev_cputime_cpu = NULL;
}
void __init cgroup_rstat_boot(void)
@@ -319,22 +352,29 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat *bstat_cpu, *last_bstat_cpu;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
if (!parent)
return;
+ /* these are not present on root */
+ bstat_cpu = per_cpu_ptr(cgrp->bstat_cpu, cpu);
+ last_bstat_cpu = per_cpu_ptr(cgrp->last_bstat_cpu, cpu);
+
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
cur.cputime = rstatc->bstat.cputime;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+
/* propagate percpu delta to global */
delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
+ cgroup_base_stat_add(bstat_cpu, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
/* propagate global delta to parent (unless that's root) */
@@ -343,6 +383,11 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
+
+ delta = *bstat_cpu;
+ cgroup_base_stat_sub(&delta, last_bstat_cpu);
+ cgroup_base_stat_add(per_cpu_ptr(parent->bstat_cpu, cpu), &delta);
+ cgroup_base_stat_add(last_bstat_cpu, &delta);
}
}
@@ -400,6 +445,30 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
+/* See root_cgroup_cputime. Note that this does not first reset cputime. */
+static void root_cgroup_add_cputime_cpu(struct task_cputime *cputime, int cpu)
+{
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u64 user = 0;
+ u64 sys = 0;
+
+ kcpustat_cpu_fetch(&kcpustat, cpu);
+
+ user += cpustat[CPUTIME_USER];
+ user += cpustat[CPUTIME_NICE];
+ cputime->utime += user;
+
+ sys += cpustat[CPUTIME_SYSTEM];
+ sys += cpustat[CPUTIME_IRQ];
+ sys += cpustat[CPUTIME_SOFTIRQ];
+ cputime->stime += sys;
+
+ cputime->sum_exec_runtime += user;
+ cputime->sum_exec_runtime += sys;
+ cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+}
+
/*
* compute the cputime for the root cgroup by getting the per cpu data
* at a global level, then categorizing the fields in a manner consistent
@@ -414,25 +483,7 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->utime = 0;
cputime->sum_exec_runtime = 0;
for_each_possible_cpu(i) {
- struct kernel_cpustat kcpustat;
- u64 *cpustat = kcpustat.cpustat;
- u64 user = 0;
- u64 sys = 0;
-
- kcpustat_cpu_fetch(&kcpustat, i);
-
- user += cpustat[CPUTIME_USER];
- user += cpustat[CPUTIME_NICE];
- cputime->utime += user;
-
- sys += cpustat[CPUTIME_SYSTEM];
- sys += cpustat[CPUTIME_IRQ];
- sys += cpustat[CPUTIME_SOFTIRQ];
- cputime->stime += sys;
-
- cputime->sum_exec_runtime += user;
- cputime->sum_exec_runtime += sys;
- cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+ root_cgroup_add_cputime_cpu(cputime, i);
}
}
@@ -464,3 +515,71 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
"system_usec %llu\n",
usage, utime, stime);
}
+
+void cgroup_base_stat_percpu_cputime_show(struct seq_file *seq)
+{
+ static DEFINE_MUTEX(mutex);
+ static DEFINE_PER_CPU(struct cgroup_base_stat, cached_percpu_stats);
+ struct cgroup_base_stat *cached_bstat;
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ u64 val;
+ int cpu;
+
+ /* protects cached_percpu_stats */
+ mutex_lock(&mutex);
+
+ if (cgroup_parent(cgrp)) {
+ struct cgroup_base_stat *bstat_cpu;
+
+ cgroup_rstat_flush_hold(cgrp);
+
+ for_each_possible_cpu(cpu) {
+ bstat_cpu = per_cpu_ptr(cgrp->bstat_cpu, cpu);
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+
+ cached_bstat->cputime.sum_exec_runtime =
+ bstat_cpu->cputime.sum_exec_runtime;
+ cputime_adjust(&bstat_cpu->cputime,
+ per_cpu_ptr(cgrp->prev_cputime_cpu, cpu),
+ &cached_bstat->cputime.utime,
+ &cached_bstat->cputime.stime);
+ }
+
+ cgroup_rstat_flush_release();
+ } else {
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ memset(cached_bstat, 0, sizeof(*cached_bstat));
+ root_cgroup_add_cputime_cpu(&cached_bstat->cputime, cpu);
+ }
+ }
+
+ seq_puts(seq, "usage_usec");
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ val = cached_bstat->cputime.sum_exec_runtime;
+ do_div(val, NSEC_PER_USEC);
+ seq_printf(seq, " %llu", val);
+ }
+ seq_puts(seq, "\n");
+
+ seq_puts(seq, "user_usec");
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ val = cached_bstat->cputime.utime;
+ do_div(val, NSEC_PER_USEC);
+ seq_printf(seq, " %llu", val);
+ }
+ seq_puts(seq, "\n");
+
+ seq_puts(seq, "system_usec");
+ for_each_possible_cpu(cpu) {
+ cached_bstat = per_cpu_ptr(&cached_percpu_stats, cpu);
+ val = cached_bstat->cputime.stime;
+ do_div(val, NSEC_PER_USEC);
+ seq_printf(seq, " %llu", val);
+ }
+ seq_puts(seq, "\n");
+
+ mutex_unlock(&mutex);
+}
--
2.34.1.575.g55b058a8bb-goog
Powered by blists - more mailing lists