Message-ID: <20250506153057.210213-1-max.kellermann@ionos.com>
Date: Tue, 6 May 2025 17:30:56 +0200
From: Max Kellermann <max.kellermann@...os.com>
To: tj@...nel.org,
hannes@...xchg.org,
mkoutny@...e.com,
cgroups@...r.kernel.org,
linux-kernel@...r.kernel.org
Cc: Max Kellermann <max.kellermann@...os.com>
Subject: [PATCH PoC] kernel/cgroup/pids: refactor "pids.forks" into "pids.stat"
Proof-of-concept patch, sent as a reply to
https://lore.kernel.org/cgroups/aBkqeM0DoXUHHdgq@slm.duckdns.org/ and
meant to be applied on top of
https://lore.kernel.org/lkml/20250502121930.4008251-1-max.kellermann@ionos.com/

This is quick'n'dirty, with a lot of code copied from mm/memcontrol.c
and adjusted.  I omitted the tables memcg_vm_event_stat and
memory_stats because I did not understand why they exist; instead I
simply use enum pids_stat_item for everything, with no lookup table
(only pids_stats_names, a simple array of C string pointers).
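
For reference, here is what I mean, as a minimal userspace sketch (the
static assert and the value 42 are only for illustration, they are not
part of the patch); the printf loop also mirrors the flat-keyed format
that the new pids.stat file emits:

#include <stdio.h>

/* the enum doubles as the index into the state arrays and the name table */
enum pids_stat_item {
	PIDS_FORKS,
	PIDS_NR_STAT,
};

static const char *const pids_stats_names[] = {
	"forks",
};

/* keep the name table in sync with the enum */
_Static_assert(sizeof(pids_stats_names) / sizeof(pids_stats_names[0]) ==
	       PIDS_NR_STAT, "pids_stats_names out of sync");

int main(void)
{
	long state[PIDS_NR_STAT] = { [PIDS_FORKS] = 42 };

	for (int i = 0; i < PIDS_NR_STAT; i++)
		printf("%s %ld\n", pids_stats_names[i], state[i]);

	return 0;	/* prints "forks 42" */
}

Adding another counter later only means adding an enum entry and a
string, which is all pids needs.
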
Signed-off-by: Max Kellermann <max.kellermann@...os.com>
---
kernel/cgroup/pids.c | 269 ++++++++++++++++++++++++++++++++++++++++---
1 file changed, 256 insertions(+), 13 deletions(-)
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index fb18741f85ba..9f09f1ebc986 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -32,12 +32,23 @@
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
+#include <linux/seq_buf.h>
+#include <linux/sizes.h>	/* for SZ_4K */
#include <linux/slab.h>
#include <linux/sched/task.h>
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
+/*
+ * Number of per-CPU stat updates to batch before propagating them
+ * (copied from memcontrol.c).  TODO: may need to be bigger on big
+ * machines, or sized dynamically based on the workload.
+ */
+#define PIDS_CHARGE_BATCH 64U
+
+#define SEQ_BUF_SIZE SZ_4K
+
enum pidcg_event {
/* Fork failed in subtree because this pids_cgroup limit was hit. */
PIDCG_MAX,
@@ -49,9 +60,6 @@ enum pidcg_event {
struct pids_cgroup {
struct cgroup_subsys_state css;
- /* the "pids.forks" counter */
- atomic64_t forks;
-
/*
* Use 64-bit types so that we can safely represent "max" as
* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
@@ -64,8 +72,55 @@ struct pids_cgroup {
struct cgroup_file events_file;
struct cgroup_file events_local_file;
+ /* pids.stat */
+ struct pids_cgroup_stats *stats;
+
atomic64_t events[NR_PIDCG_EVENTS];
atomic64_t events_local[NR_PIDCG_EVENTS];
+
+ struct pids_cgroup_stats_percpu __percpu *stats_percpu;
+};
+
+/* Cgroup-specific pids statistics; indices into the state arrays below */
+enum pids_stat_item {
+ PIDS_FORKS,
+ PIDS_NR_STAT,
+};
+
+struct pids_cgroup_stats_percpu {
+ /* Stats updates since the last flush */
+ unsigned int stats_updates;
+
+ /* Cached pointers for fast iteration in pids_rstat_updated() */
+ struct pids_cgroup_stats_percpu *parent;
+ struct pids_cgroup_stats *stats;
+
+ /* The above should fit a single cacheline for pids_rstat_updated() */
+
+ /* Local (CPU and cgroup) state */
+ long state[PIDS_NR_STAT];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[PIDS_NR_STAT];
+} ____cacheline_aligned;
+
+struct pids_cgroup_stats {
+ /* Aggregated (CPU and subtree) state */
+ long state[PIDS_NR_STAT];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[PIDS_NR_STAT];
+
+ /* Stats updates since the last flush */
+ atomic64_t stats_updates;
+};
+
+struct pids_stat {
+ const char *name;
+};
+
+static const char *const pids_stats_names[] = {
+ "forks",
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -78,22 +133,181 @@ static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
return css_pids(pids->css.parent);
}
+static void __pids_css_free(struct pids_cgroup *pids)
+{
+
+ free_percpu(pids->stats_percpu);
+ kfree(pids);
+}
+
static struct cgroup_subsys_state *
pids_css_alloc(struct cgroup_subsys_state *parent)
{
+ struct pids_cgroup *parent_pids = css_pids(parent);
struct pids_cgroup *pids;
+ struct pids_cgroup_stats_percpu *statc, *pstatc;
+ int cpu;
pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
if (!pids)
return ERR_PTR(-ENOMEM);
+ pids->stats = kzalloc(sizeof(struct pids_cgroup_stats),
+ GFP_KERNEL_ACCOUNT);
+ pids->stats_percpu = alloc_percpu_gfp(struct pids_cgroup_stats_percpu,
+ GFP_KERNEL_ACCOUNT);
+ if (!pids->stats || !pids->stats_percpu) {
+ __pids_css_free(pids);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (parent_pids)
+ pstatc = per_cpu_ptr(parent_pids->stats_percpu, cpu);
+ statc = per_cpu_ptr(pids->stats_percpu, cpu);
+ statc->parent = parent_pids ? pstatc : NULL;
+ statc->stats = pids->stats;
+ }
+
atomic64_set(&pids->limit, PIDS_MAX);
return &pids->css;
}
static void pids_css_free(struct cgroup_subsys_state *css)
{
- kfree(css_pids(css));
+ struct pids_cgroup *pids = css_pids(css);
+
+ __pids_css_free(pids);
+}
+
+struct aggregate_control {
+ /* pointer to the aggregated (CPU and subtree aggregated) counters */
+ long *aggregate;
+ /* pointer to the pending child counters during tree propagation */
+ long *pending;
+ /* pointer to the parent's pending counters, could be NULL */
+ long *ppending;
+ /* pointer to the percpu counters to be aggregated */
+ long *cstat;
+	/* pointer to the percpu counters of the last aggregation */
+ long *cstat_prev;
+ /* size of the above counters */
+ int size;
+};
+
+static void pids_cgroup_stat_aggregate(struct aggregate_control *ac)
+{
+ int i;
+ long delta, delta_cpu, v;
+
+ for (i = 0; i < ac->size; i++) {
+ /*
+ * Collect the aggregated propagation counts of groups
+ * below us. We're in a per-cpu loop here and this is
+ * a global counter, so the first cycle will get them.
+ */
+ delta = ac->pending[i];
+ if (delta)
+ ac->pending[i] = 0;
+
+ /* Add CPU changes on this level since the last flush */
+ delta_cpu = 0;
+ v = READ_ONCE(ac->cstat[i]);
+ if (v != ac->cstat_prev[i]) {
+ delta_cpu = v - ac->cstat_prev[i];
+ delta += delta_cpu;
+ ac->cstat_prev[i] = v;
+ }
+
+ if (delta) {
+ ac->aggregate[i] += delta;
+ if (ac->ppending)
+ ac->ppending[i] += delta;
+ }
+ }
+}
+
+static void pids_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct pids_cgroup *parent = parent_pids(pids);
+ struct pids_cgroup_stats_percpu *statc;
+ struct aggregate_control ac;
+
+ statc = per_cpu_ptr(pids->stats_percpu, cpu);
+
+ ac = (struct aggregate_control) {
+ .aggregate = pids->stats->state,
+ .pending = pids->stats->state_pending,
+ .ppending = parent ? parent->stats->state_pending : NULL,
+ .cstat = statc->state,
+ .cstat_prev = statc->state_prev,
+ .size = PIDS_NR_STAT,
+ };
+ pids_cgroup_stat_aggregate(&ac);
+
+ WRITE_ONCE(statc->stats_updates, 0);
+ /* We are in a per-cpu loop here, only do the atomic write once */
+ if (atomic64_read(&pids->stats->stats_updates))
+ atomic64_set(&pids->stats->stats_updates, 0);
+}
+
+static bool pids_stats_needs_flush(struct pids_cgroup_stats *stats)
+{
+ return atomic64_read(&stats->stats_updates) >
+ PIDS_CHARGE_BATCH * num_online_cpus();
+}
+
+static inline void pids_rstat_updated(struct pids_cgroup *pids, int val)
+{
+ struct pids_cgroup_stats_percpu *statc;
+ int cpu = smp_processor_id();
+ unsigned int stats_updates;
+
+ if (!val)
+ return;
+
+ cgroup_rstat_updated(pids->css.cgroup, cpu);
+ statc = this_cpu_ptr(pids->stats_percpu);
+ for (; statc; statc = statc->parent) {
+ stats_updates = READ_ONCE(statc->stats_updates) + abs(val);
+ WRITE_ONCE(statc->stats_updates, stats_updates);
+ if (stats_updates < PIDS_CHARGE_BATCH)
+ continue;
+
+ /*
+ * If @pids is already flush-able, increasing stats_updates is
+ * redundant. Avoid the overhead of the atomic update.
+ */
+ if (!pids_stats_needs_flush(statc->stats))
+ atomic64_add(stats_updates,
+ &statc->stats->stats_updates);
+ WRITE_ONCE(statc->stats_updates, 0);
+ }
+}
+
+/**
+ * __mod_pids_state - update cgroup pids statistics
+ * @pids: the pids cgroup
+ * @idx: the stat item, from enum pids_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+static void __mod_pids_state(struct pids_cgroup *pids, enum pids_stat_item idx,
+ int val)
+{
+	__this_cpu_add(pids->stats_percpu->state[idx], val);
+ pids_rstat_updated(pids, val);
+}
+
+/* irq-safe wrapper around __mod_pids_state() */
+static void mod_pids_state(struct pids_cgroup *pids,
+ enum pids_stat_item idx, int val)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_pids_state(pids, idx, val);
+ local_irq_restore(flags);
}
static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
@@ -150,7 +364,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
struct pids_cgroup *p;
for (p = pids; parent_pids(p); p = parent_pids(p)) {
- atomic64_add(num, &p->forks);
+ mod_pids_state(p, PIDS_FORKS, num);
int64_t new = atomic64_add_return(num, &p->counter);
pids_update_watermark(p, new);
@@ -172,10 +386,11 @@ static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup
struct pids_cgroup *p, *q;
for (p = pids; parent_pids(p); p = parent_pids(p)) {
- atomic64_add(num, &p->forks);
int64_t new = atomic64_add_return(num, &p->counter);
int64_t limit = atomic64_read(&p->limit);
+ mod_pids_state(p, PIDS_FORKS, num);
+
/*
* Since new is capped to the maximum number of pid_t, if
* p->limit is %PIDS_MAX then we know that this test will never
@@ -347,12 +562,40 @@ static int pids_max_show(struct seq_file *sf, void *v)
return 0;
}
-static s64 pids_forks_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static void pids_cgroup_flush_stats(struct pids_cgroup *pids)
{
- struct pids_cgroup *pids = css_pids(css);
+ if (!pids_stats_needs_flush(pids->stats))
+ return;
- return atomic64_read(&pids->forks);
+ cgroup_rstat_flush(pids->css.cgroup);
+}
+
+static void pids_stat_format(struct pids_cgroup *pids, struct seq_buf *s)
+{
+	unsigned int i;
+
+ pids_cgroup_flush_stats(pids);
+
+ for (i = 0; i < ARRAY_SIZE(pids_stats_names); i++) {
+ long x = READ_ONCE(pids->stats->state[i]);
+
+ seq_buf_printf(s, "%s %ld\n", pids_stats_names[i], x);
+ }
+}
+
+static int pids_stat_show(struct seq_file *seq, void *v)
+{
+ struct pids_cgroup *pids = css_pids(seq_css(seq));
+ char *buf = kmalloc(SEQ_BUF_SIZE, GFP_KERNEL);
+ struct seq_buf s;
+
+ if (!buf)
+ return -ENOMEM;
+ seq_buf_init(&s, buf, SEQ_BUF_SIZE);
+ pids_stat_format(pids, &s);
+ seq_puts(seq, buf);
+ kfree(buf);
+ return 0;
}
static s64 pids_current_read(struct cgroup_subsys_state *css,
@@ -418,9 +661,8 @@ static struct cftype pids_files[] = {
.read_s64 = pids_peak_read,
},
{
- .name = "forks",
- .read_s64 = pids_forks_read,
- .flags = CFTYPE_NOT_ON_ROOT,
+ .name = "stat",
+ .seq_show = pids_stat_show,
},
{
.name = "events",
@@ -467,6 +709,7 @@ static struct cftype pids_files_legacy[] = {
struct cgroup_subsys pids_cgrp_subsys = {
.css_alloc = pids_css_alloc,
.css_free = pids_css_free,
+ .css_rstat_flush = pids_css_rstat_flush,
.can_attach = pids_can_attach,
.cancel_attach = pids_cancel_attach,
.can_fork = pids_can_fork,
--
2.47.2