Message-ID: <20250429061211.1295443-4-shakeel.butt@linux.dev>
Date: Mon, 28 Apr 2025 23:12:09 -0700
From: Shakeel Butt <shakeel.butt@...ux.dev>
To: Tejun Heo <tj@...nel.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Alexei Starovoitov <ast@...nel.org>
Cc: Johannes Weiner <hannes@...xchg.org>,
Michal Hocko <mhocko@...nel.org>,
Roman Gushchin <roman.gushchin@...ux.dev>,
Muchun Song <muchun.song@...ux.dev>,
Yosry Ahmed <yosry.ahmed@...ux.dev>,
Michal Koutný <mkoutny@...e.com>,
Vlastimil Babka <vbabka@...e.cz>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
JP Kobryn <inwardvessel@...il.com>,
bpf@...r.kernel.org,
linux-mm@...ck.org,
cgroups@...r.kernel.org,
linux-kernel@...r.kernel.org,
Meta kernel team <kernel-team@...a.com>
Subject: [RFC PATCH 3/3] cgroup: make css_rstat_updated nmi safe
To run safely in NMI context, css_rstat_updated() cannot spin on locks
and instead has to trylock the per-cpu per-ss raw spinlock. This patch
implements a backlog mechanism to handle the case where that trylock
fails.
Each subsystem provides a per-cpu lockless list on which the kernel
stores the css given to css_rstat_updated() on trylock failure. These
lockless lists serve as the backlog. On the cgroup stats flushing code
path, the kernel first processes all the per-cpu lockless backlog lists
of the given ss and then proceeds to flush the per-cpu update trees.
With css_rstat_updated() being NMI safe, the memcg stats can and will
be converted to be NMI safe as well, to enable NMI-safe memory charging.
Signed-off-by: Shakeel Butt <shakeel.butt@...ux.dev>
---
kernel/cgroup/rstat.c | 99 +++++++++++++++++++++++++++++++++----------
1 file changed, 76 insertions(+), 23 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d3092b4c85d7..ac533e46afa9 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@
static DEFINE_SPINLOCK(rstat_base_lock);
static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -42,6 +43,13 @@ static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
}
+static struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
/*
* Helper functions for rstat per CPU locks.
*
@@ -86,6 +94,21 @@ unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
return flags;
}
+static __always_inline
+bool _css_rstat_cpu_trylock(struct cgroup_subsys_state *css, int cpu,
+ unsigned long *flags)
+{
+ struct cgroup *cgrp = css->cgroup;
+ raw_spinlock_t *cpu_lock;
+ bool contended;
+
+ cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
+ contended = !raw_spin_trylock_irqsave(cpu_lock, *flags);
+ if (contended)
+ trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+ return !contended;
+}
+
static __always_inline
void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
unsigned long flags, const bool fast_path)
@@ -102,32 +125,16 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
-/**
- * css_rstat_updated - keep track of updated rstat_cpu
- * @css: target cgroup subsystem state
- * @cpu: cpu on which rstat_cpu was updated
- *
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
- */
-__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+static void css_add_to_backlog(struct cgroup_subsys_state *css, int cpu)
{
- unsigned long flags;
-
- /*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
- *
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @css is on the list by
- * testing the next pointer for NULL.
- */
- if (data_race(css_rstat_cpu(css, cpu)->updated_next))
- return;
+ struct llist_head *lhead = ss_lhead_cpu(css->ss, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- flags = _css_rstat_cpu_lock(css, cpu, true);
+ llist_add_iff_not_on_list(&rstatc->lnode, lhead);
+}
+static void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
/* put @css and all ancestors on the corresponding updated lists */
while (true) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -153,6 +160,51 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
css = parent;
}
+}
+
+static void css_process_backlog(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_rstat_updated(rstatc->owner, cpu);
+ }
+}
+
+/**
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * css_rstat_cpu definition for details.
+ */
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
+ unsigned long flags;
+
+ /*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
+ * Because @parent's updated_children is terminated with @parent
+ * instead of NULL, we can tell whether @css is on the list by
+ * testing the next pointer for NULL.
+ */
+ if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+ return;
+
+ if (!_css_rstat_cpu_trylock(css, cpu, &flags)) {
+ css_add_to_backlog(css, cpu);
+ return;
+ }
+
+ __css_rstat_updated(css, cpu);
_css_rstat_cpu_unlock(css, cpu, flags, true);
}
@@ -255,6 +307,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
flags = _css_rstat_cpu_lock(root, cpu, false);
+ css_process_backlog(root->ss, cpu);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
goto unlock_ret;
--
2.47.1