Message-ID: <20250429061211.1295443-4-shakeel.butt@linux.dev>
Date: Mon, 28 Apr 2025 23:12:09 -0700
From: Shakeel Butt <shakeel.butt@...ux.dev>
To: Tejun Heo <tj@...nel.org>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Alexei Starovoitov <ast@...nel.org>
Cc: Johannes Weiner <hannes@...xchg.org>,
	Michal Hocko <mhocko@...nel.org>,
	Roman Gushchin <roman.gushchin@...ux.dev>,
	Muchun Song <muchun.song@...ux.dev>,
	Yosry Ahmed <yosry.ahmed@...ux.dev>,
	Michal Koutný <mkoutny@...e.com>,
	Vlastimil Babka <vbabka@...e.cz>,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
	JP Kobryn <inwardvessel@...il.com>,
	bpf@...r.kernel.org,
	linux-mm@...ck.org,
	cgroups@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Meta kernel team <kernel-team@...a.com>
Subject: [RFC PATCH 3/3] cgroup: make css_rstat_updated nmi safe

To make css_rstat_updated() safe to call from nmi context, it must not
spin on locks and instead has to trylock the per-cpu per-ss raw
spinlock. This patch implements a backlog mechanism to handle the case
where that trylock fails.
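
In code form, the update side becomes trylock-with-fallback. The snippet
below is an annotated excerpt of the css_rstat_updated() fast path added
by the diff further down (illustration only, not standalone-buildable
code):

	/* nmi context: never spin on the per-cpu per-ss raw spinlock */
	if (!_css_rstat_cpu_trylock(css, cpu, &flags)) {
		/*
		 * The lock is contended, possibly held by the very context
		 * this nmi interrupted. Defer the update by recording @css
		 * on the per-cpu lockless backlog instead of spinning.
		 */
		css_add_to_backlog(css, cpu);
		return;
	}

	__css_rstat_updated(css, cpu);
	_css_rstat_cpu_unlock(css, cpu, flags, true);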

Each subsystem provides a per-cpu lockless list on which the kernel
stores the css passed to css_rstat_updated() when the trylock fails.
These lockless lists serve as the backlog. On the cgroup stats flushing
path, the kernel first processes all the per-cpu lockless backlog lists
of the given ss and only then proceeds to flush the updated stat trees.
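
On the flush side, the drain is a simple pop-and-replay loop. The sketch
below mirrors css_process_backlog() from the diff, with comments added;
llist_add_iff_not_on_list(), llist_del_first_init() and the
rstatc->owner / rstatc->lnode fields are assumed to come from the
earlier patches in this series:

	/* drain the per-cpu backlog of @ss before flushing its trees */
	static void css_process_backlog(struct cgroup_subsys *ss, int cpu)
	{
		struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
		struct llist_node *lnode;

		/* pop each deferred css and replay the missed update */
		while ((lnode = llist_del_first_init(lhead))) {
			struct css_rstat_cpu *rstatc;

			rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
			__css_rstat_updated(rstatc->owner, cpu);
		}
	}

css_rstat_updated_list() invokes this right after taking the per-cpu
lock, so every deferred update lands on the updated trees before they
are walked.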

With css_rstat_updated() being nmi safe, the memcg stats can and will be
converted to be nmi safe as well, enabling nmi-safe memory charging.

Signed-off-by: Shakeel Butt <shakeel.butt@...ux.dev>
---
 kernel/cgroup/rstat.c | 99 +++++++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 23 deletions(-)

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d3092b4c85d7..ac533e46afa9 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@
 
 static DEFINE_SPINLOCK(rstat_base_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
@@ -42,6 +43,13 @@ static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
 	return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
 }
 
+static struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+	if (ss)
+		return per_cpu_ptr(ss->lhead, cpu);
+	return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
 /*
  * Helper functions for rstat per CPU locks.
  *
@@ -86,6 +94,21 @@ unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
 	return flags;
 }
 
+static __always_inline
+bool _css_rstat_cpu_trylock(struct cgroup_subsys_state *css, int cpu,
+			    unsigned long *flags)
+{
+	struct cgroup *cgrp = css->cgroup;
+	raw_spinlock_t *cpu_lock;
+	bool contended;
+
+	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
+	contended = !raw_spin_trylock_irqsave(cpu_lock, *flags);
+	if (contended)
+		trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+	return !contended;
+}
+
 static __always_inline
 void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
 		unsigned long flags, const bool fast_path)
@@ -102,32 +125,16 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
 	raw_spin_unlock_irqrestore(cpu_lock, flags);
 }
 
-/**
- * css_rstat_updated - keep track of updated rstat_cpu
- * @css: target cgroup subsystem state
- * @cpu: cpu on which rstat_cpu was updated
- *
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
- */
-__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+static void css_add_to_backlog(struct cgroup_subsys_state *css, int cpu)
 {
-	unsigned long flags;
-
-	/*
-	 * Speculative already-on-list test. This may race leading to
-	 * temporary inaccuracies, which is fine.
-	 *
-	 * Because @parent's updated_children is terminated with @parent
-	 * instead of NULL, we can tell whether @css is on the list by
-	 * testing the next pointer for NULL.
-	 */
-	if (data_race(css_rstat_cpu(css, cpu)->updated_next))
-		return;
+	struct llist_head *lhead = ss_lhead_cpu(css->ss, cpu);
+	struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
 
-	flags = _css_rstat_cpu_lock(css, cpu, true);
+	llist_add_iff_not_on_list(&rstatc->lnode, lhead);
+}
 
+static void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
 	/* put @css and all ancestors on the corresponding updated lists */
 	while (true) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -153,6 +160,51 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 
 		css = parent;
 	}
+}
+
+static void css_process_backlog(struct cgroup_subsys *ss, int cpu)
+{
+	struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+	struct llist_node *lnode;
+
+	while ((lnode = llist_del_first_init(lhead))) {
+		struct css_rstat_cpu *rstatc;
+
+		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+		__css_rstat_updated(rstatc->owner, cpu);
+	}
+}
+
+/**
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * css_rstat_cpu definition for details.
+ */
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
+{
+	unsigned long flags;
+
+	/*
+	 * Speculative already-on-list test. This may race leading to
+	 * temporary inaccuracies, which is fine.
+	 *
+	 * Because @parent's updated_children is terminated with @parent
+	 * instead of NULL, we can tell whether @css is on the list by
+	 * testing the next pointer for NULL.
+	 */
+	if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+		return;
+
+	if (!_css_rstat_cpu_trylock(css, cpu, &flags)) {
+		css_add_to_backlog(css, cpu);
+		return;
+	}
+
+	__css_rstat_updated(css, cpu);
 
 	_css_rstat_cpu_unlock(css, cpu, flags, true);
 }
@@ -255,6 +307,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
 
 	flags = _css_rstat_cpu_lock(root, cpu, false);
 
+	css_process_backlog(root->ss, cpu);
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
 		goto unlock_ret;
-- 
2.47.1

