linux-kernel - [PATCH-cgroup v5] cgroup: Show # of subsystem CSSes in cgroup.stat

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240711133745.2376401-1-longman@redhat.com>
Date: Thu, 11 Jul 2024 09:37:45 -0400
From: Waiman Long <longman@...hat.com>
To: Tejun Heo <tj@...nel.org>,
	Zefan Li <lizefan.x@...edance.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Michal Koutný <mkoutny@...e.com>,
	Jonathan Corbet <corbet@....net>
Cc: cgroups@...r.kernel.org,
	linux-doc@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Kamalesh Babulal <kamalesh.babulal@...cle.com>,
	Roman Gushchin <roman.gushchin@...ux.dev>,
	Waiman Long <longman@...hat.com>
Subject: [PATCH-cgroup v5] cgroup: Show # of subsystem CSSes in cgroup.stat

Cgroup subsystem state (CSS) is an abstraction in the cgroup layer to
help manage different structures in various cgroup subsystems by being
an embedded element inside a larger structure like cpuset or mem_cgroup.

The /proc/cgroups file shows the number of cgroups for each of the
subsystems.  With cgroup v1, the number of CSSes is the same as the
number of cgroups.  That is not the case anymore with cgroup v2. The
/proc/cgroups file cannot show the actual number of CSSes for the
subsystems that are bound to cgroup v2.

So if a v2 cgroup subsystem is leaking cgroups (usually memory cgroup),
we can't tell by looking at /proc/cgroups which cgroup subsystems may
be responsible.

As cgroup v2 had deprecated the use of /proc/cgroups, the hierarchical
cgroup.stat file is now being extended to show the number of live and
dying CSSes associated with all the non-inhibited cgroup subsystems
that have been bound to cgroup v2 as long as it is not zero.  The number
includes CSSes in the current cgroup as well as in all the descendants
underneath it.  This will help us pinpoint which subsystems are
responsible for the increasing number of dying (nr_dying_descendants)
cgroups.

The cgroup-v2.rst file is updated to discuss this new behavior.

With this patch applied, a sample output from root cgroup.stat file
was shown below.

	nr_descendants 55
	nr_dying_descendants 35
	nr_subsys_cpuset 1
	nr_subsys_cpu 40
	nr_subsys_io 40
	nr_subsys_memory 55
	nr_dying_subsys_memory 35
	nr_subsys_perf_event 56
	nr_subsys_hugetlb 1
	nr_subsys_pids 55
	nr_subsys_rdma 1
	nr_subsys_misc 1

Another sample output from system.slice/cgroup.stat was:

	nr_descendants 32
	nr_dying_descendants 33
	nr_subsys_cpu 30
	nr_subsys_io 30
	nr_subsys_memory 32
	nr_dying_subsys_memory 33
	nr_subsys_perf_event 33
	nr_subsys_pids 32

Signed-off-by: Waiman Long <longman@...hat.com>
Acked-by: Roman Gushchin <roman.gushchin@...ux.dev>
---
 Documentation/admin-guide/cgroup-v2.rst | 14 ++++++--
 include/linux/cgroup-defs.h             |  7 ++++
 kernel/cgroup/cgroup.c                  | 47 ++++++++++++++++++++++++-
 3 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 52763d6b2919..356cd430c888 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -981,6 +981,16 @@ All cgroup core files are prefixed with "cgroup."
 		A dying cgroup can consume system resources not exceeding
 		limits, which were active at the moment of cgroup deletion.
 
+	  nr_subsys_<cgroup_subsys>
+		Total number of live cgroups associated with that cgroup
+		subsystem (e.g. memory) at and beneath the current
+		cgroup.  An entry will only be shown if it is not zero.
+
+	  nr_dying_subsys_<cgroup_subsys>
+		Total number of dying cgroups associated with that cgroup
+		subsystem (e.g. memory) beneath the current cgroup.
+		An entry will only be shown if it is not zero.
+
   cgroup.freeze
 	A read-write single value file which exists on non-root cgroups.
 	Allowed values are "0" and "1". The default is "0".
@@ -2930,8 +2940,8 @@ Deprecated v1 Core Features
 
 - "cgroup.clone_children" is removed.
 
-- /proc/cgroups is meaningless for v2.  Use "cgroup.controllers" file
-  at the root instead.
+- /proc/cgroups is meaningless for v2.  Use "cgroup.controllers" or
+  "cgroup.stat" files at the root instead.
 
 
 Issues with v1 and Rationales for v2
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index b36690ca0d3f..62de18874508 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -210,6 +210,13 @@ struct cgroup_subsys_state {
 	 * fields of the containing structure.
 	 */
 	struct cgroup_subsys_state *parent;
+
+	/*
+	 * Keep track of total numbers of visible and dying descendant CSSes.
+	 * Protected by cgroup_mutex.
+	 */
+	int nr_descendants;
+	int nr_dying_descendants;
 };
 
 /*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c8e4b62b436a..a59e869e8f39 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3669,12 +3669,36 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 static int cgroup_stat_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgroup = seq_css(seq)->cgroup;
+	struct cgroup_subsys_state *css;
+	int ssid;
 
 	seq_printf(seq, "nr_descendants %d\n",
 		   cgroup->nr_descendants);
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
 
+	/*
+	 * Show the number of live and dying csses associated with each of
+	 * non-inhibited cgroup subsystems bound to cgroup v2 if non-zero.
+	 *
+	 * Without proper lock protection, racing is possible. So the
+	 * numbers may not be consistent when that happens.
+	 */
+	rcu_read_lock();
+	for_each_css(css, ssid, cgroup) {
+		if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+		    (cgroup_subsys[ssid]->root !=  &cgrp_dfl_root))
+			continue;
+
+		seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+			   css->nr_descendants + 1);
+		/* Current css is online */
+		if (css->nr_dying_descendants)
+			seq_printf(seq, "nr_dying_subsys_%s %d\n",
+				   cgroup_subsys[ssid]->name,
+				   css->nr_dying_descendants);
+	}
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -5424,6 +5448,8 @@ static void css_release_work_fn(struct work_struct *work)
 	list_del_rcu(&css->sibling);
 
 	if (ss) {
+		struct cgroup_subsys_state *parent_css;
+
 		/* css release path */
 		if (!list_empty(&css->rstat_css_node)) {
 			cgroup_rstat_flush(cgrp);
@@ -5433,6 +5459,13 @@ static void css_release_work_fn(struct work_struct *work)
 		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
 		if (ss->css_released)
 			ss->css_released(css);
+
+		WARN_ON_ONCE(css->nr_descendants || css->nr_dying_descendants);
+		parent_css = css->parent;
+		while (parent_css) {
+			parent_css->nr_dying_descendants--;
+			parent_css = parent_css->parent;
+		}
 	} else {
 		struct cgroup *tcgrp;
 
@@ -5517,8 +5550,11 @@ static int online_css(struct cgroup_subsys_state *css)
 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
 
 		atomic_inc(&css->online_cnt);
-		if (css->parent)
+		if (css->parent) {
 			atomic_inc(&css->parent->online_cnt);
+			while ((css = css->parent))
+				css->nr_descendants++;
+		}
 	}
 	return ret;
 }
@@ -5540,6 +5576,15 @@ static void offline_css(struct cgroup_subsys_state *css)
 	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
 
 	wake_up_all(&css->cgroup->offline_waitq);
+
+	/*
+	 * Parent css cannot be freed until after the freeing of child css,
+	 * see css_free_rwork_fn().
+	 */
+	while ((css = css->parent)) {
+		css->nr_descendants--;
+		css->nr_dying_descendants++;
+	}
 }
 
 /**
-- 
2.39.3