linux-kernel - [RFC PATCH v3 8/9] cgroup/pids: Enforce pids.max on task migrations

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240405170548.15234-9-mkoutny@suse.com>
Date: Fri,  5 Apr 2024 19:05:47 +0200
From: Michal Koutný <mkoutny@...e.com>
To: cgroups@...r.kernel.org,
	linux-doc@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	linux-kselftest@...r.kernel.org
Cc: Tejun Heo <tj@...nel.org>,
	Zefan Li <lizefan.x@...edance.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Jonathan Corbet <corbet@....net>,
	Shuah Khan <shuah@...nel.org>
Subject: [RFC PATCH v3 8/9] cgroup/pids: Enforce pids.max on task migrations

The pids controller is designed with only forks in mind, which leads to
situations where the limit is apparently ineffective.
A manager daemon is in /src and it spawns tasks into /dst. The
administrator sets up a limit dst/pids.max while src/pids.max is
unlimited. The manager daemon can spawn more than dst/pids.max tasks
because they get into their target cgroup via migration (or
CLONE_INTO_CGROUP).

For this (migration) to work, both src and dst must be in the same
resource domain, so the manager daemon does not honor the limit, which is
under its control anyway, and no excessive resource consumption happens.

dst/pids.current > dst/pids.max may come as a surprise when the
spawning mechanism is opaque to the administrator of dst/pids.max.

Change the behavior of the pids controller to take into account limits of
the target cgroup upon migration (but only below the common ancestor of src
and dst; pids.current of the common ancestor and above is not affected by
migration, so deliberately ignore pre-existing pids.current > pids.max).

This change of behavior is hidden behind a cgroup2 mount option and
the default is unchanged, i.e. pids.max won't affect migrations.

Signed-off-by: Michal Koutný <mkoutny@...e.com>
---
 Documentation/admin-guide/cgroup-v2.rst |  8 +++++++-
 include/linux/cgroup-defs.h             |  7 ++++++-
 kernel/cgroup/cgroup.c                  | 16 +++++++++++++++-
 kernel/cgroup/pids.c                    |  8 ++++++--
 4 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 5d4c505cae06..d7e721aed584 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -239,6 +239,11 @@ cgroup v2 currently supports the following mount options.
           will not be tracked by the memory controller (even if cgroup
           v2 is remounted later on).
 
+  pids_miglimit
+        Apply pids.max limit also when migrating tasks between cgroups. Only
+        new destination limits are taken into account, i.e. if a subtree has
+        pids.current > pids.max, migration within that subtree is allowed.
+
 
 Organizing Processes and Threads
 --------------------------------
@@ -2204,7 +2209,8 @@ Organisational operations are not blocked by cgroup policies, so it is
 possible to have pids.current > pids.max.  This can be done by either
 setting the limit to be smaller than pids.current, or attaching enough
 processes to the cgroup such that pids.current is larger than
-pids.max.  However, it is not possible to violate a cgroup PID policy
+pids.max (unless the pids_miglimit mount option is given).
+However, it is not possible to violate a cgroup PID policy
 through fork() or clone(). These will return -EAGAIN if the creation
 of a new process would cause a cgroup policy to be violated.
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ea48c861cd36..a99db24b5496 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -119,7 +119,12 @@ enum {
 	/*
 	 * Enable hugetlb accounting for the memory controller.
 	 */
-	 CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+	CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+
+	/*
+	 * Enforce pids limit upon task migration
+	 */
+	CGRP_ROOT_PIDS_MIGRATION_LIMIT = (1 << 20),
 };
 
 /* cftype->flags */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a66c088c851c..9aa6428c84c1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1922,6 +1922,7 @@ enum cgroup2_param {
 	Opt_memory_localevents,
 	Opt_memory_recursiveprot,
 	Opt_memory_hugetlb_accounting,
+	Opt_pids_miglimit,
 	nr__cgroup2_params
 };
 
@@ -1931,6 +1932,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
 	fsparam_flag("memory_localevents",	Opt_memory_localevents),
 	fsparam_flag("memory_recursiveprot",	Opt_memory_recursiveprot),
 	fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+	fsparam_flag("pids_miglimit",           Opt_pids_miglimit),
 	{}
 };
 
@@ -1960,6 +1962,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
 	case Opt_memory_hugetlb_accounting:
 		ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
 		return 0;
+	case Opt_pids_miglimit:
+		ctx->flags |= CGRP_ROOT_PIDS_MIGRATION_LIMIT;
+		return 0;
 	}
 	return -EINVAL;
 }
@@ -1989,6 +1994,12 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
 			cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
 		else
 			cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+		if (root_flags & CGRP_ROOT_PIDS_MIGRATION_LIMIT)
+			cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_MIGRATION_LIMIT;
+		else
+			cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_MIGRATION_LIMIT;
+
 	}
 }
 
@@ -2004,6 +2015,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
 		seq_puts(seq, ",memory_recursiveprot");
 	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
 		seq_puts(seq, ",memory_hugetlb_accounting");
+	if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_MIGRATION_LIMIT)
+		seq_puts(seq, ",pids_miglimit");
 	return 0;
 }
 
@@ -7061,7 +7074,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
 			"favordynmods\n"
 			"memory_localevents\n"
 			"memory_recursiveprot\n"
-			"memory_hugetlb_accounting\n");
+			"memory_hugetlb_accounting\n"
+			"pids_miglimit\n");
 }
 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
 
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9df8a209a6e2..4683629b8168 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -217,6 +217,7 @@ static int pids_can_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 	struct cgroup_subsys_state *dst_css;
+	int err, ret = 0;
 
 	cgroup_taskset_for_each(task, dst_css, tset) {
 		struct pids_cgroup *pids = css_pids(dst_css);
@@ -231,10 +232,13 @@ static int pids_can_attach(struct cgroup_taskset *tset)
 		old_css = task_css(task, pids_cgrp_id);
 		old_pids = css_pids(old_css);
 
-		(void) pids_tranfer_charge(old_pids, pids, 1);
+		err = pids_tranfer_charge(old_pids, pids, 1);
+
+		if (!ret && (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_MIGRATION_LIMIT))
+			ret = err;
 	}
 
-	return 0;
+	return ret;
 }
 
 static void pids_cancel_attach(struct cgroup_taskset *tset)
-- 
2.44.0