linux-kernel - [PATCH 7/9] sched: Cgroup core-scheduling interface

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20210401133917.469929784@infradead.org>
Date:   Thu, 01 Apr 2021 15:10:19 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     joel@...lfernandes.org, chris.hyser@...cle.com, joshdon@...gle.com,
        mingo@...nel.org, vincent.guittot@...aro.org,
        valentin.schneider@....com, mgorman@...e.de
Cc:     linux-kernel@...r.kernel.org, peterz@...radead.org, tj@...nel.org,
        tglx@...utronix.de
Subject: [PATCH 7/9] sched: Cgroup core-scheduling interface

Implement a basic cgroup core-scheduling interface.

A new cpu.core_sched file is added which takes the values 0,1. When
set, the cgroup and all its descendants will be granted the same
cookie and thus allowed to share a core with each other, but not with
system tasks or tasks of other subtrees that might have another
cookie.

The file is hierarchical, and a subtree can again set it to 1, in
which case that subtree will get a different cookie and will no longer
share with the parent tree.

For each task, the nearest core_sched parent 'wins'.

Interaction with the prctl() interface is non-existent and left for a
future patch.

Notably, this patch somewhat abuses cgroup_mutex. By holding
cgroup_mutex over the write() operation, which sets the cookie, the
cookie is stable in any cgroup callback (that is called with
cgroup_mutex held). A future patch relies on ss->can_attach() and
ss->attach() being 'atomic', which is hard to do without cgroup_mutex.

Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
 kernel/sched/core.c  |  150 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |    7 ++
 2 files changed, 157 insertions(+)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5688,10 +5688,53 @@ static inline void sched_core_cpu_starti
 		}
 	}
 }
+
+void sched_core_cgroup_online(struct task_group *parent, struct task_group *tg)
+{
+	lockdep_assert_held(&cgroup_mutex);
+	/* Record in tg->core_parent which group's cookie @tg inherits, if any. */
+	if (parent->core_parent) {
+		WARN_ON_ONCE(parent->core_cookie); /* expected: inherit or own, not both */
+		WARN_ON_ONCE(!parent->core_parent->core_cookie);
+		tg->core_parent = parent->core_parent; /* same source as @parent */
+
+	} else if (parent->core_cookie) {
+		WARN_ON_ONCE(parent->core_parent);
+		tg->core_parent = parent; /* @parent itself owns the cookie */
+	}
+}
+
+void sched_core_cgroup_free(struct task_group *tg)
+{
+	sched_core_put_cookie(tg->core_cookie); /* put the cookie stored in @tg before it is freed */
+}
+
+unsigned long sched_core_cgroup_cookie(struct task_group *tg)
+{
+	unsigned long cookie = 0;
+
+	lockdep_assert_held(&cgroup_mutex);
+	/* @tg's own cookie wins; else the one of tg->core_parent; else 0. */
+	if (tg->core_cookie)
+		cookie = tg->core_cookie;
+	else if (tg->core_parent)
+		cookie = tg->core_parent->core_cookie;
+	/* NOTE(review): presumably the get takes a reference the caller owns -- confirm. */
+	return sched_core_get_cookie(cookie);
+}
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline void sched_core_cpu_starting(unsigned int cpu) {}
 
+static inline void sched_core_cgroup_free(struct task_group *tg) { } /* !SCHED_CORE: no-op */
+static inline void sched_core_cgroup_online(struct task_group *parent, struct task_group *tg) { }
+
+static inline unsigned long sched_core_cgroup_cookie(struct task_group *tg)
+{
+	return 0; /* !CONFIG_SCHED_CORE stub: always reports cookie 0 */
+}
+
 static struct task_struct *
 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -9310,6 +9353,7 @@ static void sched_free_group(struct task
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
+	sched_core_cgroup_free(tg);
 	kmem_cache_free(task_group_cache, tg);
 }
 
@@ -9353,6 +9397,8 @@ void sched_online_group(struct task_grou
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	online_fair_sched_group(tg);
+
+	sched_core_cgroup_online(parent, tg);
 }
 
 /* rcu callback to free various structures associated with a task group */
@@ -9414,6 +9460,7 @@ void sched_move_task(struct task_struct
 {
 	int queued, running, queue_flags =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+	unsigned long cookie;
 	struct rq_flags rf;
 	struct rq *rq;
 
@@ -9443,6 +9490,10 @@ void sched_move_task(struct task_struct
 	}
 
 	task_rq_unlock(rq, tsk, &rf);
+
+	cookie = sched_core_cgroup_cookie(tsk->sched_task_group);
+	cookie = sched_core_update_cookie(tsk, cookie);
+	sched_core_put_cookie(cookie);
 }
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -10050,6 +10101,89 @@ static u64 cpu_rt_period_read_uint(struc
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_SCHED_CORE
+static u64 cpu_sched_core_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return !!css_tg(css)->core_cookie; /* 1 only if this group has its own cookie; @cft unused */
+}
+
+static int cpu_sched_core_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+{
+	unsigned long cookie = 0, old_cookie = 0;
+	struct task_group *tg = css_tg(css);
+	struct cgroup_subsys_state *cssi;
+	struct task_group *parent = NULL, *old_src;
+	int ret = 0;
+	/* val == 1 gives @css's group its own cookie, 0 drops it; both propagate down. */
+	if (val > 1)
+		return -ERANGE;
+
+	if (!static_branch_likely(&sched_smt_present))
+		return -ENODEV;
+
+	mutex_lock(&cgroup_mutex);
+	if (!!val == !!tg->core_cookie)
+		goto unlock;
+	old_cookie = tg->core_cookie;
+	old_src = old_cookie ? tg : tg->core_parent; /* what @tg's followers point at now */
+	if (val) {
+		cookie = sched_core_alloc_cookie();
+		if (!cookie) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
+		WARN_ON_ONCE(old_cookie);
+
+	} else if (tg->parent) {
+		if (tg->parent->core_parent)
+			parent = tg->parent->core_parent;
+		else if (tg->parent->core_cookie)
+			parent = tg->parent;
+	}
+	/* A freshly allocated cookie and an inherited source are mutually exclusive. */
+	WARN_ON_ONCE(cookie && parent);
+
+	tg->core_cookie = sched_core_get_cookie(cookie);
+	tg->core_parent = parent;
+	/* From here on @parent/@cookie are what the followers of @tg must use. */
+	if (cookie)
+		parent = tg;
+	else if (parent)
+		cookie = sched_core_get_cookie(parent->core_cookie);
+
+	css_for_each_descendant_pre(cssi, css) {
+		struct task_group *tgi = css_tg(cssi);
+		struct css_task_iter it;
+		struct task_struct *p;
+
+		if (tgi != tg) {
+			if (tgi->core_cookie || (tgi->core_parent && tgi->core_parent != old_src))
+				continue; /* own cookie, or follows a nearer group than @tg */
+
+			tgi->core_parent = parent;
+			tgi->core_cookie = 0;
+		}
+		/* Give each task the new cookie; put what sched_core_update_cookie() returns. */
+		css_task_iter_start(cssi, 0, &it);
+		while ((p = css_task_iter_next(&it))) {
+			unsigned long p_cookie;
+
+			cookie = sched_core_get_cookie(cookie);
+			p_cookie = sched_core_update_cookie(p, cookie);
+			sched_core_put_cookie(p_cookie);
+		}
+		css_task_iter_end(&it);
+	}
+
+unlock:
+	mutex_unlock(&cgroup_mutex);
+
+	sched_core_put_cookie(cookie);
+	sched_core_put_cookie(old_cookie);
+	return ret;
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -10100,6 +10234,14 @@ static struct cftype cpu_legacy_files[]
 		.write = cpu_uclamp_max_write,
 	},
 #endif
+#ifdef CONFIG_SCHED_CORE
+	{
+		.name = "core_sched",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_sched_core_read_u64,
+		.write_u64 = cpu_sched_core_write_u64,
+	},
+#endif
 	{ }	/* Terminate */
 };
 
@@ -10281,6 +10423,14 @@ static struct cftype cpu_files[] = {
 		.write = cpu_uclamp_max_write,
 	},
 #endif
+#ifdef CONFIG_SCHED_CORE
+	{
+		.name = "core_sched",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_sched_core_read_u64,
+		.write_u64 = cpu_sched_core_write_u64,
+	},
+#endif
 	{ }	/* terminate */
 };
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -431,6 +431,10 @@ struct task_group {
 	struct uclamp_se	uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_SCHED_CORE
+	struct task_group	*core_parent;
+	unsigned long		core_cookie;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1130,6 +1134,9 @@ static inline bool is_migration_disabled
 
 struct sched_group;
 #ifdef CONFIG_SCHED_CORE
+
+extern struct mutex cgroup_mutex; // XXX
+
 DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
 static inline struct cpumask *sched_group_span(struct sched_group *sg);