Message-ID: <20210401133917.469929784@infradead.org>
Date: Thu, 01 Apr 2021 15:10:19 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: joel@...lfernandes.org, chris.hyser@...cle.com, joshdon@...gle.com,
mingo@...nel.org, vincent.guittot@...aro.org,
valentin.schneider@....com, mgorman@...e.de
Cc: linux-kernel@...r.kernel.org, peterz@...radead.org, tj@...nel.org,
tglx@...utronix.de
Subject: [PATCH 7/9] sched: Cgroup core-scheduling interface
Implement a basic cgroup core-scheduling interface.
A new cpu.core_sched file is added which takes the values 0 and 1. When
set, the cgroup and all its descendants will be granted the same
cookie and thus allowed to share a core with each other, but not with
system tasks or tasks of other subtrees that might have another
cookie.
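As a usage illustration only, a minimal userspace sketch (the cgroup
path and the group name "A" are assumptions, not part of this patch)
that tags a group by writing '1' to the new file:

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          /* hypothetical v2 cgroup "A"; the cgroup2 mount point may differ */
          int fd = open("/sys/fs/cgroup/A/cpu.core_sched", O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          /* "1": the group and all its descendants get one shared cookie */
          if (write(fd, "1", 1) != 1)
                  perror("write");

          close(fd);
          return 0;
  }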
The file is hierarchical, and a subtree can again set it to 1, in
which case that subtree will get a different cookie and will no longer
share with the parent tree.
For each task, the nearest core_sched parent 'wins'.
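For instance, with a hypothetical layout (group names and settings are
for illustration only):

  A       cpu.core_sched=1  -> tasks in A get cookie_A
  A/B     (unset)           -> nearest core_sched parent is A; shares cookie_A
  A/B/C   cpu.core_sched=1  -> gets its own cookie_C; no longer shares with A
  D       (unset)           -> cookie 0; never shares a core with A, B or C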
Interaction with the prctl() interface is non-existent and left for a
future patch.
Notably, this patch somewhat abuses cgroup_mutex. By holding
cgroup_mutex over the write() operation, which sets the cookie, the
cookie is stable in any cgroup callback (that is called with
cgroup_mutex held). A future patch relies on ss->can_attach() and
ss->attach() being 'atomic', which is hard to do without cgroup_mutex.
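As a hedged sketch of what that buys (illustrative only; the function
below is hypothetical and not part of this series), a cpu controller
callback can rely on the cookie it observes staying put until the
attach completes:

  static int example_core_sched_can_attach(struct cgroup_taskset *tset)
  {
          /* the cpu.core_sched write path also runs under cgroup_mutex */
          lockdep_assert_held(&cgroup_mutex);

          /*
           * Any tg->core_cookie read here remains valid until ->attach()
           * has run, because the write side cannot intervene.
           */
          return 0;
  }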
Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
---
kernel/sched/core.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 7 ++
2 files changed, 157 insertions(+)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5688,10 +5688,53 @@ static inline void sched_core_cpu_starti
}
}
}
+
+void sched_core_cgroup_online(struct task_group *parent, struct task_group *tg)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (parent->core_parent) {
+ WARN_ON_ONCE(parent->core_cookie);
+ WARN_ON_ONCE(!parent->core_parent->core_cookie);
+ tg->core_parent = parent->core_parent;
+
+ } else if (parent->core_cookie) {
+ WARN_ON_ONCE(parent->core_parent);
+ tg->core_parent = parent;
+ }
+}
+
+void sched_core_cgroup_free(struct task_group *tg)
+{
+ sched_core_put_cookie(tg->core_cookie);
+}
+
+unsigned long sched_core_cgroup_cookie(struct task_group *tg)
+{
+ unsigned long cookie = 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (tg->core_cookie)
+ cookie = tg->core_cookie;
+ else if (tg->core_parent)
+ cookie = tg->core_parent->core_cookie;
+
+ return sched_core_get_cookie(cookie);
+}
+
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cgroup_free(struct task_group *tg) { }
+static inline void sched_core_cgroup_online(struct task_group *parent, struct task_group *tg) { }
+
+static inline unsigned long sched_core_cgroup_cookie(struct task_group *tg)
+{
+ return 0;
+}
+
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -9310,6 +9353,7 @@ static void sched_free_group(struct task
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
+ sched_core_cgroup_free(tg);
kmem_cache_free(task_group_cache, tg);
}
@@ -9353,6 +9397,8 @@ void sched_online_group(struct task_grou
spin_unlock_irqrestore(&task_group_lock, flags);
online_fair_sched_group(tg);
+
+ sched_core_cgroup_online(parent, tg);
}
/* rcu callback to free various structures associated with a task group */
@@ -9414,6 +9460,7 @@ void sched_move_task(struct task_struct
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned long cookie;
struct rq_flags rf;
struct rq *rq;
@@ -9443,6 +9490,10 @@ void sched_move_task(struct task_struct
}
task_rq_unlock(rq, tsk, &rf);
+
+ cookie = sched_core_cgroup_cookie(tsk->sched_task_group);
+ cookie = sched_core_update_cookie(tsk, cookie);
+ sched_core_put_cookie(cookie);
}
static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -10050,6 +10101,89 @@ static u64 cpu_rt_period_read_uint(struc
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_SCHED_CORE
+u64 cpu_sched_core_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return !!css_tg(css)->core_cookie;
+}
+
+int cpu_sched_core_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
+{
+ unsigned long cookie = 0, old_cookie = 0;
+ struct task_group *tg = css_tg(css);
+ struct cgroup_subsys_state *cssi;
+ struct task_group *parent = NULL;
+ int ret = 0;
+
+ if (val > 1)
+ return -ERANGE;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -ENODEV;
+
+ mutex_lock(&cgroup_mutex);
+ if (!!val == !!tg->core_cookie)
+ goto unlock;
+
+ old_cookie = tg->core_cookie;
+ if (val) {
+ cookie = sched_core_alloc_cookie();
+ if (!cookie) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+ WARN_ON_ONCE(old_cookie);
+
+ } else if (tg->parent) {
+ if (tg->parent->core_parent)
+ parent = tg->parent->core_parent;
+ else if (tg->parent->core_cookie)
+ parent = tg->parent;
+ }
+
+ WARN_ON_ONCE(cookie && parent);
+
+ tg->core_cookie = sched_core_get_cookie(cookie);
+ tg->core_parent = parent;
+
+ if (cookie)
+ parent = tg;
+ else if (parent)
+ cookie = sched_core_get_cookie(parent->core_cookie);
+
+ css_for_each_descendant_pre(cssi, css) {
+ struct task_group *tgi = css_tg(cssi);
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ if (tgi != tg) {
+ if (tgi->core_cookie || (tgi->core_parent && tgi->core_parent != tg))
+ continue;
+
+ tgi->core_parent = parent;
+ tgi->core_cookie = 0;
+ }
+
+ css_task_iter_start(cssi, 0, &it);
+ while ((p = css_task_iter_next(&it))) {
+ unsigned long p_cookie;
+
+ cookie = sched_core_get_cookie(cookie);
+ p_cookie = sched_core_update_cookie(p, cookie);
+ sched_core_put_cookie(p_cookie);
+ }
+ css_task_iter_end(&it);
+ }
+
+unlock:
+ mutex_unlock(&cgroup_mutex);
+
+ sched_core_put_cookie(cookie);
+ sched_core_put_cookie(old_cookie);
+ return ret;
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -10100,6 +10234,14 @@ static struct cftype cpu_legacy_files[]
.write = cpu_uclamp_max_write,
},
#endif
+#ifdef CONFIG_SCHED_CORE
+ {
+ .name = "core_sched",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_sched_core_read_u64,
+ .write_u64 = cpu_sched_core_write_u64,
+ },
+#endif
{ } /* Terminate */
};
@@ -10281,6 +10423,14 @@ static struct cftype cpu_files[] = {
.write = cpu_uclamp_max_write,
},
#endif
+#ifdef CONFIG_SCHED_CORE
+ {
+ .name = "core_sched",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_sched_core_read_u64,
+ .write_u64 = cpu_sched_core_write_u64,
+ },
+#endif
{ } /* terminate */
};
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -431,6 +431,10 @@ struct task_group {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
+#ifdef CONFIG_SCHED_CORE
+ struct task_group *core_parent;
+ unsigned long core_cookie;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1130,6 +1134,9 @@ static inline bool is_migration_disabled
struct sched_group;
#ifdef CONFIG_SCHED_CORE
+
+extern struct mutex cgroup_mutex; // XXX
+
DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
static inline struct cpumask *sched_group_span(struct sched_group *sg);