Message-Id: <20241028063313.8039-3-dtcccc@linux.alibaba.com>
Date: Mon, 28 Oct 2024 14:33:13 +0800
From: Tianchen Ding <dtcccc@...ux.alibaba.com>
To: linux-kernel@...r.kernel.org
Cc: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
Tejun Heo <tj@...nel.org>
Subject: [RFC PATCH 2/2] sched/eevdf: Introduce a cgroup interface for slice
Introduce "cpu.fair_slice" for cgroup v2 and "cpu.fair_slice_us" for v1
according to their name styles. The unit is always microseconds.
A cgroup with shorter slice can preempt others more easily. This could be
useful in container scenarios.
By default, cpu.fair_slice is 0, which means the slice of se is
calculated by min_slice from its cfs_rq. If cpu.fair_slice is set, it
will overwrite se->slice with the customized value.
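
For illustration only (not part of this patch): a minimal userspace
sketch that sets a 500us slice through the new v2 file. The mount point
/sys/fs/cgroup and the child group "demo" are assumptions; writing 0
restores the default min_slice behavior.

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
  	/* Hypothetical group; create it first, e.g. with mkdir(1). */
  	const char *path = "/sys/fs/cgroup/demo/cpu.fair_slice";
  	const char *val = "500";	/* slice in microseconds */
  	int fd = open(path, O_WRONLY);

  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}
  	if (write(fd, val, strlen(val)) < 0)
  		perror("write");
  	close(fd);
  	return 0;
  }
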
Signed-off-by: Tianchen Ding <dtcccc@...ux.alibaba.com>
---
CC Tejun: do we also need this slice interface for sched_ext, and could it be reused there?
---
 kernel/sched/core.c  | 34 ++++++++++++++++++++++++++++++
 kernel/sched/fair.c  | 49 +++++++++++++++++++++++++++++++++++++++-----
 kernel/sched/sched.h |  3 +++
 3 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 114adac5a9c8..8d57b7d88d18 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9690,6 +9690,24 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 }
 #endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_fair_slice_read_u64(struct cgroup_subsys_state *css,
+				   struct cftype *cft)
+{
+	u64 fair_slice_us = css_tg(css)->slice;
+
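+	/* tg->slice is stored in nanoseconds; the interface is in microseconds. */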
+	do_div(fair_slice_us, NSEC_PER_USEC);
+
+	return fair_slice_us;
+}
+
+static int cpu_fair_slice_write_u64(struct cgroup_subsys_state *css,
+				    struct cftype *cftype, u64 fair_slice_us)
+{
+	return sched_group_set_slice(css_tg(css), fair_slice_us);
+}
+#endif
+
 static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_GROUP_SCHED_WEIGHT
 	{
@@ -9703,6 +9721,14 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_idle_write_s64,
 	},
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "fair_slice_us",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_fair_slice_read_u64,
+		.write_u64 = cpu_fair_slice_write_u64,
+	},
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "cfs_quota_us",
@@ -9943,6 +9969,14 @@ static struct cftype cpu_files[] = {
 		.write_s64 = cpu_idle_write_s64,
 	},
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "fair_slice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_fair_slice_read_u64,
+		.write_u64 = cpu_fair_slice_write_u64,
+	},
+#endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "max",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7dc90a6e6e26..694dc0655719 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -797,6 +797,11 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
 	return min_slice;
 }
 
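+/* A group's explicit slice takes precedence; 0 falls back to min_slice. */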
+static inline u64 cfs_rq_slice(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->tg->slice ? : cfs_rq_min_slice(cfs_rq);
+}
+
 static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 {
 	return entity_before(__node_2_se(a), __node_2_se(b));
@@ -6994,7 +6999,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			se->custom_slice = 1;
 		}
 		enqueue_entity(cfs_rq, se, flags);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@ -7018,7 +7023,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		se->slice = slice;
 		min_vruntime_cb_propagate(&se->run_node, NULL);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 
 		cfs_rq->h_nr_running++;
 		cfs_rq->idle_h_nr_running += idle_h_nr_running;
@@ -7093,7 +7098,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		idle_h_nr_running = task_has_idle_policy(p);
 	} else {
 		cfs_rq = group_cfs_rq(se);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 	}
 
 	for_each_sched_entity(se) {
@@ -7118,7 +7123,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
-			slice = cfs_rq_min_slice(cfs_rq);
+			slice = cfs_rq_slice(cfs_rq);
 
 			/* Avoid re-evaluating load for this entity: */
 			se = parent_entity(se);
@@ -7143,7 +7148,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		se->slice = slice;
 		min_vruntime_cb_propagate(&se->run_node, NULL);
-		slice = cfs_rq_min_slice(cfs_rq);
+		slice = cfs_rq_slice(cfs_rq);
 
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
@@ -13535,6 +13540,40 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 
 	return 0;
 }
 
+int sched_group_set_slice(struct task_group *tg, u64 fair_slice_us)
+{
+	u64 slice = 0;
+	int i;
+
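+	/* Reject values that would overflow the us -> ns conversion below. */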
+	if (fair_slice_us > U64_MAX / NSEC_PER_USEC)
+		return -EINVAL;
+
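+	/* A non-zero slice is clamped to [0.1ms, 100ms]. */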
+	if (fair_slice_us) {
+		slice = clamp_t(u64, fair_slice_us * NSEC_PER_USEC,
+				NSEC_PER_MSEC / 10,   /* HZ = 1000 * 10 */
+				NSEC_PER_MSEC * 100); /* HZ = 100  / 10 */
+	}
+
+	if (slice == tg->slice)
+		return 0;
+
+	tg->slice = slice;
+
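+	/* Push the new slice into each CPU's se hierarchy and re-propagate. */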
+	for_each_possible_cpu(i) {
+		struct sched_entity *se = tg->se[i];
+		struct rq *rq = cpu_rq(i);
+
+		guard(rq_lock_irqsave)(rq);
+		for_each_sched_entity(se) {
+			se->custom_slice = 1;
+			se->slice = cfs_rq_slice(group_cfs_rq(se));
+			min_vruntime_cb_propagate(&se->run_node, NULL);
+		}
+	}
+
+	return 0;
+}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b139016cbd9..e02f8715bc04 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -443,6 +443,7 @@ struct task_group {
 	/* runqueue "owned" by this group on each CPU */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+	u64 slice;
 #ifdef CONFIG_SMP
 	/*
 	 * load_avg can be heavily contended at clock tick time, so put
@@ -574,6 +575,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern int sched_group_set_idle(struct task_group *tg, long idle);
 
+extern int sched_group_set_slice(struct task_group *tg, u64 fair_slice_us);
+
 #ifdef CONFIG_SMP
 extern void set_task_rq_fair(struct sched_entity *se,
 			     struct cfs_rq *prev, struct cfs_rq *next);
--
2.39.3