linux-kernel - [PATCH v2 6/7] sched/fair: Add sched group latency support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220512163534.2572-7-vincent.guittot@linaro.org>
Date:   Thu, 12 May 2022 18:35:33 +0200
From:   Vincent Guittot <vincent.guittot@...aro.org>
To:     mingo@...hat.com, peterz@...radead.org, juri.lelli@...hat.com,
        dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
        mgorman@...e.de, bristot@...hat.com, vschneid@...hat.com,
        linux-kernel@...r.kernel.org, parth@...ux.ibm.com
Cc:     qais.yousef@....com, chris.hyser@...cle.com,
        valentin.schneider@....com, patrick.bellasi@...bug.net,
        David.Laight@...lab.com, pjt@...gle.com, pavel@....cz,
        tj@...nel.org, qperret@...gle.com, tim.c.chen@...ux.intel.com,
        joshdon@...gle.com, Vincent Guittot <vincent.guittot@...aro.org>
Subject: [PATCH v2 6/7] sched/fair: Add sched group latency support

Tasks can set its latency priority which is then used to decide to preempt
the current running entity of the cfs but sched group entities still have
the default latency priority.

Add a latency field in task group to set the latency priority of the group
which will be used against other entity in the parent cfs.

Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
---

For v1, there were dicussions about the best interface to express some latency
constraints for a cfs group. The weight seems to be specific to the wakeup path
and can't be easily reused elsewhere like EAS or idle cpu selection path.
The current proposal of a latency prio ranging from [0:40] seems the simplest
interface but on the other side, it removes the notion of either having latency
constaint with a negative value or relaxing the latency with positive value;
This is an important and easy to understand difference. So I tend to think that
keeping the latency nice is the easiest way to express if a group has latency
constraint (negative value) or don't care about scheduling latency
(positive value). Using a range will then help to order groups' constraint.

I have studied how we can use a duration but this will mainly provide confusion
to the user because whatever the value, we will never be able to ensure it.


 kernel/sched/core.c  | 41 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c  | 32 ++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  4 ++++
 3 files changed, 71 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ff020b99625c..95f3ef54447e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10798,6 +10798,30 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 {
 	return sched_group_set_idle(css_tg(css), idle);
 }
+
+static s64 cpu_latency_read_s64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	return css_tg(css)->latency_prio;
+}
+
+static int cpu_latency_write_s64(struct cgroup_subsys_state *css,
+				struct cftype *cft, s64 latency_prio)
+{
+	return sched_group_set_latency(css_tg(css), latency_prio);
+}
+
+static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	return LATENCY_TO_NICE(css_tg(css)->latency_prio);
+}
+
+static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css,
+				struct cftype *cft, s64 latency_nice)
+{
+	return sched_group_set_latency(css_tg(css), NICE_TO_LATENCY(latency_nice));
+}
 #endif
 
 static struct cftype cpu_legacy_files[] = {
@@ -10812,6 +10836,11 @@ static struct cftype cpu_legacy_files[] = {
 		.read_s64 = cpu_idle_read_s64,
 		.write_s64 = cpu_idle_write_s64,
 	},
+	{
+		.name = "latency",
+		.read_s64 = cpu_latency_nice_read_s64,
+		.write_s64 = cpu_latency_nice_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
@@ -11029,6 +11058,12 @@ static struct cftype cpu_files[] = {
 		.read_s64 = cpu_idle_read_s64,
 		.write_s64 = cpu_idle_write_s64,
 	},
+	{
+		.name = "latency",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_latency_nice_read_s64,
+		.write_s64 = cpu_latency_nice_write_s64,
+	},
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3af74f1a79ca..71c0762491c5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11529,6 +11529,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		goto err;
 
 	tg->shares = NICE_0_LOAD;
+	tg->latency_prio = DEFAULT_LATENCY_PRIO;
 
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
@@ -11627,6 +11628,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	}
 
 	se->my_q = cfs_rq;
+	se->latency_weight = sched_latency_to_weight[tg->latency_prio];
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
@@ -11757,6 +11759,36 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	return 0;
 }
 
+int sched_group_set_latency(struct task_group *tg, long latency_prio)
+{
+	int i;
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	if (latency_prio < 0 ||
+	    latency_prio > LATENCY_NICE_WIDTH)
+		return -EINVAL;
+
+	mutex_lock(&shares_mutex);
+
+	if (tg->latency_prio == latency_prio) {
+		mutex_unlock(&shares_mutex);
+		return 0;
+	}
+
+	tg->latency_prio = latency_prio;
+
+	for_each_possible_cpu(i) {
+		struct sched_entity *se = tg->se[i];
+
+		WRITE_ONCE(se->latency_weight, sched_latency_to_weight[latency_prio]);
+	}
+
+	mutex_unlock(&shares_mutex);
+	return 0;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 831b2c8feff1..0c26bb5a742e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -414,6 +414,8 @@ struct task_group {
 
 	/* A positive value indicates that this is a SCHED_IDLE group. */
 	int			idle;
+	/* latency priority of the group. */
+	int			latency_prio;
 
 #ifdef	CONFIG_SMP
 	/*
@@ -527,6 +529,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern int sched_group_set_idle(struct task_group *tg, long idle);
 
+extern int sched_group_set_latency(struct task_group *tg, long latency);
+
 #ifdef CONFIG_SMP
 extern void set_task_rq_fair(struct sched_entity *se,
 			     struct cfs_rq *prev, struct cfs_rq *next);
-- 
2.17.1