Date: Tue,  7 May 2024 13:50:25 +0100
From: Hongyan Xia <hongyan.xia2@....com>
To: Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Juri Lelli <juri.lelli@...hat.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Daniel Bristot de Oliveira <bristot@...hat.com>,
	Valentin Schneider <vschneid@...hat.com>
Cc: Qais Yousef <qyousef@...alina.io>,
	Morten Rasmussen <morten.rasmussen@....com>,
	Lukasz Luba <lukasz.luba@....com>,
	Christian Loehle <christian.loehle@....com>,
	pierre.gondois@....com,
	linux-kernel@...r.kernel.org,
	Hongyan Xia <hongyan.xia2@....com>
Subject: [RFC PATCH v3 2/6] sched/uclamp: Track a new util_avg_bias signal

Add a util_avg_bias signal in sched_avg, which is obtained by:

util_avg_bias = clamp(util_avg, uclamp_min, uclamp_max) - util_avg

The task utilization after considering uclamp is:

util_avg_uclamp = util_avg + util_avg_bias

We then sum up the biases of all tasks on the same rq and use the total
bias to adjust the rq utilization. This is the core idea of uclamp sum
aggregation. The rq utilization then becomes

rq_util_avg_uclamp = rq_util_avg + total_util_avg_bias

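As a quick standalone illustration (plain C, not kernel code; the task
values, the clampi() helper and the two-task array below are made up for
this sketch), the arithmetic above works out as follows:

	#include <stdio.h>

	static int clampi(int v, int lo, int hi)
	{
		return v < lo ? lo : (v > hi ? hi : v);
	}

	int main(void)
	{
		/* { util_avg, uclamp_min, uclamp_max } for two example tasks */
		int tasks[2][3] = {
			{ 100, 300, 1024 },	/* small task boosted to at least 300 */
			{ 600,   0,  200 },	/* big task capped at 200 */
		};
		int rq_util_avg = 0, rq_util_avg_bias = 0;

		for (int i = 0; i < 2; i++) {
			int util = tasks[i][0];
			/* util_avg_bias = clamp(util_avg, uclamp_min, uclamp_max) - util_avg */
			int bias = clampi(util, tasks[i][1], tasks[i][2]) - util;

			rq_util_avg += util;		/* plain PELT sum */
			rq_util_avg_bias += bias;	/* what util_bias_enqueue() accumulates */
		}

		/* rq_util_avg_uclamp = rq_util_avg + total_util_avg_bias */
		printf("rq util_avg %d, total bias %d, util_avg_uclamp %d\n",
		       rq_util_avg, rq_util_avg_bias,
		       rq_util_avg + rq_util_avg_bias);	/* prints 700, -200, 500 */
		return 0;
	}

A per-task bias can be negative (a capped task), so the rq total can pull
the aggregate either way, and it collapses back to plain util_avg when no
clamps are active.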
Signed-off-by: Hongyan Xia <hongyan.xia2@....com>
---
 include/linux/sched.h |  4 +++-
 kernel/sched/debug.c  |  2 +-
 kernel/sched/fair.c   | 16 ++++++++++++++++
 kernel/sched/pelt.c   | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h  | 28 ++++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c75fd46506df..4ea4b8b30f54 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -476,8 +476,10 @@ struct sched_avg {
 	u32				period_contrib;
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
-	unsigned long			util_avg;
+	unsigned int			util_avg;
+	int				util_avg_bias;
 	unsigned int			util_est;
+	unsigned int			util_est_uclamp;
 } ____cacheline_aligned;
 
 /*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a5834d..c4dadb740e96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -682,7 +682,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "runnable_avg",
 			cfs_rq->avg.runnable_avg);
-	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
+	SEQ_printf(m, "  .%-30s: %u\n", "util_avg",
 			cfs_rq->avg.util_avg);
 	SEQ_printf(m, "  .%-30s: %u\n", "util_est",
 			cfs_rq->avg.util_est);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef5bb576ac65..571c8de59508 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1083,6 +1083,7 @@ void post_init_entity_util_avg(struct task_struct *p)
 	}
 
 	sa->runnable_avg = sa->util_avg;
+	sa->util_avg_bias = 0;
 }
 
 #else /* !CONFIG_SMP */
@@ -6801,6 +6802,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	/* At this point se is NULL and we are at root level*/
 	add_nr_running(rq, 1);
+	util_bias_enqueue(&rq->cfs.avg, p);
+	/* XXX: We should skip the update above and only do it once here. */
+	cpufreq_update_util(rq, 0);
 
 	/*
 	 * Since new tasks are assigned an initial util_avg equal to
@@ -6892,6 +6896,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	/* At this point se is NULL and we are at root level*/
 	sub_nr_running(rq, 1);
+	util_bias_dequeue(&rq->cfs.avg, p);
+	/* XXX: We should skip the update above and only do it once here. */
+	cpufreq_update_util(rq, 0);
 
 	/* balance early to pull high priority tasks */
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
@@ -6900,6 +6907,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 dequeue_throttle:
 	util_est_update(&rq->cfs, p, task_sleep);
 	hrtick_update(rq);
+
+#ifdef CONFIG_UCLAMP_TASK
+	if (rq->cfs.h_nr_running == 0) {
+		WARN_ONCE(rq->cfs.avg.util_avg_bias,
+			"0 tasks on CFS of CPU %d, but util_avg_bias is %d\n",
+			rq->cpu, rq->cfs.avg.util_avg_bias);
+		WRITE_ONCE(rq->cfs.avg.util_avg_bias, 0);
+	}
+#endif
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index ef00382de595..56346988182f 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -266,6 +266,40 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
 	WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
 }
 
+#ifdef CONFIG_UCLAMP_TASK
+/* avg must belong to the queue this se is on. */
+static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
+{
+	unsigned int util, uclamp_min, uclamp_max;
+	int old, new;
+
+	/*
+	 * We MUST update the rq summed total every time we update the
+	 * util_avg_bias of a task. If we are on a rq but we are not given the
+	 * rq, then the best thing is to just skip this update.
+	 */
+	if (p->se.on_rq && !avg)
+		return;
+
+	util = READ_ONCE(p->se.avg.util_avg);
+	uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+	uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+	if (uclamp_max == SCHED_CAPACITY_SCALE)
+		uclamp_max = UINT_MAX;
+	old = READ_ONCE(p->se.avg.util_avg_bias);
+	new = (int)clamp(util, uclamp_min, uclamp_max) - (int)util;
+
+	WRITE_ONCE(p->se.avg.util_avg_bias, new);
+	if (!p->se.on_rq)
+		return;
+	WRITE_ONCE(avg->util_avg_bias, READ_ONCE(avg->util_avg_bias) + new - old);
+}
+#else /* !CONFIG_UCLAMP_TASK */
+static void update_util_bias(struct sched_avg *avg, struct task_struct *p)
+{
+}
+#endif
+
 /*
  * sched_entity:
  *
@@ -296,6 +330,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 {
 	if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
 		___update_load_avg(&se->avg, se_weight(se));
+		if (entity_is_task(se))
+			update_util_bias(NULL, task_of(se));
 		trace_pelt_se_tp(se);
 		return 1;
 	}
@@ -310,6 +346,9 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
 
 		___update_load_avg(&se->avg, se_weight(se));
 		cfs_se_util_change(&se->avg);
+		if (entity_is_task(se))
+			update_util_bias(&rq_of(cfs_rq)->cfs.avg,
+					 task_of(se));
 		trace_pelt_se_tp(se);
 		return 1;
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb3792c04eea..aec812e6c6ba 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3095,6 +3095,24 @@ static inline bool uclamp_is_used(void)
 {
 	return static_branch_likely(&sched_uclamp_used);
 }
+
+static inline void util_bias_enqueue(struct sched_avg *avg,
+				     struct task_struct *p)
+{
+	int avg_val = READ_ONCE(avg->util_avg_bias);
+	int p_val = READ_ONCE(p->se.avg.util_avg_bias);
+
+	WRITE_ONCE(avg->util_avg_bias, avg_val + p_val);
+}
+
+static inline void util_bias_dequeue(struct sched_avg *avg,
+				     struct task_struct *p)
+{
+	int avg_val = READ_ONCE(avg->util_avg_bias);
+	int p_val = READ_ONCE(p->se.avg.util_avg_bias);
+
+	WRITE_ONCE(avg->util_avg_bias, avg_val - p_val);
+}
 #else /* CONFIG_UCLAMP_TASK */
 static inline unsigned long uclamp_eff_value(struct task_struct *p,
 					     enum uclamp_id clamp_id)
@@ -3130,6 +3148,16 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
 {
 	return false;
 }
+
+static inline void util_bias_enqueue(struct sched_avg *avg,
+				     struct task_struct *p)
+{
+}
+
+static inline void util_bias_dequeue(struct sched_avg *avg,
+				     struct task_struct *p)
+{
+}
 #endif /* CONFIG_UCLAMP_TASK */
 
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
-- 
2.34.1

