Message-Id: <4f755ae12895bbc74a74bac56bf2ef0f30413a32.1706792708.git.hongyan.xia2@arm.com>
Date: Thu,  1 Feb 2024 13:12:00 +0000
From: Hongyan Xia <hongyan.xia2@....com>
To: Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Juri Lelli <juri.lelli@...hat.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Daniel Bristot de Oliveira <bristot@...hat.com>,
	Valentin Schneider <vschneid@...hat.com>,
	"Rafael J. Wysocki" <rafael@...nel.org>,
	Viresh Kumar <viresh.kumar@...aro.org>
Cc: Qais Yousef <qyousef@...alina.io>,
	Morten Rasmussen <morten.rasmussen@....com>,
	Lukasz Luba <lukasz.luba@....com>,
	Christian Loehle <christian.loehle@....com>,
	linux-kernel@...r.kernel.org,
	David Dai <davidai@...gle.com>,
	Saravana Kannan <saravanak@...gle.com>,
	linux-pm@...r.kernel.org
Subject: [RFC PATCH v2 4/7] sched/fair: Use CFS util_avg_uclamp for utilization and frequency

Switch to the new util_avg_uclamp for task and runqueue utilization.
Since task_util_est() calls task_util(), which now uses util_avg_uclamp,
util_est is now also a clamped value.

Now that we have the sum-aggregated CFS utilization value, we no longer
need to consult the uclamp buckets to know how the frequency should be
clamped. We simply look at the aggregated top-level root_cfs_util_uclamp
to decide what frequency to choose.
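
As a standalone illustration of the two schemes (plain userspace C, not
kernel code; the task numbers are made up), clamping each task and then
summing can give a different rq-level value than clamping the raw sum
with the max-aggregated bucket values:

#include <stdio.h>

struct toy_task {
	unsigned long util_avg;		/* raw PELT utilization */
	unsigned long uclamp_min;
	unsigned long uclamp_max;
};

static unsigned long toy_clamp(unsigned long v, unsigned long lo,
			       unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	/* Two made-up tasks runnable on the same CPU. */
	struct toy_task tasks[] = {
		{ .util_avg = 300, .uclamp_min =   0, .uclamp_max =  100 },
		{ .util_avg = 200, .uclamp_min = 512, .uclamp_max = 1024 },
	};
	unsigned long raw_sum = 0, clamped_sum = 0;
	unsigned long rq_min = 0, rq_max = 0;
	int i;

	for (i = 0; i < 2; i++) {
		raw_sum += tasks[i].util_avg;
		/* Sum aggregation: clamp each task, then sum. */
		clamped_sum += toy_clamp(tasks[i].util_avg,
					 tasks[i].uclamp_min,
					 tasks[i].uclamp_max);
		/* Max aggregation: track the highest active clamp values. */
		if (tasks[i].uclamp_min > rq_min)
			rq_min = tasks[i].uclamp_min;
		if (tasks[i].uclamp_max > rq_max)
			rq_max = tasks[i].uclamp_max;
	}

	printf("raw rq sum:                       %lu\n", raw_sum);
	printf("max aggregation (rq-level clamp): %lu\n",
	       toy_clamp(raw_sum, rq_min, rq_max));
	printf("sum aggregation (per-task clamp): %lu\n", clamped_sum);
	return 0;
}

With these numbers, max aggregation requests 512 while sum aggregation
requests 612. The real path also adds RT/DL/IRQ contributions and
schedutil headroom, which the toy ignores; the point is only that the
sum-aggregated value can be fed to frequency selection directly, with
no further rq-level clamping.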

TODO: Sum aggregation for RT tasks. I have already implemented RT sum
aggregation, which is only 49 lines of code, but I don't want RT to
distract from this series, which is mainly CFS-focused. RT will be sent
in a separate mini-series.

Signed-off-by: Hongyan Xia <hongyan.xia2@....com>
---
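Reviewer note: the sched.h hunk below changes what uclamp_rq_is_capped()
means. The rq now counts as capped only when the clamped CFS + RT sum
sits below SCHED_CAPACITY_SCALE while the raw utilization exceeds that
clamped sum, i.e. uclamp_max is actually holding the CPU back. A
standalone toy check (userspace C, not kernel code, made-up numbers):

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

/* Mirrors the condition in the new uclamp_rq_is_capped(). */
static bool toy_rq_is_capped(unsigned long uclamp_util,
			     unsigned long real_util)
{
	return uclamp_util < SCHED_CAPACITY_SCALE && real_util > uclamp_util;
}

int main(void)
{
	printf("%d\n", toy_rq_is_capped(300, 700));   /* 1: clamp limits the rq */
	printf("%d\n", toy_rq_is_capped(700, 700));   /* 0: clamp has no effect */
	printf("%d\n", toy_rq_is_capped(1024, 2000)); /* 0: CPU saturated anyway */
	return 0;
}

The same check is why cpu_util() below only applies the runnable boost
when the rq is not capped.
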
 kernel/sched/core.c              | 17 ++++----------
 kernel/sched/cpufreq_schedutil.c | 10 ++------
 kernel/sched/fair.c              | 39 ++++++++++++++++----------------
 kernel/sched/sched.h             | 21 +++++++++++++----
 4 files changed, 42 insertions(+), 45 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db4be4921e7f..0bedc05c883f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7465,6 +7465,9 @@ int sched_core_idle_cpu(int cpu)
  * The DL bandwidth number otoh is not a measured metric but a value computed
  * based on the task model parameters and gives the minimal utilization
  * required to meet deadlines.
+ *
+ * The util_cfs parameter has already taken uclamp into account (unless uclamp
+ * support is not compiled in).
  */
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 				 unsigned long *min,
@@ -7490,13 +7493,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	}
 
 	if (min) {
-		/*
-		 * The minimum utilization returns the highest level between:
-		 * - the computed DL bandwidth needed with the IRQ pressure which
-		 *   steals time to the deadline task.
-		 * - The minimum performance requirement for CFS and/or RT.
-		 */
-		*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+		*min = irq + cpu_bw_dl(rq);
 
 		/*
 		 * When an RT task is runnable and uclamp is not used, we must
@@ -7515,12 +7512,8 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 	util = util_cfs + cpu_util_rt(rq);
 	util += cpu_util_dl(rq);
 
-	/*
-	 * The maximum hint is a soft bandwidth requirement, which can be lower
-	 * than the actual utilization because of uclamp_max requirements.
-	 */
 	if (max)
-		*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+		*max = scale;
 
 	if (util >= scale)
 		return scale;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 95c3c097083e..48a4e4a685d0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -381,11 +381,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 	/*
 	 * Do not reduce the frequency if the CPU has not been idle
 	 * recently, as the reduction is likely to be premature then.
-	 *
-	 * Except when the rq is capped by uclamp_max.
 	 */
-	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+	if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
 	    !sg_policy->need_freq_update) {
 		next_f = sg_policy->next_freq;
 
@@ -435,11 +432,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 	/*
 	 * Do not reduce the target performance level if the CPU has not been
 	 * idle recently, as the reduction is likely to be premature then.
-	 *
-	 * Except when the rq is capped by uclamp_max.
 	 */
-	if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
-	    sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+	if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
 		sg_cpu->util = prev_util;
 
 	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 36357cfaf48d..b92739e1c52f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4821,10 +4821,17 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long task_util(struct task_struct *p)
+{
+	return READ_ONCE(p->se.avg.util_avg_uclamp);
+}
+#else
 static inline unsigned long task_util(struct task_struct *p)
 {
 	return READ_ONCE(p->se.avg.util_avg);
 }
+#endif
 
 static inline unsigned long task_runnable(struct task_struct *p)
 {
@@ -4932,8 +4939,13 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * To avoid underestimate of task utilization, skip updates of EWMA if
 	 * we cannot grant that thread got all CPU time it wanted.
 	 */
-	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
+	if ((READ_ONCE(p->se.avg.util_avg) + UTIL_EST_MARGIN) <
+			task_runnable(p)) {
+		ewma = clamp(ewma,
+			     uclamp_eff_value(p, UCLAMP_MIN),
+			     uclamp_eff_value(p, UCLAMP_MAX));
 		goto done;
+	}
 
 
 	/*
@@ -7685,11 +7697,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 static unsigned long
 cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 {
-	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
-	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+	struct rq *rq = cpu_rq(cpu);
+	struct cfs_rq *cfs_rq = &rq->cfs;
+	unsigned long util = root_cfs_util(rq);
+	bool capped = uclamp_rq_is_capped(rq);
 	unsigned long runnable;
 
-	if (boost) {
+	if (boost && !capped) {
 		runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
 		util = max(util, runnable);
 	}
@@ -7867,7 +7881,6 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 	int cpu;
 
 	for_each_cpu(cpu, pd_cpus) {
-		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
 		unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
 		unsigned long eff_util, min, max;
 
@@ -7880,20 +7893,6 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 		 */
 		eff_util = effective_cpu_util(cpu, util, &min, &max);
 
-		/* Task's uclamp can modify min and max value */
-		if (tsk && uclamp_is_used()) {
-			min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
-
-			/*
-			 * If there is no active max uclamp constraint,
-			 * directly use task's one, otherwise keep max.
-			 */
-			if (uclamp_rq_is_idle(cpu_rq(cpu)))
-				max = uclamp_eff_value(p, UCLAMP_MAX);
-			else
-				max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
-		}
-
 		eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
 		max_util = max(max_util, eff_util);
 	}
@@ -7996,7 +7995,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	target = prev_cpu;
 
 	sync_entity_load_avg(&p->se);
-	if (!task_util_est(p) && p_util_min == 0)
+	if (!task_util_est(p))
 		goto unlock;
 
 	eenv_task_busy_time(&eenv, p, prev_cpu);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce80b87b549b..3ee28822f48f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3062,16 +3062,17 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
 /* Is the rq being capped/throttled by uclamp_max? */
 static inline bool uclamp_rq_is_capped(struct rq *rq)
 {
-	unsigned long rq_util;
-	unsigned long max_util;
+	unsigned long rq_uclamp_util, rq_real_util;
 
 	if (!static_branch_likely(&sched_uclamp_used))
 		return false;
 
-	rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
-	max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+	rq_uclamp_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+	rq_real_util = READ_ONCE(rq->cfs.avg.util_avg) +
+		       READ_ONCE(rq->avg_rt.util_avg);
 
-	return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+	return rq_uclamp_util < SCHED_CAPACITY_SCALE &&
+	       rq_real_util > rq_uclamp_util;
 }
 
 /*
@@ -3087,6 +3088,11 @@ static inline bool uclamp_is_used(void)
 	return static_branch_likely(&sched_uclamp_used);
 }
 
+static inline unsigned long root_cfs_util(struct rq *rq)
+{
+	return READ_ONCE(rq->root_cfs_util_uclamp);
+}
+
 static inline void util_uclamp_enqueue(struct sched_avg *avg,
 				       struct task_struct *p)
 {
@@ -3160,6 +3166,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
 	return false;
 }
 
+static inline unsigned long root_cfs_util(struct rq *rq)
+{
+	return READ_ONCE(rq->cfs.avg.util_avg);
+}
+
 static inline void remove_root_cfs_util_uclamp(struct task_struct *p)
 {
 }
-- 
2.34.1

