Message-Id: <94048802c665752e92d1d354fdc38dd95ffe4a03.1741091349.git.hongyan.xia2@arm.com>
Date: Tue,  4 Mar 2025 14:23:15 +0000
From: Hongyan Xia <hongyan.xia2@....com>
To: Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Juri Lelli <juri.lelli@...hat.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>
Cc: Morten Rasmussen <morten.rasmussen@....com>,
	Lukasz Luba <lukasz.luba@....com>,
	Christian Loehle <christian.loehle@....com>,
	Pierre Gondois <pierre.gondois@....com>,
	linux-kernel@...r.kernel.org
Subject: [PATCH v2 8/8] sched/uclamp: Solve under-utilization problem

With sum aggregation, a heavily uclamp_max-throttled task may throttle
the whole rq, resulting in a low OPP.

For example, take two always-running tasks of the same priority, where
one task has no uclamp values and the other has a uclamp_max of 1.
Under sum aggregation, the CPU then runs at an OPP for a utilization of
512 + 1 = 513, which means the task without uclamp_max only gets
513 / 2 = 256 utilization, even though the CPU could still run faster.
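
A minimal standalone sketch of this arithmetic (the variable names and
hard-coded values below are only illustrative, not kernel code):

  #include <stdio.h>

  int main(void)
  {
          unsigned int util_a = 512;      /* always running, no uclamp    */
          unsigned int util_b = 512;      /* always running               */
          unsigned int uclamp_max_b = 1;  /* throttled hard by uclamp_max */

          /* Sum aggregation: task B only contributes its clamped util. */
          unsigned int contrib_b = util_b < uclamp_max_b ? util_b : uclamp_max_b;
          unsigned int rq_util = util_a + contrib_b;       /* 512 + 1 = 513 */

          /* Both tasks share the CPU equally, so task A only gets ~256. */
          printf("rq util = %u, task A gets ~%u\n", rq_util, rq_util / 2);
          return 0;
  }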

With this patch, we no longer throttle a uclamp_max task so hard that
it impacts other tasks. This is done by tracking the highest
uclamp_factor on the rq, and no uclamp_max task can throttle more than
this factor allows.
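
For reference, a minimal standalone sketch of the factor arithmetic,
assuming two nice-0 tasks and the stock table values
(sched_prio_to_weight[20] == 1024, sched_prio_to_wmult[20] == 4194304,
i.e. 2^32 / 1024); this is only an illustration, not part of the patch:

  #include <stdio.h>

  int main(void)
  {
          unsigned int weight = 1024;             /* sched_prio_to_weight[20] */
          unsigned long long wmult = 4194304ULL;  /* sched_prio_to_wmult[20]  */

          /* Highest factor on the rq: the unclamped always-running task. */
          unsigned long long max_factor = wmult * 512;  /* weight-normalized util */

          /* Floor applied to the effective uclamp_max of a task of equal weight. */
          unsigned long long uclamp_max_floor = (max_factor * weight) >> 32;

          /* Prints 512: the clamped task can no longer drag the rq down to 513. */
          printf("uclamp_max floor = %llu\n", uclamp_max_floor);
          return 0;
  }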

Signed-off-by: Hongyan Xia <hongyan.xia2@....com>
---
 kernel/sched/fair.c  | 12 ++++++++++++
 kernel/sched/pelt.c  | 33 +++++++++++++++++++++++++++++----
 kernel/sched/sched.h |  2 ++
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 944953b90297..966ca63da3fa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7159,6 +7159,18 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	if (p) {
 		util_bias_dequeue(rq, p);
 		propagate_negative_bias(p);
+		if (p->pid == rq->max_uclamp_factor_pid) {
+			/*
+			 * If the task with the highest uclamp_factor gets
+			 * dequeued, the correct thing to do is to set pid and
+			 * factor to the second highest. However, the overhead
+			 * isn't really necessary because the second highest
+			 * will set these fields the next time it gets updated
+			 * anyway.
+			 */
+			rq->max_uclamp_factor_pid = -1;
+			rq->max_uclamp_factor = 0;
+		}
 	}
 
 	if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index f38abe6f0b8b..e96ca045af2e 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -271,8 +271,8 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
 static void util_bias_update(struct task_struct *p)
 {
 	unsigned int util, uclamp_min, uclamp_max;
-	struct rq *rq;
-	int old, new;
+	struct rq *rq = task_rq(p);
+	int old, new, clamped_util, prio = p->prio - MAX_RT_PRIO;
 
 	util = READ_ONCE(p->se.avg.util_avg);
 	uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
@@ -284,12 +284,37 @@ static void util_bias_update(struct task_struct *p)
 	if (uclamp_max == SCHED_CAPACITY_SCALE)
 		uclamp_max = UINT_MAX;
 	old = READ_ONCE(p->se.avg.util_avg_bias);
-	new = (int)clamp(util, uclamp_min, uclamp_max) - (int)util;
+	clamped_util = (int)clamp(util, uclamp_min, uclamp_max);
+	if (p->se.on_rq && prio >= 0) {
+		/* We only do this for fair class priorities. */
+		u64 uclamp_factor = sched_prio_to_wmult[prio];
+
+		/* This has to be a 64-bit multiplication. */
+		uclamp_factor *= clamped_util;
+		if (rq->max_uclamp_factor_pid == p->pid) {
+			rq->max_uclamp_factor = uclamp_factor;
+		} else if (uclamp_factor > rq->max_uclamp_factor) {
+			rq->max_uclamp_factor = uclamp_factor;
+			rq->max_uclamp_factor_pid = p->pid;
+		} else {
+			u32 weight = sched_prio_to_weight[prio];
+
+			/*
+			 * We cannot throttle too much if some other task is
+			 * running at high utilization. We should prioritize
+			 * giving that task enough utilization and respect
+			 * task priority, before enforcing uclamp_max.
+			 */
+			uclamp_max = max(uclamp_max,
+				(rq->max_uclamp_factor * weight) >> 32);
+			clamped_util = (int)clamp(util, uclamp_min, uclamp_max);
+		}
+	}
+	new = clamped_util - (int)util;
 
 	WRITE_ONCE(p->se.avg.util_avg_bias, new);
 	if (!p->se.on_rq)
 		return;
-	rq = task_rq(p);
 	WRITE_ONCE(rq->cfs.avg.util_avg_bias,
 		   READ_ONCE(rq->cfs.avg.util_avg_bias) + new - old);
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 654eede62979..0dc90208ad73 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1086,6 +1086,8 @@ struct rq {
 	u64			nr_switches;
 
 #ifdef CONFIG_UCLAMP_TASK
+	u64			max_uclamp_factor;
+	pid_t			max_uclamp_factor_pid;
 #endif
 
 	struct cfs_rq		cfs;
-- 
2.34.1

