Message-Id: <94048802c665752e92d1d354fdc38dd95ffe4a03.1741091349.git.hongyan.xia2@arm.com>
Date: Tue, 4 Mar 2025 14:23:15 +0000
From: Hongyan Xia <hongyan.xia2@....com>
To: Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Juri Lelli <juri.lelli@...hat.com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>
Cc: Morten Rasmussen <morten.rasmussen@....com>,
Lukasz Luba <lukasz.luba@....com>,
Christian Loehle <christian.loehle@....com>,
Pierre Gondois <pierre.gondois@....com>,
linux-kernel@...r.kernel.org
Subject: [PATCH v2 8/8] sched/uclamp: Solve under-utilization problem

With sum aggregation, a heavily uclamp_max-throttled task may drag the
utilization of the whole rq down with it, resulting in a low OPP.

For example, consider two always-running tasks with the same priority:
one has no uclamp values and the other has a uclamp_max of 1. Under sum
aggregation, the CPU then runs at an OPP sized for 512 + 1 = 513
utilization, which means the task without uclamp_max only gets 513 / 2
= 256 utilization, even though the CPU could still run faster.
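
The arithmetic can be sketched in a small userspace C program (this is
only an illustration of the numbers, not kernel code; clamp_util() is a
stand-in for the kernel's clamp()):

/*
 * Illustration only, not kernel code: two always-running tasks on one
 * CPU each settle at util_avg of about 512; one of them is clamped by
 * uclamp_max = 1 and the rq utilization is the sum of the clamped
 * contributions.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024

static unsigned int clamp_util(unsigned int util, unsigned int lo,
			       unsigned int hi)
{
	return util < lo ? lo : util > hi ? hi : util;
}

int main(void)
{
	unsigned int util = 512;				/* each task's util_avg */
	unsigned int no_clamp = clamp_util(util, 0, SCHED_CAPACITY_SCALE);
	unsigned int clamped = clamp_util(util, 0, 1);		/* uclamp_max = 1 */
	unsigned int rq_util = no_clamp + clamped;		/* sum aggregation */

	printf("rq utilization: %u\n", rq_util);		/* 513 */
	printf("unclamped task's share: %u\n", rq_util / 2);	/* 256 */
	return 0;
}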

With this patch, we do not throttle a uclamp_max task so hard that it
impacts other tasks. This is done by tracking the highest uclamp_factor
on the rq, and a uclamp_max task cannot be clamped further than this
factor allows.
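
Below is a rough userspace sketch of the idea. The nice-0 constants
mirror sched_prio_to_weight[] and sched_prio_to_wmult[]; fake_rq and
relax_uclamp_max() are made-up names for illustration only, not part of
this patch.

/*
 * Illustration only, not kernel code. For nice-0, weight * wmult is
 * 2^32, so (max_uclamp_factor * weight) >> 32 recovers the clamped
 * utilization of the task that set the factor, scaled by the relative
 * weights of the two tasks.
 */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_WEIGHT	1024ULL		/* sched_prio_to_weight[20] */
#define NICE_0_WMULT	4194304ULL	/* sched_prio_to_wmult[20] = 2^32 / 1024 */

struct fake_rq {
	uint64_t max_uclamp_factor;
};

/* Relax uclamp_max so the task cannot throttle below the rq's factor. */
static unsigned int relax_uclamp_max(struct fake_rq *rq, unsigned int weight,
				     unsigned int uclamp_max)
{
	unsigned int floor = (rq->max_uclamp_factor * weight) >> 32;

	return uclamp_max > floor ? uclamp_max : floor;
}

int main(void)
{
	struct fake_rq rq = { 0 };
	unsigned int util = 512;	/* each task runs ~50% of the time */

	/* The unclamped nice-0 task records its factor first. */
	rq.max_uclamp_factor = NICE_0_WMULT * util;

	/* The uclamp_max = 1 task then sees a relaxed clamp. */
	printf("effective uclamp_max: %u\n",
	       relax_uclamp_max(&rq, NICE_0_WEIGHT, 1));
	return 0;
}

In the two-task example above, this lifts the effective uclamp_max of
the clamped task from 1 to 512, so the unclamped task keeps its full
512 share.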
Signed-off-by: Hongyan Xia <hongyan.xia2@....com>
---
kernel/sched/fair.c | 12 ++++++++++++
kernel/sched/pelt.c | 33 +++++++++++++++++++++++++++++----
kernel/sched/sched.h | 2 ++
3 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 944953b90297..966ca63da3fa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7159,6 +7159,18 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
if (p) {
util_bias_dequeue(rq, p);
propagate_negative_bias(p);
+ if (p->pid == rq->max_uclamp_factor_pid) {
+ /*
+ * If the task with the highest uclamp_factor gets
+ * dequeued, the correct thing to do is to set pid and
+ * factor to the second highest. However, the overhead
+ * isn't really necessary because the second highest
+ * will set these fields the next time it gets updated
+ * anyway.
+ */
+ rq->max_uclamp_factor_pid = -1;
+ rq->max_uclamp_factor = 0;
+ }
}
if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index f38abe6f0b8b..e96ca045af2e 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -271,8 +271,8 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
static void util_bias_update(struct task_struct *p)
{
unsigned int util, uclamp_min, uclamp_max;
- struct rq *rq;
- int old, new;
+ struct rq *rq = task_rq(p);
+ int old, new, clamped_util, prio = p->prio - MAX_RT_PRIO;
util = READ_ONCE(p->se.avg.util_avg);
uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
@@ -284,12 +284,37 @@ static void util_bias_update(struct task_struct *p)
if (uclamp_max == SCHED_CAPACITY_SCALE)
uclamp_max = UINT_MAX;
old = READ_ONCE(p->se.avg.util_avg_bias);
- new = (int)clamp(util, uclamp_min, uclamp_max) - (int)util;
+ clamped_util = (int)clamp(util, uclamp_min, uclamp_max);
+ if (p->se.on_rq && prio >= 0) {
+ /* We only do this for fair class priorities. */
+ u64 uclamp_factor = sched_prio_to_wmult[prio];
+
+ /* This has to be a 64-bit multiplication. */
+ uclamp_factor *= clamped_util;
+ if (rq->max_uclamp_factor_pid == p->pid) {
+ rq->max_uclamp_factor = uclamp_factor;
+ } else if (uclamp_factor > rq->max_uclamp_factor) {
+ rq->max_uclamp_factor = uclamp_factor;
+ rq->max_uclamp_factor_pid = p->pid;
+ } else {
+ u32 weight = sched_prio_to_weight[prio];
+
+ /*
+ * We cannot throttle too much if some other task is
+ * running at high utilization. We should prioritize
+ * giving that task enough utilization and respect
+ * task priority, before enforcing uclamp_max.
+ */
+ uclamp_max = max(uclamp_max,
+ (rq->max_uclamp_factor * weight) >> 32);
+ clamped_util = (int)clamp(util, uclamp_min, uclamp_max);
+ }
+ }
+ new = clamped_util - (int)util;
WRITE_ONCE(p->se.avg.util_avg_bias, new);
if (!p->se.on_rq)
return;
- rq = task_rq(p);
WRITE_ONCE(rq->cfs.avg.util_avg_bias,
READ_ONCE(rq->cfs.avg.util_avg_bias) + new - old);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 654eede62979..0dc90208ad73 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1086,6 +1086,8 @@ struct rq {
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
+ u64 max_uclamp_factor;
+ pid_t max_uclamp_factor_pid;
#endif
struct cfs_rq cfs;
--
2.34.1