[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20211222093802.22357-2-vincent.guittot@linaro.org>
Date: Wed, 22 Dec 2021 10:38:00 +0100
From: Vincent Guittot <vincent.guittot@...aro.org>
To: mingo@...hat.com, peterz@...radead.org, juri.lelli@...hat.com,
dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com,
mgorman@...e.de, bristot@...hat.com, linux-kernel@...r.kernel.org,
rickyiu@...gle.com, odin@...d.al
Cc: sachinp@...ux.vnet.ibm.com, naresh.kamboju@...aro.org,
Vincent Guittot <vincent.guittot@...aro.org>
Subject: [PATCH v2 1/3] sched/pelt: Don't sync hardly util_sum with uti_avg
Rick reported performance regressions in bugzilla because of cpu frequency
being lower than before:
https://bugzilla.kernel.org/show_bug.cgi?id=215045
He bisected the problem to:
commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent")
This commit forces util_sum to be synced with the new util_avg after
removing the contribution of a task and before the next periodic sync. By
doing so util_sum is rounded to its lower bound and might lost up to
LOAD_AVG_MAX-1 of accumulated contribution which has not yet been
reflected in util_avg.
Instead of always setting util_sum to the low bound of util_avg, which can
significantly lower the utilization of root cfs_rq after propagating the
change down into the hierarchy, we revert the commit and propagate the
difference in util_sum.
In addition, we also check that cfs's util_sum always stays above the
lower bound for a given util_avg as it has been observed that
sched_entity's util_sum is sometimes above cfs one.
Fixes: 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent")
Reported-by: Rick Yiu <rickyiu@...gle.com>
Signed-off-by: Vincent Guittot <vincent.guittot@...aro.org>
---
kernel/sched/fair.c | 41 ++++++++++++++++++++++++++++++++---------
1 file changed, 32 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 095b0aa378df..9ac28f0f3645 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3381,6 +3381,7 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
+#define MIN_DIVIDER (LOAD_AVG_MAX - 1024)
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
@@ -3449,15 +3450,14 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3466,13 +3466,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum (XXX fix the rounding). Although this is not
+ * a problem by itself, detaching a lot of tasks with the rounding
+ * problem between 2 updates of util_avg (~1ms) can make cfs->util_sum
+ * becoming null whereas cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * MIN_DIVIDER);
}
static inline void
@@ -3681,7 +3698,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_util;
sub_positive(&sa->util_avg, r);
- sa->util_sum = sa->util_avg * divider;
+ sub_positive(&sa->util_sum, r * divider);
+ /* See update_tg_cfs_util() */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
@@ -3780,7 +3799,11 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_tg_cfs_util() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
--
2.17.1
Powered by blists - more mailing lists