[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250903145124.GM4067720@noisy.programming.kicks-ass.net>
Date: Wed, 3 Sep 2025 16:51:24 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Aaron Lu <ziqianlu@...edance.com>
Cc: Valentin Schneider <vschneid@...hat.com>,
Ben Segall <bsegall@...gle.com>,
K Prateek Nayak <kprateek.nayak@....com>,
Chengming Zhou <chengming.zhou@...ux.dev>,
Josh Don <joshdon@...gle.com>, Ingo Molnar <mingo@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Xi Wang <xii@...gle.com>, linux-kernel@...r.kernel.org,
Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>,
Chuyi Zhou <zhouchuyi@...edance.com>,
Jan Kiszka <jan.kiszka@...mens.com>,
Florian Bezdeka <florian.bezdeka@...mens.com>,
Songtang Liu <liusongtang@...edance.com>,
Chen Yu <yu.c.chen@...el.com>,
Matteo Martelli <matteo.martelli@...ethink.co.uk>,
Michal Koutný <mkoutny@...e.com>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model
On Fri, Aug 29, 2025 at 04:11:18PM +0800, Aaron Lu wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dab4ed86d0c82..25b1014d4ef86 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5291,18 +5291,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>
> if (cfs_rq->nr_queued == 1) {
> check_enqueue_throttle(cfs_rq);
> - if (!throttled_hierarchy(cfs_rq)) {
> - list_add_leaf_cfs_rq(cfs_rq);
> - } else {
> + list_add_leaf_cfs_rq(cfs_rq);
> #ifdef CONFIG_CFS_BANDWIDTH
> + if (throttled_hierarchy(cfs_rq)) {
> struct rq *rq = rq_of(cfs_rq);
>
> if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> cfs_rq->throttled_clock = rq_clock(rq);
> if (!cfs_rq->throttled_clock_self)
> cfs_rq->throttled_clock_self = rq_clock(rq);
> -#endif
> +
> + if (cfs_rq->pelt_clock_throttled) {
> + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> + cfs_rq->throttled_clock_pelt;
> + cfs_rq->pelt_clock_throttled = 0;
> + }
> }
> +#endif
> }
> }
>
> @@ -5450,8 +5451,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (flags & DEQUEUE_DELAYED)
> finish_delayed_dequeue_entity(se);
>
> - if (cfs_rq->nr_queued == 0)
> + if (cfs_rq->nr_queued == 0) {
> update_idle_cfs_rq_clock_pelt(cfs_rq);
> +#ifdef CONFIG_CFS_BANDWIDTH
> + if (throttled_hierarchy(cfs_rq)) {
> + struct rq *rq = rq_of(cfs_rq);
> +
> + list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> + cfs_rq->pelt_clock_throttled = 1;
> + }
> +#endif
> + }
>
> return true;
> }
> @@ -6717,6 +6731,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
> static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
> static void task_throttle_setup_work(struct task_struct *p) {}
> static bool task_is_throttled(struct task_struct *p) { return false; }
> +static void dequeue_throttled_task(struct task_struct *p, int flags) {}
> +static bool enqueue_throttled_task(struct task_struct *p) { return false; }
>
> static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
> {
> @@ -6909,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> int rq_h_nr_queued = rq->cfs.h_nr_queued;
> u64 slice = 0;
>
> + if (task_is_throttled(p) && enqueue_throttled_task(p))
> + return;
> +
> /*
> * The code below (indirectly) updates schedutil which looks at
> * the cfs_rq utilization to select a frequency.
> @@ -6961,10 +6980,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> if (cfs_rq_is_idle(cfs_rq))
> h_nr_idle = 1;
>
> - /* end evaluation on encountering a throttled cfs_rq */
> - if (cfs_rq_throttled(cfs_rq))
> - goto enqueue_throttle;
> -
> flags = ENQUEUE_WAKEUP;
> }
>
> @@ -6986,10 +7001,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>
> if (cfs_rq_is_idle(cfs_rq))
> h_nr_idle = 1;
> -
> - /* end evaluation on encountering a throttled cfs_rq */
> - if (cfs_rq_throttled(cfs_rq))
> - goto enqueue_throttle;
> }
>
> if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
> @@ -7019,7 +7030,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> if (!task_new)
> check_update_overutilized_status(rq);
>
> -enqueue_throttle:
> assert_list_leaf_cfs_rq(rq);
>
> hrtick_update(rq);
> @@ -7074,10 +7084,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> if (cfs_rq_is_idle(cfs_rq))
> h_nr_idle = h_nr_queued;
>
> - /* end evaluation on encountering a throttled cfs_rq */
> - if (cfs_rq_throttled(cfs_rq))
> - return 0;
> -
> /* Don't dequeue parent if it has other entities besides us */
> if (cfs_rq->load.weight) {
> slice = cfs_rq_min_slice(cfs_rq);
> @@ -7114,10 +7120,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>
> if (cfs_rq_is_idle(cfs_rq))
> h_nr_idle = h_nr_queued;
> -
> - /* end evaluation on encountering a throttled cfs_rq */
> - if (cfs_rq_throttled(cfs_rq))
> - return 0;
> }
>
> sub_nr_running(rq, h_nr_queued);
> @@ -7151,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
> */
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> {
> + if (task_is_throttled(p)) {
> + dequeue_throttled_task(p, flags);
> + return true;
> + }
> +
> if (!p->se.sched_delayed)
> util_est_dequeue(&rq->cfs, p);
>
OK, so this makes it so that a task is either fully enqueued (on all
cfs_rq's) or not enqueued at all. A group cfs_rq is only marked throttled
when all its tasks are gone, and unthrottled when a task gets added. Right?
But propagate_entity_cfs_rq() is still doing the old thing, and has an
'if (cfs_rq_throttled(cfs_rq)) break;' inside the for_each_sched_entity()
iteration.
This seems somewhat inconsistent; or am I missing something?
Powered by blists - more mailing lists