Message-ID: <3fdb7163-d1f0-45c8-89fa-7c904b567696@amd.com>
Date: Fri, 14 Mar 2025 09:23:47 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Aaron Lu <ziqianlu@...edance.com>, Valentin Schneider
<vschneid@...hat.com>, Ben Segall <bsegall@...gle.com>, Peter Zijlstra
<peterz@...radead.org>, Josh Don <joshdon@...gle.com>, Ingo Molnar
<mingo@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>
CC: <linux-kernel@...r.kernel.org>, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>, Chengming Zhou
<chengming.zhou@...ux.dev>, Chuyi Zhou <zhouchuyi@...edance.com>
Subject: Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based
throttle
Hello Aaron,
On 3/13/2025 12:51 PM, Aaron Lu wrote:
[..snip..]
> ---
> kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
> 1 file changed, 45 insertions(+), 87 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ab403ff7d53c8..4a95fe3785e43 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>
> if (cfs_rq->nr_queued == 1) {
> check_enqueue_throttle(cfs_rq);
> - if (!throttled_hierarchy(cfs_rq)) {
> - list_add_leaf_cfs_rq(cfs_rq);
> - } else {
> + list_add_leaf_cfs_rq(cfs_rq);
> #ifdef CONFIG_CFS_BANDWIDTH
> + if (throttled_hierarchy(cfs_rq)) {
> struct rq *rq = rq_of(cfs_rq);
>
> if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
> cfs_rq->throttled_clock = rq_clock(rq);
> if (!cfs_rq->throttled_clock_self)
> cfs_rq->throttled_clock_self = rq_clock(rq);
These bits probably need revisiting. From what I understand, these
stats were maintained to detect when a task was woken up on a
throttled hierarchy that was not connected to its parent, essentially
tracking the amount of time runnable tasks were waiting on the
cfs_rq before an unthrottle event allowed them to be picked.

With per-task throttle, these semantics no longer apply, since a woken
task will run and dequeue itself when exiting to userspace.

Josh, do you have any thoughts on this?
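To make the question concrete, below is a rough sketch from my side
(not something this series does; it assumes the limbo list is where
tasks end up once they are truly no longer runnable): start the self
clock when the first task is actually parked on throttled_limbo_list,
rather than on enqueue into a throttled hierarchy.

	/*
	 * Sketch only: wherever the series moves a task onto
	 * cfs_rq->throttled_limbo_list, start the self clock on the
	 * first parked task instead of on enqueue into a throttled
	 * hierarchy.
	 */
	if (list_empty(&cfs_rq->throttled_limbo_list) &&
	    !cfs_rq->throttled_clock_self)
		cfs_rq->throttled_clock_self = rq_clock(rq_of(cfs_rq));
	list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);

That would keep throttled_clock_self_time closer to its original
meaning of "time tasks spent parked waiting for an unthrottle". Just
thinking out loud.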
> -#endif
> }
> +#endif
> }
> }
>
> @@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> if (flags & DEQUEUE_DELAYED)
> finish_delayed_dequeue_entity(se);
>
> - if (cfs_rq->nr_queued == 0)
> + if (cfs_rq->nr_queued == 0) {
> update_idle_cfs_rq_clock_pelt(cfs_rq);
> + if (throttled_hierarchy(cfs_rq))
> + list_del_leaf_cfs_rq(cfs_rq);
> + }
>
> return true;
> }
> @@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
> throttled_hierarchy(dest_cfs_rq);
> }
>
> +static inline bool task_is_throttled(struct task_struct *p)
> +{
> + return !list_empty(&p->throttle_node);
> +}
> +
> static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static void throttle_cfs_rq_work(struct callback_head *work)
> {
> @@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
> INIT_LIST_HEAD(&p->throttle_node);
> }
>
> +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
> static int tg_unthrottle_up(struct task_group *tg, void *data)
> {
> struct rq *rq = data;
> struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> + struct task_struct *p, *tmp;
>
> cfs_rq->throttle_count--;
> - if (!cfs_rq->throttle_count) {
> - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> - cfs_rq->throttled_clock_pelt;
> + if (cfs_rq->throttle_count)
> + return 0;
>
> - /* Add cfs_rq with load or one or more already running entities to the list */
> - if (!cfs_rq_is_decayed(cfs_rq))
> - list_add_leaf_cfs_rq(cfs_rq);
> + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> + cfs_rq->throttled_clock_pelt;
>
> - if (cfs_rq->throttled_clock_self) {
> - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> + if (cfs_rq->throttled_clock_self) {
> + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
>
> - cfs_rq->throttled_clock_self = 0;
> + cfs_rq->throttled_clock_self = 0;
>
> - if (SCHED_WARN_ON((s64)delta < 0))
> - delta = 0;
> + if (SCHED_WARN_ON((s64)delta < 0))
> + delta = 0;
>
> - cfs_rq->throttled_clock_self_time += delta;
> - }
> + cfs_rq->throttled_clock_self_time += delta;
> }
>
> + /* Re-enqueue the tasks that have been throttled at this level. */
> + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
> + list_del_init(&p->throttle_node);
> + /*
> + * FIXME: p may not be allowed to run on this rq anymore
> + * due to affinity change while p is throttled.
> + */
Using dequeue_task_fair() for throttle should ensure that the core now
sees task_on_rq_queued(), which should make an affinity change go
through a full dequeue cycle that removes the task from the
"throttled_limbo_list"; the subsequent enqueue should then put it back
on the correct runqueue.

Is the above FIXME comment inaccurate with your changes, or did I miss
something?
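For illustration, a minimal sketch of the dequeue side I have in mind
(my own hypothetical helper, not code from the series, assuming
throttle_node links the task into the per-cfs_rq limbo list):

	/*
	 * Hypothetical helper, sketch only: when a throttled task is
	 * dequeued (e.g. for migration after an affinity change), drop
	 * it from the limbo list so the enqueue on the new rq starts
	 * from a clean state.
	 */
	static void dequeue_throttled_task(struct task_struct *p)
	{
		if (!list_empty(&p->throttle_node))
			list_del_init(&p->throttle_node);
	}

If the dequeue path already does the equivalent of this, the FIXME
above seems stale.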
> + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> + }
> +
> + /* Add cfs_rq with load or one or more already running entities to the list */
> + if (!cfs_rq_is_decayed(cfs_rq))
> + list_add_leaf_cfs_rq(cfs_rq);
> +
> return 0;
> }
>
> @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)
>
> /* group is entering throttled state, stop time */
> cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> - list_del_leaf_cfs_rq(cfs_rq);
>
> SCHED_WARN_ON(cfs_rq->throttled_clock_self);
> if (cfs_rq->nr_queued)
> cfs_rq->throttled_clock_self = rq_clock(rq);
>
> + if (!cfs_rq->nr_queued) {
> + list_del_leaf_cfs_rq(cfs_rq);
> + return 0;
> + }
> +
This bit can perhaps go in Patch 2?
> WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
> /*
> * rq_lock is held, current is (obviously) executing this in kernelspace.
> @@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
> {
> struct rq *rq = rq_of(cfs_rq);
> struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> - struct sched_entity *se;
> - long queued_delta, runnable_delta, idle_delta;
> - long rq_h_nr_queued = rq->cfs.h_nr_queued;
> -
> - se = cfs_rq->tg->se[cpu_of(rq)];
> + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
>
> cfs_rq->throttled = 0;
>
[..snip..]
--
Thanks and Regards,
Prateek