Message-ID: <3fdb7163-d1f0-45c8-89fa-7c904b567696@amd.com>
Date: Fri, 14 Mar 2025 09:23:47 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Aaron Lu <ziqianlu@...edance.com>, Valentin Schneider
	<vschneid@...hat.com>, Ben Segall <bsegall@...gle.com>, Peter Zijlstra
	<peterz@...radead.org>, Josh Don <joshdon@...gle.com>, Ingo Molnar
	<mingo@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>
CC: <linux-kernel@...r.kernel.org>, Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>, Chengming Zhou
	<chengming.zhou@...ux.dev>, Chuyi Zhou <zhouchuyi@...edance.com>
Subject: Re: [RFC PATCH 3/7] sched/fair: Handle unthrottle path for task based
 throttle

Hello Aaron,

On 3/13/2025 12:51 PM, Aaron Lu wrote:

[..snip..]

> ---
>   kernel/sched/fair.c | 132 +++++++++++++++-----------------------------
>   1 file changed, 45 insertions(+), 87 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ab403ff7d53c8..4a95fe3785e43 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5366,18 +5366,17 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
> 
>   	if (cfs_rq->nr_queued == 1) {
>   		check_enqueue_throttle(cfs_rq);
> -		if (!throttled_hierarchy(cfs_rq)) {
> -			list_add_leaf_cfs_rq(cfs_rq);
> -		} else {
> +		list_add_leaf_cfs_rq(cfs_rq);
>   #ifdef CONFIG_CFS_BANDWIDTH
> +		if (throttled_hierarchy(cfs_rq)) {
>   			struct rq *rq = rq_of(cfs_rq);
> 
>   			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
>   				cfs_rq->throttled_clock = rq_clock(rq);
>   			if (!cfs_rq->throttled_clock_self)
>   				cfs_rq->throttled_clock_self = rq_clock(rq);

These bits probably need revisiting. From what I understand, these
stats were maintained to record when a task was woken up on a
throttled hierarchy that was not connected to its parent, essentially
tracking the amount of time runnable tasks were waiting on the
cfs_rq before an unthrottle event allowed them to be picked.

With per-task throttle, these semantics no longer apply since a woken
task will run and dequeue itself when exiting to userspace.
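
To spell out the semantics I'm referring to, the old accounting
roughly condenses to the following (paraphrased from the hunks in
this series, warnings elided; not a literal copy of the code):

	/* throttle with tasks queued, or first enqueue onto a
	 * throttled, disconnected hierarchy: start the clock */
	if (!cfs_rq->throttled_clock_self)
		cfs_rq->throttled_clock_self = rq_clock(rq);

	/* unthrottle (tg_unthrottle_up()): account the wait, reset */
	delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
	cfs_rq->throttled_clock_self = 0;
	cfs_rq->throttled_clock_self_time += delta;

i.e. the window being measured runs from "runnable tasks start
waiting on the throttled cfs_rq" to "hierarchy is reconnected at
unthrottle".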

Josh, do you have any thoughts on this?

> -#endif
>   		}
> +#endif
>   	}
>   }
> 
> @@ -5525,8 +5524,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>   	if (flags & DEQUEUE_DELAYED)
>   		finish_delayed_dequeue_entity(se);
> 
> -	if (cfs_rq->nr_queued == 0)
> +	if (cfs_rq->nr_queued == 0) {
>   		update_idle_cfs_rq_clock_pelt(cfs_rq);
> +		if (throttled_hierarchy(cfs_rq))
> +			list_del_leaf_cfs_rq(cfs_rq);
> +	}
> 
>   	return true;
>   }
> @@ -5832,6 +5834,11 @@ static inline int throttled_lb_pair(struct task_group *tg,
>   	       throttled_hierarchy(dest_cfs_rq);
>   }
> 
> +static inline bool task_is_throttled(struct task_struct *p)
> +{
> +	return !list_empty(&p->throttle_node);
> +}
> +
>   static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
>   static void throttle_cfs_rq_work(struct callback_head *work)
>   {
> @@ -5885,32 +5892,45 @@ void init_cfs_throttle_work(struct task_struct *p)
>   	INIT_LIST_HEAD(&p->throttle_node);
>   }
> 
> +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
>   static int tg_unthrottle_up(struct task_group *tg, void *data)
>   {
>   	struct rq *rq = data;
>   	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
> +	struct task_struct *p, *tmp;
> 
>   	cfs_rq->throttle_count--;
> -	if (!cfs_rq->throttle_count) {
> -		cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> -					     cfs_rq->throttled_clock_pelt;
> +	if (cfs_rq->throttle_count)
> +		return 0;
> 
> -		/* Add cfs_rq with load or one or more already running entities to the list */
> -		if (!cfs_rq_is_decayed(cfs_rq))
> -			list_add_leaf_cfs_rq(cfs_rq);
> +	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> +		cfs_rq->throttled_clock_pelt;
> 
> -		if (cfs_rq->throttled_clock_self) {
> -			u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> +	if (cfs_rq->throttled_clock_self) {
> +		u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
> 
> -			cfs_rq->throttled_clock_self = 0;
> +		cfs_rq->throttled_clock_self = 0;
> 
> -			if (SCHED_WARN_ON((s64)delta < 0))
> -				delta = 0;
> +		if (SCHED_WARN_ON((s64)delta < 0))
> +			delta = 0;
> 
> -			cfs_rq->throttled_clock_self_time += delta;
> -		}
> +		cfs_rq->throttled_clock_self_time += delta;
>   	}
> 
> +	/* Re-enqueue the tasks that have been throttled at this level. */
> +	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
> +		list_del_init(&p->throttle_node);
> +		/*
> +		 * FIXME: p may not be allowed to run on this rq anymore
> +		 * due to affinity change while p is throttled.
> +		 */

Using dequeue_task_fair() for throttle should ensure that the core
now sees task_on_rq_queued(), which should make it go through a full
dequeue cycle that removes the task from the "throttled_limbo_list";
the subsequent enqueue should then put it back on the correct
runqueue.

Is the above comment inaccurate with your changes, or did I miss
something?
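
Put differently, the flow I have in mind for an affinity change
while p sits in the limbo list is roughly (a sketch of the call
sequence, assuming the throttle path hooks dequeue_task_fair() as
in this series; flags abbreviated):

	/* migration of a throttled-but-still-queued task */
	dequeue_task_fair(old_rq, p, flags);
		/* throttle path drops p from old_rq's limbo list */
		list_del_init(&p->throttle_node);
	set_task_cpu(p, new_cpu);
	enqueue_task_fair(new_rq, p, flags);
		/* p is queued (and re-throttled if needed) on new_rq */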

> +		enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
> +	}
> +
> +	/* Add cfs_rq with load or one or more already running entities to the list */
> +	if (!cfs_rq_is_decayed(cfs_rq))
> +		list_add_leaf_cfs_rq(cfs_rq);
> +
>   	return 0;
>   }
> 
> @@ -5947,12 +5967,16 @@ static int tg_throttle_down(struct task_group *tg, void *data)
> 
>   	/* group is entering throttled state, stop time */
>   	cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> -	list_del_leaf_cfs_rq(cfs_rq);
> 
>   	SCHED_WARN_ON(cfs_rq->throttled_clock_self);
>   	if (cfs_rq->nr_queued)
>   		cfs_rq->throttled_clock_self = rq_clock(rq);
> 
> +	if (!cfs_rq->nr_queued) {
> +		list_del_leaf_cfs_rq(cfs_rq);
> +		return 0;
> +	}
> +

This bit can perhaps go in Patch 2?

>   	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
>   	/*
>   	 * rq_lock is held, current is (obviously) executing this in kernelspace.
> @@ -6031,11 +6055,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>   {
>   	struct rq *rq = rq_of(cfs_rq);
>   	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
> -	struct sched_entity *se;
> -	long queued_delta, runnable_delta, idle_delta;
> -	long rq_h_nr_queued = rq->cfs.h_nr_queued;
> -
> -	se = cfs_rq->tg->se[cpu_of(rq)];
> +	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
> 
>   	cfs_rq->throttled = 0;
> 

[..snip..]

-- 
Thanks and Regards,
Prateek

