linux-kernel - Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250903145124.GM4067720@noisy.programming.kicks-ass.net>
Date: Wed, 3 Sep 2025 16:51:24 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Aaron Lu <ziqianlu@...edance.com>
Cc: Valentin Schneider <vschneid@...hat.com>,
	Ben Segall <bsegall@...gle.com>,
	K Prateek Nayak <kprateek.nayak@....com>,
	Chengming Zhou <chengming.zhou@...ux.dev>,
	Josh Don <joshdon@...gle.com>, Ingo Molnar <mingo@...hat.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Xi Wang <xii@...gle.com>, linux-kernel@...r.kernel.org,
	Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>, Mel Gorman <mgorman@...e.de>,
	Chuyi Zhou <zhouchuyi@...edance.com>,
	Jan Kiszka <jan.kiszka@...mens.com>,
	Florian Bezdeka <florian.bezdeka@...mens.com>,
	Songtang Liu <liusongtang@...edance.com>,
	Chen Yu <yu.c.chen@...el.com>,
	Matteo Martelli <matteo.martelli@...ethink.co.uk>,
	Michal Koutný <mkoutny@...e.com>,
	Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: Re: [PATCH v4 3/5] sched/fair: Switch to task based throttle model

On Fri, Aug 29, 2025 at 04:11:18PM +0800, Aaron Lu wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dab4ed86d0c82..25b1014d4ef86 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5291,18 +5291,23 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  
>  	if (cfs_rq->nr_queued == 1) {
>  		check_enqueue_throttle(cfs_rq);
> -		if (!throttled_hierarchy(cfs_rq)) {
> -			list_add_leaf_cfs_rq(cfs_rq);
> -		} else {
> +		list_add_leaf_cfs_rq(cfs_rq);
>  #ifdef CONFIG_CFS_BANDWIDTH
> +		if (throttled_hierarchy(cfs_rq)) {
>  			struct rq *rq = rq_of(cfs_rq);
>  
>  			if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
>  				cfs_rq->throttled_clock = rq_clock(rq);
>  			if (!cfs_rq->throttled_clock_self)
>  				cfs_rq->throttled_clock_self = rq_clock(rq);
> -#endif
> +
> +			if (cfs_rq->pelt_clock_throttled) {
> +				cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
> +					cfs_rq->throttled_clock_pelt;
> +				cfs_rq->pelt_clock_throttled = 0;
> +			}
>  		}
> +#endif
>  	}
>  }
>  

> @@ -5450,8 +5451,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  	if (flags & DEQUEUE_DELAYED)
>  		finish_delayed_dequeue_entity(se);
>  
> -	if (cfs_rq->nr_queued == 0)
> +	if (cfs_rq->nr_queued == 0) {
>  		update_idle_cfs_rq_clock_pelt(cfs_rq);
> +#ifdef CONFIG_CFS_BANDWIDTH
> +		if (throttled_hierarchy(cfs_rq)) {
> +			struct rq *rq = rq_of(cfs_rq);
> +
> +			list_del_leaf_cfs_rq(cfs_rq);
> +			cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
> +			cfs_rq->pelt_clock_throttled = 1;
> +		}
> +#endif
> +	}
>  
>  	return true;
>  }

> @@ -6717,6 +6731,8 @@ static inline void sync_throttle(struct task_group *tg, int cpu) {}
>  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
>  static void task_throttle_setup_work(struct task_struct *p) {}
>  static bool task_is_throttled(struct task_struct *p) { return false; }
> +static void dequeue_throttled_task(struct task_struct *p, int flags) {}
> +static bool enqueue_throttled_task(struct task_struct *p) { return false; }
>  
>  static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
>  {
> @@ -6909,6 +6925,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	int rq_h_nr_queued = rq->cfs.h_nr_queued;
>  	u64 slice = 0;
>  
> +	if (task_is_throttled(p) && enqueue_throttled_task(p))
> +		return;
> +
>  	/*
>  	 * The code below (indirectly) updates schedutil which looks at
>  	 * the cfs_rq utilization to select a frequency.
> @@ -6961,10 +6980,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = 1;
>  
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			goto enqueue_throttle;
> -
>  		flags = ENQUEUE_WAKEUP;
>  	}
>  
> @@ -6986,10 +7001,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = 1;
> -
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			goto enqueue_throttle;
>  	}
>  
>  	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
> @@ -7019,7 +7030,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	if (!task_new)
>  		check_update_overutilized_status(rq);
>  
> -enqueue_throttle:
>  	assert_list_leaf_cfs_rq(rq);
>  
>  	hrtick_update(rq);
> @@ -7074,10 +7084,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = h_nr_queued;
>  
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			return 0;
> -
>  		/* Don't dequeue parent if it has other entities besides us */
>  		if (cfs_rq->load.weight) {
>  			slice = cfs_rq_min_slice(cfs_rq);
> @@ -7114,10 +7120,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>  
>  		if (cfs_rq_is_idle(cfs_rq))
>  			h_nr_idle = h_nr_queued;
> -
> -		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> -			return 0;
>  	}
>  
>  	sub_nr_running(rq, h_nr_queued);
> @@ -7151,6 +7153,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>   */
>  static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  {
> +	if (task_is_throttled(p)) {
> +		dequeue_throttled_task(p, flags);
> +		return true;
> +	}
> +
>  	if (!p->se.sched_delayed)
>  		util_est_dequeue(&rq->cfs, p);
>  

OK, so this makes it so that a task is either fully enqueued (on all
cfs_rq's) or not at all. A group cfs_rq is only marked throttled when all
its tasks are gone, and unthrottled when a task gets added. Right?

But propagate_entity_cfs_rq() is still doing the old thing, and has an
'if (cfs_rq_throttled(cfs_rq)) break;' inside the for_each_sched_entity()
iteration.

This seems somewhat inconsistent; or am I missing something?