lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <22aju4edxl4hf7tihzl6672cg234eam5kcgazxbf2ga5thsmm6@l2wwkn2wth7r>
Date: Mon, 14 Jul 2025 23:56:50 +0100
From: Mel Gorman <mgorman@...hsingularity.net>
To: Peter Zijlstra <peterz@...radead.org>
Cc: mingo@...hat.com, juri.lelli@...hat.com, vincent.guittot@...aro.org, 
	dietmar.eggemann@....com, rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de, 
	vschneid@...hat.com, clm@...a.com, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 02/12] sched/deadline: Less agressive dl_server
 handling

On Wed, Jul 02, 2025 at 01:49:26PM +0200, Peter Zijlstra wrote:
> Chris reported that commit 5f6bd380c7bd ("sched/rt: Remove default
> bandwidth control") caused a significant dip in his favourite
> benchmark of the day. Simply disabling dl_server cured things.
> 

Unrelated to the patch but I've been doing a bit of archaeology recently
finding the motivation for various decisions and paragraphs like this
have been painful (most recent was figuring out why a decision was made
for 2.6.32). If the workload was described, can you add a Link: tag? If the
workload is proprietary, cannot be described or would be impractical to
independently create, then can that be stated here instead?

> His workload hammers the 0->1, 1->0 transitions, and the
> dl_server_{start,stop}() overhead kills it -- fairly obviously a bad
> idea in hind sight and all that.
> 

Obvious in hindsight but even then a brief explanation as to why it
triggered that particular corner case would be helpful. i.e. was throttling
the trigger or dequeue for a workload with very few (1?) tasks?

> Change things around to only disable the dl_server when there has not
> been a fair task around for a whole period. Since the default period
> is 1 second, this ensures the benchmark never trips this, overhead
> gone.
> 

I didn't dig into this too much but is that 1s fixed because it's related to
the dl server itself rather than any task using the deadline scheduler? The
use of "default" indicates it's tunable but at a glance, it's not clear
if sched_setattr can be used to reconfigure the dl_server or not. Even if
that is the expected case, it's not obvious (to me).

> Fixes: 557a6bfc662c ("sched/fair: Add trivial fair server")
> Reported-by: Chris Mason <clm@...a.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> Link: https://lkml.kernel.org/r/20250520101727.507378961@infradead.org

Patch looks ok but I'm not particularly familiar with the deadline
scheduler. Even so;

Acked-by: Mel Gorman <mgorman@...hsingularity.net>

One nit below

> ---
>  include/linux/sched.h   |    1 +
>  kernel/sched/deadline.c |   25 ++++++++++++++++++++++---
>  kernel/sched/fair.c     |    9 ---------
>  3 files changed, 23 insertions(+), 12 deletions(-)
> 
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -701,6 +701,7 @@ struct sched_dl_entity {
>  	unsigned int			dl_defer	  : 1;
>  	unsigned int			dl_defer_armed	  : 1;
>  	unsigned int			dl_defer_running  : 1;
> +	unsigned int			dl_server_idle    : 1;
>  
>  	/*
>  	 * Bandwidth enforcement timer. Each -deadline task has its
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1215,6 +1215,8 @@ static void __push_dl_task(struct rq *rq
>  /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
>  static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
>  
> +static bool dl_server_stopped(struct sched_dl_entity *dl_se);
> +
>  static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
>  {
>  	struct rq *rq = rq_of_dl_se(dl_se);
> @@ -1234,6 +1236,7 @@ static enum hrtimer_restart dl_server_ti
>  
>  		if (!dl_se->server_has_tasks(dl_se)) {
>  			replenish_dl_entity(dl_se);
> +			dl_server_stopped(dl_se);
>  			return HRTIMER_NORESTART;
>  		}
>  
> @@ -1639,8 +1642,10 @@ void dl_server_update_idle_time(struct r
>  void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
>  {
>  	/* 0 runtime = fair server disabled */
> -	if (dl_se->dl_runtime)
> +	if (dl_se->dl_runtime) {
> +		dl_se->dl_server_idle = 0;
>  		update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
> +	}
>  }
>  
>  void dl_server_start(struct sched_dl_entity *dl_se)
> @@ -1663,7 +1668,7 @@ void dl_server_start(struct sched_dl_ent
>  		setup_new_dl_entity(dl_se);
>  	}
>  
> -	if (!dl_se->dl_runtime)
> +	if (!dl_se->dl_runtime || dl_se->dl_server_active)
>  		return;
>  
>  	dl_se->dl_server_active = 1;
> @@ -1684,6 +1689,20 @@ void dl_server_stop(struct sched_dl_enti
>  	dl_se->dl_server_active = 0;
>  }
>  
> +static bool dl_server_stopped(struct sched_dl_entity *dl_se)
> +{
> +	if (!dl_se->dl_server_active)
> +		return false;
> +
> +	if (dl_se->dl_server_idle) {
> +		dl_server_stop(dl_se);
> +		return true;
> +	}
> +
> +	dl_se->dl_server_idle = 1;
> +	return false;
> +}
> +

The function name does not suggest there are side-effects. If I'm reading it
correctly, it's basically a 2-pass filter related to the sched_period. It
could do with a comment with bonus points mentioning the duration of the
2-pass filter depends on the sched_period of the dl_server.

>  void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
>  		    dl_server_has_tasks_f has_tasks,
>  		    dl_server_pick_f pick_task)
> @@ -2435,7 +2454,7 @@ static struct task_struct *__pick_task_d
>  	if (dl_server(dl_se)) {
>  		p = dl_se->server_pick_task(dl_se);
>  		if (!p) {
> -			if (dl_server_active(dl_se)) {
> +			if (!dl_server_stopped(dl_se)) {
>  				dl_se->dl_yielded = 1;
>  				update_curr_dl_se(rq, dl_se, 0);
>  			}
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5879,7 +5879,6 @@ static bool throttle_cfs_rq(struct cfs_r
>  	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
>  	struct sched_entity *se;
>  	long queued_delta, runnable_delta, idle_delta, dequeue = 1;
> -	long rq_h_nr_queued = rq->cfs.h_nr_queued;
>  
>  	raw_spin_lock(&cfs_b->lock);
>  	/* This will start the period timer if necessary */
> @@ -5963,10 +5962,6 @@ static bool throttle_cfs_rq(struct cfs_r
>  
>  	/* At this point se is NULL and we are at root level*/
>  	sub_nr_running(rq, queued_delta);
> -
> -	/* Stop the fair server if throttling resulted in no runnable tasks */
> -	if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
> -		dl_server_stop(&rq->fair_server);
>  done:
>  	/*
>  	 * Note: distribution will already see us throttled via the
> @@ -7060,7 +7055,6 @@ static void set_next_buddy(struct sched_
>  static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
>  {
>  	bool was_sched_idle = sched_idle_rq(rq);
> -	int rq_h_nr_queued = rq->cfs.h_nr_queued;
>  	bool task_sleep = flags & DEQUEUE_SLEEP;
>  	bool task_delayed = flags & DEQUEUE_DELAYED;
>  	struct task_struct *p = NULL;
> @@ -7144,9 +7138,6 @@ static int dequeue_entities(struct rq *r
>  
>  	sub_nr_running(rq, h_nr_queued);
>  
> -	if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
> -		dl_server_stop(&rq->fair_server);
> -
>  	/* balance early to pull high priority tasks */
>  	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
>  		rq->next_balance = jiffies;
> 
> 

-- 
Mel Gorman
SUSE Labs

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ