linux-kernel - Re: [PATCH 17/24] sched/fair: Implement delayed dequeue

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f1a2c2d2-6ecc-4cb0-b54d-952a4fb700da@arm.com>
Date: Mon, 19 Aug 2024 11:01:39 +0100
From: Luis Machado <luis.machado@....com>
To: Peter Zijlstra <peterz@...radead.org>, mingo@...hat.com,
 juri.lelli@...hat.com, vincent.guittot@...aro.org, dietmar.eggemann@....com,
 rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
 vschneid@...hat.com, linux-kernel@...r.kernel.org
Cc: kprateek.nayak@....com, wuyun.abel@...edance.com,
 youssefesmat@...omium.org, tglx@...utronix.de, efault@....de
Subject: Re: [PATCH 17/24] sched/fair: Implement delayed dequeue

Hi Peter,

On 7/27/24 11:27, Peter Zijlstra wrote:
> Extend / fix 86bfbb7ce4f6 ("sched/fair: Add lag based placement") by
> noting that lag is fundamentally a temporal measure. It should not be
> carried around indefinitely.
> 
> OTOH it should also not be instantly discarded, doing so will allow a
> task to game the system by purposefully (micro) sleeping at the end of
> its time quantum.
> 
> Since lag is intimately tied to the virtual time base, a wall-time
> based decay is also insufficient, notably competition is required for
> any of this to make sense.
> 
> Instead, delay the dequeue and keep the 'tasks' on the runqueue,
> competing until they are eligible.
> 
> Strictly speaking, we only care about keeping them until the 0-lag
> point, but that is a difficult proposition, instead carry them around
> until they get picked again, and dequeue them at that point.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
>  kernel/sched/deadline.c |    1 
>  kernel/sched/fair.c     |   82 ++++++++++++++++++++++++++++++++++++++++++------
>  kernel/sched/features.h |    9 +++++
>  3 files changed, 81 insertions(+), 11 deletions(-)
> 
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -2428,7 +2428,6 @@ static struct task_struct *__pick_next_t
>  		else
>  			p = dl_se->server_pick_next(dl_se);
>  		if (!p) {
> -			WARN_ON_ONCE(1);
>  			dl_se->dl_yielded = 1;
>  			update_curr_dl_se(rq, dl_se, 0);
>  			goto again;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5379,20 +5379,44 @@ static void clear_buddies(struct cfs_rq
>  
>  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
>  
> -static void
> +static bool
>  dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>  {
> -	int action = UPDATE_TG;
> +	if (flags & DEQUEUE_DELAYED) {
> +		/*
> +		 * DEQUEUE_DELAYED is typically called from pick_next_entity()
> +		 * at which point we've already done update_curr() and do not
> +		 * want to do so again.
> +		 */
> +		SCHED_WARN_ON(!se->sched_delayed);
> +		se->sched_delayed = 0;
> +	} else {
> +		bool sleep = flags & DEQUEUE_SLEEP;
> +
> +		/*
> +		 * DELAY_DEQUEUE relies on spurious wakeups, special task
> +		 * states must not suffer spurious wakeups, excempt them.
> +		 */
> +		if (flags & DEQUEUE_SPECIAL)
> +			sleep = false;
> +
> +		SCHED_WARN_ON(sleep && se->sched_delayed);
> +		update_curr(cfs_rq);
>  
> +		if (sched_feat(DELAY_DEQUEUE) && sleep &&
> +		    !entity_eligible(cfs_rq, se)) {
> +			if (cfs_rq->next == se)
> +				cfs_rq->next = NULL;
> +			se->sched_delayed = 1;
> +			return false;
> +		}
> +	}
> +
> +	int action = UPDATE_TG;
>  	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
>  		action |= DO_DETACH;
>  
>  	/*
> -	 * Update run-time statistics of the 'current'.
> -	 */
> -	update_curr(cfs_rq);
> -
> -	/*
>  	 * When dequeuing a sched_entity, we must:
>  	 *   - Update loads to have both entity and cfs_rq synced with now.
>  	 *   - For group_entity, update its runnable_weight to reflect the new
> @@ -5430,6 +5454,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st
>  
>  	if (cfs_rq->nr_running == 0)
>  		update_idle_cfs_rq_clock_pelt(cfs_rq);
> +
> +	return true;
>  }
>  
>  static void
> @@ -5828,11 +5854,21 @@ static bool throttle_cfs_rq(struct cfs_r
>  	idle_task_delta = cfs_rq->idle_h_nr_running;
>  	for_each_sched_entity(se) {
>  		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> +		int flags;
> +
>  		/* throttled entity or throttle-on-deactivate */
>  		if (!se->on_rq)
>  			goto done;
>  
> -		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
> +		/*
> +		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
> +		 * This avoids teaching dequeue_entities() about throttled
> +		 * entities and keeps things relatively simple.
> +		 */
> +		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
> +		if (se->sched_delayed)
> +			flags |= DEQUEUE_DELAYED;
> +		dequeue_entity(qcfs_rq, se, flags);
>  
>  		if (cfs_rq_is_idle(group_cfs_rq(se)))
>  			idle_task_delta = cfs_rq->h_nr_running;
> @@ -6918,6 +6954,7 @@ static int dequeue_entities(struct rq *r
>  	bool was_sched_idle = sched_idle_rq(rq);
>  	int rq_h_nr_running = rq->cfs.h_nr_running;
>  	bool task_sleep = flags & DEQUEUE_SLEEP;
> +	bool task_delayed = flags & DEQUEUE_DELAYED;
>  	struct task_struct *p = NULL;
>  	int idle_h_nr_running = 0;
>  	int h_nr_running = 0;
> @@ -6931,7 +6968,13 @@ static int dequeue_entities(struct rq *r
>  
>  	for_each_sched_entity(se) {
>  		cfs_rq = cfs_rq_of(se);
> -		dequeue_entity(cfs_rq, se, flags);
> +
> +		if (!dequeue_entity(cfs_rq, se, flags)) {
> +			if (p && &p->se == se)
> +				return -1;
> +
> +			break;
> +		}
>  
>  		cfs_rq->h_nr_running -= h_nr_running;
>  		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
> @@ -6956,6 +6999,7 @@ static int dequeue_entities(struct rq *r
>  			break;
>  		}
>  		flags |= DEQUEUE_SLEEP;
> +		flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
>  	}
>  
>  	for_each_sched_entity(se) {
> @@ -6985,6 +7029,17 @@ static int dequeue_entities(struct rq *r
>  	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
>  		rq->next_balance = jiffies;
>  
> +	if (p && task_delayed) {
> +		SCHED_WARN_ON(!task_sleep);
> +		SCHED_WARN_ON(p->on_rq != 1);
> +
> +		/* Fix-up what dequeue_task_fair() skipped */
> +		hrtick_update(rq);
> +
> +		/* Fix-up what block_task() skipped. */
> +		__block_task(rq, p);
> +	}
> +
>  	return 1;
>  }
>  /*
> @@ -6996,8 +7051,10 @@ static bool dequeue_task_fair(struct rq
>  {
>  	util_est_dequeue(&rq->cfs, p);
>  
> -	if (dequeue_entities(rq, &p->se, flags) < 0)
> +	if (dequeue_entities(rq, &p->se, flags) < 0) {
> +		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
>  		return false;
> +	}
>  
>  	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
>  	hrtick_update(rq);
> @@ -12973,6 +13030,11 @@ static void set_next_task_fair(struct rq
>  		/* ensure bandwidth has been allocated on our new cfs_rq */
>  		account_cfs_rq_runtime(cfs_rq, 0);
>  	}
> +
> +	if (!first)
> +		return;
> +
> +	SCHED_WARN_ON(se->sched_delayed);
>  }
>  
>  void init_cfs_rq(struct cfs_rq *cfs_rq)
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -29,6 +29,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
>  SCHED_FEAT(CACHE_HOT_BUDDY, true)
>  
>  /*
> + * Delay dequeueing tasks until they get selected or woken.
> + *
> + * By delaying the dequeue for non-eligible tasks, they remain in the
> + * competition and can burn off their negative lag. When they get selected
> + * they'll have positive lag by definition.
> + */
> +SCHED_FEAT(DELAY_DEQUEUE, true)
> +
> +/*
>   * Allow wakeup-time preemption of the current task:
>   */
>  SCHED_FEAT(WAKEUP_PREEMPTION, true)
> 
> 
> 

Just a heads-up I'm chasing some odd behavior on the big.little/pixel 6 platform, where
sometimes I see runs with spikes of higher frequencies for extended amounts of time (multiple
seconds), in particular for little cores, which leads to higher energy use.

I'm still trying to understand why that happens, but looks like the utilization values are
sometimes stuck at high values. I just want to make sure the delayed dequeue changes aren't
interfering with the util calculations.

Unfortunately the benchmark is Android-specific, so hard to provide a reasonable
reproducer for Linux.