lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <a6296e85-4140-4199-b260-dde3d8c4209a@amd.com>
Date: Thu, 23 Oct 2025 11:59:25 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Mel Gorman <mgorman@...hsingularity.net>, <linux-kernel@...r.kernel.org>
CC: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
	Juri Lelli <juri.lelli@...hat.com>, Dietmar Eggemann
	<dietmar.eggemann@....com>, Valentin Schneider <vschneid@...hat.com>, "Chris
 Mason" <clm@...a.com>
Subject: Re: [PATCH 2/2] sched/fair: Reimplement NEXT_BUDDY to align with
 EEVDF goals

Hello Mel,

On 10/21/2025 7:58 PM, Mel Gorman wrote:
> +static inline enum preempt_wakeup_action
> +__do_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
> +		 struct sched_entity *pse, struct sched_entity *se,
> +		 s64 delta)

"delta" is only used within __do_preempt_buddy(). Might as well compute
it here.

> +{
> +	bool pse_before;
> +
> +	/*
> +	 * Ignore wakee preemption on WF_WORK as it is less likely that
> +	 * there is shared data as exec often follow fork. Do not
> +	 * preempt for tasks that are sched_delayed as it would violate
> +	 * EEVDF to forcibly queue an ineligible task.
> +	 */
> +	if (!sched_feat(NEXT_BUDDY) ||
> +	    (wake_flags & WF_FORK) ||
> +	    (pse->sched_delayed)) {
> +		return PREEMPT_WAKEUP_NONE;
> +	}
> +
> +	/* Reschedule if waker is no longer eligible. */
> +	if (!entity_eligible(cfs_rq, se)) {
> +		resched_curr_lazy(rq);

This is unnecessary since PREEMPT_WAKEUP_RESCHED case already does a
"goto preempt" which does a resched_curr_lazy().

> +		return PREEMPT_WAKEUP_RESCHED;

Should we update the next buddy before returning here if
entity_before(pse, cfs_rq->next)?

> +	}
> +
> +	/*
> +	 * Keep existing buddy if the deadline is sooner than pse.
> +	 * The downside is that the older buddy may be cache cold
> +	 * but that is unpredictable where as an earlier deadline
> +	 * is absolute.
> +	 */
> +	if (cfs_rq->next && entity_before(cfs_rq->next, pse))
> +		return PREEMPT_WAKEUP_NONE;
> +
> +	set_next_buddy(pse);
> +
> +	/*
> +	 * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
> +	 * strictly enforced because the hint is either misunderstood or
> +	 * multiple tasks must be woken up.
> +	 */
> +	pse_before = entity_before(pse, se);
> +	if ((wake_flags & (WF_TTWU|WF_SYNC)) == (WF_TTWU|WF_SYNC)) {

Do we have !TTWU cases that set WF_SYNC?

> +		/*
> +		 * WF_RQ_SELECTED implies the tasks are stacking on a
> +		 * CPU when they could run on other CPUs. Only consider
> +		 * reschedule if pse deadline expires before se.
> +		 */

Code doesn't seem to match that comment. We are foregoing preemption
if the current task has run for less than "sysctl_sched_migration_cost"
even on pse_before.

Also, looking at the comment in check_preempt_wakeup_fair():

    If @p potentially is completing work required by current then
    consider preemption.

Shouldn't this check if "se" is indeed the waker task? Despite WF_SYNC,
wake_affine() can still return the CPU where @p was previously running
which now might be running another unrelated task.

> +		if ((wake_flags & WF_RQ_SELECTED) &&
> +		    delta < sysctl_sched_migration_cost && pse_before) {
> +			return PREEMPT_WAKEUP_NONE;
> +		}

The above check seems unnecessary since we return "PREEMPT_WAKEUP_NONE"
anyway if we don't enter the condition below. The two cannot have any
overlap based on my reading.

> +
> +		/*
> +		 * As WF_SYNC is not strictly obeyed, allow some runtime for
> +		 * batch wakeups to be issued.
> +		 */
> +		if (pse_before && delta >= sysctl_sched_migration_cost)
> +			return PREEMPT_WAKEUP_RESCHED;
> +
> +		return PREEMPT_WAKEUP_NONE;
> +	}
> +
> +	return PREEMPT_WAKEUP_NEXT;
> +}
> +
>  /*
>   * Used by other classes to account runtime.
>   */
> @@ -5512,16 +5598,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
>  {
>  	struct sched_entity *se;
>  
> -	/*
> -	 * Picking the ->next buddy will affect latency but not fairness.
> -	 */
> -	if (sched_feat(PICK_BUDDY) &&
> -	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
> -		/* ->next will never be delayed */
> -		WARN_ON_ONCE(cfs_rq->next->sched_delayed);
> -		return cfs_rq->next;
> -	}
> -
>  	se = pick_eevdf(cfs_rq);
>  	if (se->sched_delayed) {
>  		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> @@ -7028,8 +7104,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  	hrtick_update(rq);
>  }
>  
> -static void set_next_buddy(struct sched_entity *se);
> -
>  /*
>   * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
>   * failing half-way through and resume the dequeue later.
> @@ -8734,7 +8808,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	struct sched_entity *se = &donor->se, *pse = &p->se;
>  	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
>  	int cse_is_idle, pse_is_idle;
> -	bool do_preempt_short = false;
> +	enum preempt_wakeup_action do_preempt_short = PREEMPT_WAKEUP_NONE;
> +	s64 delta;
>  
>  	if (unlikely(se == pse))
>  		return;
> @@ -8748,10 +8823,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	if (task_is_throttled(p))
>  		return;
>  
> -	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
> -		set_next_buddy(pse);
> -	}
> -
>  	/*
>  	 * We can come here with TIF_NEED_RESCHED already set from new task
>  	 * wake up path.
> @@ -8783,7 +8854,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  		 * When non-idle entity preempt an idle entity,
>  		 * don't give idle entity slice protection.
>  		 */
> -		do_preempt_short = true;
> +		do_preempt_short = PREEMPT_WAKEUP_NEXT;
>  		goto preempt;
>  	}
>  
> @@ -8797,12 +8868,31 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  		return;
>  
>  	cfs_rq = cfs_rq_of(se);
> +	delta = rq_clock_task(rq) - se->exec_start;
>  	update_curr(cfs_rq);
>  	/*
>  	 * If @p has a shorter slice than current and @p is eligible, override
>  	 * current's slice protection in order to allow preemption.
>  	 */
> -	do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
> +	if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
> +		do_preempt_short = PREEMPT_WAKEUP_NEXT;
> +	} else {
> +		/*
> +		 * If @p potentially is completing work required by current then
> +		 * consider preemption.
> +		 */
> +		do_preempt_short = __do_preempt_buddy(rq, cfs_rq, wake_flags,
> +						      pse, se, delta);
> +	}
> +
> +	switch (do_preempt_short) {
> +	case PREEMPT_WAKEUP_NONE:
> +		goto update_slice;

You can just return from here since update_protect_slice() is only
done on "do_preempt_short != PREEMPT_WAKEUP_NONE". With that,
the "update_slice" label becomes unnecessary ...

> +	case PREEMPT_WAKEUP_RESCHED:
> +		goto preempt;
> +	case PREEMPT_WAKEUP_NEXT:
> +		break;
> +	}
>  
>  	/*
>  	 * If @p has become the most eligible task, force preemption.
> @@ -8810,7 +8900,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
>  		goto preempt;
>  
> -	if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
> +update_slice:
> +	if (sched_feat(RUN_TO_PARITY) && do_preempt_short != PREEMPT_WAKEUP_NONE)

... and since do_preempt_short can only be "PREEMPT_WAKEUP_NEXT"
which is non-zero, changes in this hunk can be dropped.

>  		update_protect_slice(cfs_rq, se);
>  
>  	return;

-- 
Thanks and Regards,
Prateek


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ