[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <a6296e85-4140-4199-b260-dde3d8c4209a@amd.com>
Date: Thu, 23 Oct 2025 11:59:25 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Mel Gorman <mgorman@...hsingularity.net>, <linux-kernel@...r.kernel.org>
CC: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>, Dietmar Eggemann
<dietmar.eggemann@....com>, Valentin Schneider <vschneid@...hat.com>, "Chris
Mason" <clm@...a.com>
Subject: Re: [PATCH 2/2] sched/fair: Reimplement NEXT_BUDDY to align with
EEVDF goals
Hello Mel,
On 10/21/2025 7:58 PM, Mel Gorman wrote:
> +static inline enum preempt_wakeup_action
> +__do_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
> + struct sched_entity *pse, struct sched_entity *se,
> + s64 delta)
"delta" is only used within __do_preempt_buddy(). Might as well compute
it here.
> +{
> + bool pse_before;
> +
> + /*
> + * Ignore wakee preemption on WF_WORK as it is less likely that
> + * there is shared data as exec often follow fork. Do not
> + * preempt for tasks that are sched_delayed as it would violate
> + * EEVDF to forcibly queue an ineligible task.
> + */
> + if (!sched_feat(NEXT_BUDDY) ||
> + (wake_flags & WF_FORK) ||
> + (pse->sched_delayed)) {
> + return PREEMPT_WAKEUP_NONE;
> + }
> +
> + /* Reschedule if waker is no longer eligible. */
> + if (!entity_eligible(cfs_rq, se)) {
> + resched_curr_lazy(rq);
This is unnecessary since PREEMPT_WAKEUP_RESCHED case already does a
"goto preempt" which does a resched_curr_lazy().
> + return PREEMPT_WAKEUP_RESCHED;
Should we update the next buddy before returning here if
entity_before(pse, cfs_rq->next)?
> + }
> +
> + /*
> + * Keep existing buddy if the deadline is sooner than pse.
> + * The downside is that the older buddy may be cache cold
> + * but that is unpredictable where as an earlier deadline
> + * is absolute.
> + */
> + if (cfs_rq->next && entity_before(cfs_rq->next, pse))
> + return PREEMPT_WAKEUP_NONE;
> +
> + set_next_buddy(pse);
> +
> + /*
> + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
> + * strictly enforced because the hint is either misunderstood or
> + * multiple tasks must be woken up.
> + */
> + pse_before = entity_before(pse, se);
> + if ((wake_flags & (WF_TTWU|WF_SYNC)) == (WF_TTWU|WF_SYNC)) {
Do we have !TTWU cases that set WF_SYNC?
> + /*
> + * WF_RQ_SELECTED implies the tasks are stacking on a
> + * CPU when they could run on other CPUs. Only consider
> + * reschedule if pse deadline expires before se.
> + */
Code doesn't seem to match that comment. We are foregoing preemption
if the current task has run for less than "sysctl_sched_migration_cost"
even on pse_before.
Also. looking at the comment in check_preempt_wakeup_fair():
If @p potentially is completing work required by current then
consider preemption.
Shouldn't this check if "se" is indeed the waker task? Despite WF_SYNC,
wake_affine() can still return the CPU where @p was previously running
which now might be running another unrelated task.
> + if ((wake_flags & WF_RQ_SELECTED) &&
> + delta < sysctl_sched_migration_cost && pse_before) {
> + return PREEMPT_WAKEUP_NONE;
> + }
Above checks seems unnecessary since we return "PREEMPT_WAKEUP_NONE" if
we don't enter the below condition. The two cannot have any overlap
based on my reading.
> +
> + /*
> + * As WF_SYNC is not strictly obeyed, allow some runtime for
> + * batch wakeups to be issued.
> + */
> + if (pse_before && delta >= sysctl_sched_migration_cost)
> + return PREEMPT_WAKEUP_RESCHED;
> +
> + return PREEMPT_WAKEUP_NONE;
> + }
> +
> + return PREEMPT_WAKEUP_NEXT;
> +}
> +
> /*
> * Used by other classes to account runtime.
> */
> @@ -5512,16 +5598,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
> {
> struct sched_entity *se;
>
> - /*
> - * Picking the ->next buddy will affect latency but not fairness.
> - */
> - if (sched_feat(PICK_BUDDY) &&
> - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
> - /* ->next will never be delayed */
> - WARN_ON_ONCE(cfs_rq->next->sched_delayed);
> - return cfs_rq->next;
> - }
> -
> se = pick_eevdf(cfs_rq);
> if (se->sched_delayed) {
> dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
> @@ -7028,8 +7104,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> hrtick_update(rq);
> }
>
> -static void set_next_buddy(struct sched_entity *se);
> -
> /*
> * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
> * failing half-way through and resume the dequeue later.
> @@ -8734,7 +8808,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> struct sched_entity *se = &donor->se, *pse = &p->se;
> struct cfs_rq *cfs_rq = task_cfs_rq(donor);
> int cse_is_idle, pse_is_idle;
> - bool do_preempt_short = false;
> + enum preempt_wakeup_action do_preempt_short = PREEMPT_WAKEUP_NONE;
> + s64 delta;
>
> if (unlikely(se == pse))
> return;
> @@ -8748,10 +8823,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> if (task_is_throttled(p))
> return;
>
> - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
> - set_next_buddy(pse);
> - }
> -
> /*
> * We can come here with TIF_NEED_RESCHED already set from new task
> * wake up path.
> @@ -8783,7 +8854,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> * When non-idle entity preempt an idle entity,
> * don't give idle entity slice protection.
> */
> - do_preempt_short = true;
> + do_preempt_short = PREEMPT_WAKEUP_NEXT;
> goto preempt;
> }
>
> @@ -8797,12 +8868,31 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> return;
>
> cfs_rq = cfs_rq_of(se);
> + delta = rq_clock_task(rq) - se->exec_start;
> update_curr(cfs_rq);
> /*
> * If @p has a shorter slice than current and @p is eligible, override
> * current's slice protection in order to allow preemption.
> */
> - do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
> + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
> + do_preempt_short = PREEMPT_WAKEUP_NEXT;
> + } else {
> + /*
> + * If @p potentially is completing work required by current then
> + * consider preemption.
> + */
> + do_preempt_short = __do_preempt_buddy(rq, cfs_rq, wake_flags,
> + pse, se, delta);
> + }
> +
> + switch (do_preempt_short) {
> + case PREEMPT_WAKEUP_NONE:
> + goto update_slice;
You can just return from here since update_protect_slice() is only
done on "do_preempt_short != PREEMPT_WAKEUP_NONE". With that,
the "update_slice" label becomes unnecessary ...
> + case PREEMPT_WAKEUP_RESCHED:
> + goto preempt;
> + case PREEMPT_WAKEUP_NEXT:
> + break;
> + }
>
> /*
> * If @p has become the most eligible task, force preemption.
> @@ -8810,7 +8900,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
> goto preempt;
>
> - if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
> +update_slice:
> + if (sched_feat(RUN_TO_PARITY) && do_preempt_short != PREEMPT_WAKEUP_NONE)
... and since do_preempt_short can only be "PREEMPT_WAKEUP_NEXT"
which is non-zero, changes in this hunk can be dropped.
> update_protect_slice(cfs_rq, se);
>
> return;
--
Thanks and Regards,
Prateek
Powered by blists - more mailing lists