Message-ID: <20251103140711.GB3245006@noisy.programming.kicks-ass.net>
Date: Mon, 3 Nov 2025 15:07:11 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Mel Gorman <mgorman@...hsingularity.net>
Cc: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Valentin Schneider <vschneid@...hat.com>,
Chris Mason <clm@...a.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] sched/fair: Reimplement NEXT_BUDDY to align with
EEVDF goals
On Mon, Nov 03, 2025 at 11:04:45AM +0000, Mel Gorman wrote:
> @@ -8725,6 +8725,91 @@ static void set_next_buddy(struct sched_entity *se)
> }
> }
>
> +enum preempt_wakeup_action {
> + PREEMPT_WAKEUP_NONE, /* No action on the buddy */
> + PREEMPT_WAKEUP_SHORT, /* Override current's slice
> + * protection to allow
> + * preemption.
> + */
> + PREEMPT_WAKEUP_NEXT, /* Check next is most eligible
> + * before rescheduling.
> + */
> + PREEMPT_WAKEUP_RESCHED, /* Plain reschedule */
> +};
Not really a fan of that comment style. While noodling, I've managed to
'accidentally' rename NEXT to PICK, since it's more about letting
__pick_eevdf() decide.
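That is, PICK just means we fall through to the

	if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
		goto preempt;

check and preempt only if @p actually wins the selection.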
> +static inline enum preempt_wakeup_action
> +__do_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
> + struct sched_entity *pse, struct sched_entity *se)
> +{
> + bool pse_before;
> +
> + /*
> + * Ignore wakee preemption on WF_FORK as it is less likely that
> +	 * there is shared data as exec often follows fork. Do not
> + * preempt for tasks that are sched_delayed as it would violate
> + * EEVDF to forcibly queue an ineligible task.
> + */
> + if ((wake_flags & WF_FORK) || pse->sched_delayed)
> + return PREEMPT_WAKEUP_NONE;
> +
> + /* Reschedule if waker is no longer eligible. */
> + if (in_task() && !entity_eligible(cfs_rq, se))
> + return PREEMPT_WAKEUP_RESCHED;
> +
> + /*
> + * Keep existing buddy if the deadline is sooner than pse.
> + * The older buddy may be cache cold and completely unrelated
> +	 * to the current wakeup but that is unpredictable whereas
> + * obeying the deadline is more in line with EEVDF objectives.
> + */
> + if (cfs_rq->next && entity_before(cfs_rq->next, pse))
> + return PREEMPT_WAKEUP_NEXT;
> +
> + set_next_buddy(pse);
> +
> + /*
> + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
> + * strictly enforced because the hint is either misunderstood or
> + * multiple tasks must be woken up.
> + */
> + pse_before = entity_before(pse, se);
> + if (wake_flags & WF_SYNC) {
> + u64 delta = rq_clock_task(rq) - se->exec_start;
> + u64 threshold = sysctl_sched_migration_cost;
> +
> + /*
> + * WF_SYNC without WF_TTWU is not expected so warn if it
> + * happens even though it is likely harmless.
> + */
> + WARN_ON_ONCE(!(wake_flags & WF_TTWU));
> +
> + if ((s64)delta < 0)
> + delta = 0;
> +
> + /*
> + * WF_RQ_SELECTED implies the tasks are stacking on a
> + * CPU when they could run on other CPUs. Reduce the
> + * threshold before preemption is allowed to an
> + * arbitrary lower value as it is more likely (but not
> + * guaranteed) the waker requires the wakee to finish.
> + */
> + if (wake_flags & WF_RQ_SELECTED)
> + threshold >>= 2;
> +
> + /*
> + * As WF_SYNC is not strictly obeyed, allow some runtime for
> + * batch wakeups to be issued.
> + */
> + if (pse_before && delta >= threshold)
> + return PREEMPT_WAKEUP_RESCHED;
> +
> + return PREEMPT_WAKEUP_NONE;
> + }
> +
> + return PREEMPT_WAKEUP_NEXT;
> +}
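(To put numbers on that cut-off -- assuming the default
sysctl_sched_migration_cost of 500000ns:

	u64 threshold = 500000;		/* 0.5ms */

	if (wake_flags & WF_RQ_SELECTED)
		threshold >>= 2;	/* 125000ns, 125us */

so a sync waker that has run for less than 0.5ms -- 125us when stacked
-- is not preempted on this path.)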
This seems to do 3 things:
- rescheduling the waker on !eligible
- setting the buddy (while losing the NEXT_BUDDY check)
- the WF_SYNC thing
Let's split that out.
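Roughly -- this is just a sketch, the full diff is below:

	if (in_task() && !entity_eligible(cfs_rq, se)) {
		/* Reschedule if waker is no longer eligible. */
		preempt_action = PREEMPT_WAKEUP_RESCHED;
		goto preempt;
	}

	if (sched_feat(NEXT_BUDDY))
		set_preempt_buddy(rq, cfs_rq, wake_flags, pse, se);

	if (wake_flags & WF_SYNC)
		preempt_action = preempt_sync(rq, wake_flags, pse, se);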
> @@ -8734,7 +8819,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> struct sched_entity *se = &donor->se, *pse = &p->se;
> struct cfs_rq *cfs_rq = task_cfs_rq(donor);
> int cse_is_idle, pse_is_idle;
> - bool do_preempt_short = false;
> + enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_NONE;
I'm thinking NONE is the wrong default; the common case should be
letting __pick_eevdf() decide.
>
> if (unlikely(se == pse))
> return;
> @@ -8748,10 +8833,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> if (task_is_throttled(p))
> return;
>
> - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
> - set_next_buddy(pse);
> - }
This was the only NEXT_BUDDY site and none were returned in trade --
the sched_feat() check is gone entirely. The below adds it back around
set_preempt_buddy().
> /*
> * We can come here with TIF_NEED_RESCHED already set from new task
> * wake up path.
> @@ -8783,7 +8864,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> * When non-idle entity preempt an idle entity,
> * don't give idle entity slice protection.
> */
> - do_preempt_short = true;
> + preempt_action = PREEMPT_WAKEUP_SHORT;
> goto preempt;
> }
>
> @@ -8802,21 +8883,41 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
> * If @p has a shorter slice than current and @p is eligible, override
> * current's slice protection in order to allow preemption.
> */
> - do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
> + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
> + preempt_action = PREEMPT_WAKEUP_SHORT;
> + } else {
> + /*
> + * If @p potentially is completing work required by current then
> + * consider preemption.
> + */
> + preempt_action = __do_preempt_buddy(rq, cfs_rq, wake_flags,
> + pse, se);
> + }
> +
> + switch (preempt_action) {
> + case PREEMPT_WAKEUP_NONE:
> + return;
> + case PREEMPT_WAKEUP_RESCHED:
> + goto preempt;
> + case PREEMPT_WAKEUP_SHORT:
> + fallthrough;
(this fallthrough is redundant; it's only needed before statements,
not between adjacent case labels)
> + case PREEMPT_WAKEUP_NEXT:
> + break;
> + }
>
> /*
> * If @p has become the most eligible task, force preemption.
> */
> - if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
> + if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
> goto preempt;
>
> - if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
> + if (sched_feat(RUN_TO_PARITY))
> update_protect_slice(cfs_rq, se);
>
> return;
>
> preempt:
> - if (do_preempt_short)
> + if (preempt_action == PREEMPT_WAKEUP_SHORT)
> cancel_protect_slice(se);
>
> resched_curr_lazy(rq);
Right, much better. But since I was noodling, how about something like
so on top?
---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8727,23 +8727,16 @@ static void set_next_buddy(struct sched_
}
enum preempt_wakeup_action {
- PREEMPT_WAKEUP_NONE, /* No action on the buddy */
- PREEMPT_WAKEUP_SHORT, /* Override current's slice
- * protection to allow
- * preemption.
- */
- PREEMPT_WAKEUP_NEXT, /* Check next is most eligible
- * before rescheduling.
- */
- PREEMPT_WAKEUP_RESCHED, /* Plain reschedule */
+ PREEMPT_WAKEUP_NONE, /* No preemption. */
+ PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */
+ PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */
+ PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */
};
-static inline enum preempt_wakeup_action
-__do_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
- struct sched_entity *pse, struct sched_entity *se)
+static inline void
+set_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
{
- bool pse_before;
-
/*
* Ignore wakee preemption on WF_FORK as it is less likely that
	 * there is shared data as exec often follows fork. Do not
@@ -8751,11 +8744,7 @@ __do_preempt_buddy(struct rq *rq, struct
* EEVDF to forcibly queue an ineligible task.
*/
if ((wake_flags & WF_FORK) || pse->sched_delayed)
- return PREEMPT_WAKEUP_NONE;
-
- /* Reschedule if waker is no longer eligible. */
- if (in_task() && !entity_eligible(cfs_rq, se))
- return PREEMPT_WAKEUP_RESCHED;
+ return;
/*
* Keep existing buddy if the deadline is sooner than pse.
@@ -8764,63 +8753,62 @@ __do_preempt_buddy(struct rq *rq, struct
* obeying the deadline is more in line with EEVDF objectives.
*/
if (cfs_rq->next && entity_before(cfs_rq->next, pse))
- return PREEMPT_WAKEUP_NEXT;
+ return;
set_next_buddy(pse);
+}
- /*
- * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
- * strictly enforced because the hint is either misunderstood or
- * multiple tasks must be woken up.
- */
- pse_before = entity_before(pse, se);
- if (wake_flags & WF_SYNC) {
- u64 delta = rq_clock_task(rq) - se->exec_start;
- u64 threshold = sysctl_sched_migration_cost;
-
- /*
- * WF_SYNC without WF_TTWU is not expected so warn if it
- * happens even though it is likely harmless.
- */
- WARN_ON_ONCE(!(wake_flags & WF_TTWU));
+/*
+ * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
+ * strictly enforced because the hint is either misunderstood or
+ * multiple tasks must be woken up.
+ */
+static inline enum preempt_wakeup_action
+preempt_sync(struct rq *rq, int wake_flags,
+ struct sched_entity *pse, struct sched_entity *se)
+{
+ u64 delta = rq_clock_task(rq) - se->exec_start;
+ u64 threshold = sysctl_sched_migration_cost;
- if ((s64)delta < 0)
- delta = 0;
+ /*
+ * WF_SYNC without WF_TTWU is not expected so warn if it
+ * happens even though it is likely harmless.
+ */
+ WARN_ON_ONCE(!(wake_flags & WF_TTWU));
- /*
- * WF_RQ_SELECTED implies the tasks are stacking on a
- * CPU when they could run on other CPUs. Reduce the
- * threshold before preemption is allowed to an
- * arbitrary lower value as it is more likely (but not
- * guaranteed) the waker requires the wakee to finish.
- */
- if (wake_flags & WF_RQ_SELECTED)
- threshold >>= 2;
+ if ((s64)delta < 0)
+ delta = 0;
- /*
- * As WF_SYNC is not strictly obeyed, allow some runtime for
- * batch wakeups to be issued.
- */
- if (pse_before && delta >= threshold)
- return PREEMPT_WAKEUP_RESCHED;
+ /*
+ * WF_RQ_SELECTED implies the tasks are stacking on a
+ * CPU when they could run on other CPUs. Reduce the
+ * threshold before preemption is allowed to an
+ * arbitrary lower value as it is more likely (but not
+ * guaranteed) the waker requires the wakee to finish.
+ */
+ if (wake_flags & WF_RQ_SELECTED)
+ threshold >>= 2;
- return PREEMPT_WAKEUP_NONE;
- }
+ /*
+ * As WF_SYNC is not strictly obeyed, allow some runtime for
+ * batch wakeups to be issued.
+ */
+ if (entity_before(pse, se) && delta >= threshold)
+ return PREEMPT_WAKEUP_RESCHED;
- return PREEMPT_WAKEUP_NEXT;
+ return PREEMPT_WAKEUP_NONE;
}
-
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
struct task_struct *donor = rq->donor;
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
- enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_NONE;
if (unlikely(se == pse))
return;
@@ -8886,26 +8874,40 @@ static void check_preempt_wakeup_fair(st
*/
if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
preempt_action = PREEMPT_WAKEUP_SHORT;
- } else {
- /*
- * If @p potentially is completing work required by current then
- * consider preemption.
- */
- preempt_action = __do_preempt_buddy(rq, cfs_rq, wake_flags,
- pse, se);
+ goto pick;
}
+ /*
+ * If @p potentially is completing work required by current
+ * then consider preemption.
+ */
+ if (in_task() && !entity_eligible(cfs_rq, se)) {
+ /* Reschedule if waker is no longer eligible. */
+ preempt_action = PREEMPT_WAKEUP_RESCHED;
+ goto preempt;
+ }
+
+ if (sched_feat(NEXT_BUDDY))
+ set_preempt_buddy(rq, cfs_rq, wake_flags, pse, se);
+
+ if (wake_flags & WF_SYNC)
+ preempt_action = preempt_sync(rq, wake_flags, pse, se);
+
switch (preempt_action) {
case PREEMPT_WAKEUP_NONE:
return;
+
case PREEMPT_WAKEUP_RESCHED:
goto preempt;
+
case PREEMPT_WAKEUP_SHORT:
- fallthrough;
- case PREEMPT_WAKEUP_NEXT:
- break;
+ case PREEMPT_WAKEUP_PICK:
+ goto pick;
}
+pick:
/*
* If @p has become the most eligible task, force preemption.
*/