Message-ID: <20251103140711.GB3245006@noisy.programming.kicks-ass.net>
Date: Mon, 3 Nov 2025 15:07:11 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Mel Gorman <mgorman@...hsingularity.net>
Cc: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Valentin Schneider <vschneid@...hat.com>,
	Chris Mason <clm@...a.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] sched/fair: Reimplement NEXT_BUDDY to align with
 EEVDF goals

On Mon, Nov 03, 2025 at 11:04:45AM +0000, Mel Gorman wrote:

> @@ -8725,6 +8725,91 @@ static void set_next_buddy(struct sched_entity *se)
>  	}
>  }
>  
> +enum preempt_wakeup_action {
> +	PREEMPT_WAKEUP_NONE,		/* No action on the buddy */
> +	PREEMPT_WAKEUP_SHORT,		/* Override current's slice
> +					 * protection to allow
> +					 * preemption.
> +					 */
> +	PREEMPT_WAKEUP_NEXT,		/* Check next is most eligible
> +					 * before rescheduling.
> +					 */
> +	PREEMPT_WAKEUP_RESCHED,		/* Plain reschedule */
> +};

Not really a fan of that comment style. While noodling, I've managed to
'accidentally' rename NEXT to PICK, since it's more about letting
__pick_eevdf() decide.

> +static inline enum preempt_wakeup_action
> +__do_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
> +		 struct sched_entity *pse, struct sched_entity *se)
> +{
> +	bool pse_before;
> +
> +	/*
> +	 * Ignore wakee preemption on WF_FORK as it is less likely that
> +	 * there is shared data as exec often follows fork. Do not
> +	 * preempt for tasks that are sched_delayed as it would violate
> +	 * EEVDF to forcibly queue an ineligible task.
> +	 */
> +	if ((wake_flags & WF_FORK) || pse->sched_delayed)
> +		return PREEMPT_WAKEUP_NONE;
> +
> +	/* Reschedule if waker is no longer eligible. */
> +	if (in_task() && !entity_eligible(cfs_rq, se))
> +		return PREEMPT_WAKEUP_RESCHED;
> +
> +	/*
> +	 * Keep existing buddy if the deadline is sooner than pse.
> +	 * The older buddy may be cache cold and completely unrelated
> +	 * to the current wakeup but that is unpredictable whereas
> +	 * obeying the deadline is more in line with EEVDF objectives.
> +	 */
> +	if (cfs_rq->next && entity_before(cfs_rq->next, pse))
> +		return PREEMPT_WAKEUP_NEXT;
> +
> +	set_next_buddy(pse);
> +
> +	/*
> +	 * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
> +	 * strictly enforced because the hint is either misunderstood or
> +	 * multiple tasks must be woken up.
> +	 */
> +	pse_before = entity_before(pse, se);
> +	if (wake_flags & WF_SYNC) {
> +		u64 delta = rq_clock_task(rq) - se->exec_start;
> +		u64 threshold = sysctl_sched_migration_cost;
> +
> +		/*
> +		 * WF_SYNC without WF_TTWU is not expected so warn if it
> +		 * happens even though it is likely harmless.
> +		 */
> +		WARN_ON_ONCE(!(wake_flags & WF_TTWU));
> +
> +		if ((s64)delta < 0)
> +			delta = 0;
> +
> +		/*
> +		 * WF_RQ_SELECTED implies the tasks are stacking on a
> +		 * CPU when they could run on other CPUs. Reduce the
> +		 * threshold before preemption is allowed to an
> +		 * arbitrary lower value as it is more likely (but not
> +		 * guaranteed) the waker requires the wakee to finish.
> +		 */
> +		if (wake_flags & WF_RQ_SELECTED)
> +			threshold >>= 2;
> +
> +		/*
> +		 * As WF_SYNC is not strictly obeyed, allow some runtime for
> +		 * batch wakeups to be issued.
> +		 */
> +		if (pse_before && delta >= threshold)
> +			return PREEMPT_WAKEUP_RESCHED;
> +
> +		return PREEMPT_WAKEUP_NONE;
> +	}
> +
> +	return PREEMPT_WAKEUP_NEXT;
> +}

This seems to do 3 things:

 - the reschedule-waker-on-!eligible check
 - set the buddy (while losing NEXT_BUDDY)
 - the WF_SYNC thing

Let's split that out.
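
Roughly, using the helper names from the diff below (sketch only):

	/* 1) reschedule the waker when it is no longer eligible */
	if (in_task() && !entity_eligible(cfs_rq, se)) {
		preempt_action = PREEMPT_WAKEUP_RESCHED;
		goto preempt;
	}

	/* 2) buddy selection, back behind the feature flag */
	if (sched_feat(NEXT_BUDDY))
		set_preempt_buddy(rq, cfs_rq, wake_flags, pse, se);

	/* 3) the WF_SYNC heuristic as its own helper */
	if (wake_flags & WF_SYNC)
		preempt_action = preempt_sync(rq, wake_flags, pse, se);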

> @@ -8734,7 +8819,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	struct sched_entity *se = &donor->se, *pse = &p->se;
>  	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
>  	int cse_is_idle, pse_is_idle;
> -	bool do_preempt_short = false;
> +	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_NONE;

I'm thinking NONE is the wrong default; anything that doesn't explicitly
pick an action should fall through to __pick_eevdf(), so PICK (see below)
makes more sense.

>  
>  	if (unlikely(se == pse))
>  		return;
> @@ -8748,10 +8833,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	if (task_is_throttled(p))
>  		return;
>  
> -	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
> -		set_next_buddy(pse);
> -	}

This was the only NEXT_BUDDY site and none were returned in trade.

>  	/*
>  	 * We can come here with TIF_NEED_RESCHED already set from new task
>  	 * wake up path.
> @@ -8783,7 +8864,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  		 * When non-idle entity preempt an idle entity,
>  		 * don't give idle entity slice protection.
>  		 */
> -		do_preempt_short = true;
> +		preempt_action = PREEMPT_WAKEUP_SHORT;
>  		goto preempt;
>  	}
>  
> @@ -8802,21 +8883,41 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
>  	 * If @p has a shorter slice than current and @p is eligible, override
>  	 * current's slice protection in order to allow preemption.
>  	 */
> -	do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
> +	if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
> +		preempt_action = PREEMPT_WAKEUP_SHORT;
> +	} else {
> +		/*
> +		 * If @p potentially is completing work required by current then
> +		 * consider preemption.
> +		 */
> +		preempt_action = __do_preempt_buddy(rq, cfs_rq, wake_flags,
> +						      pse, se);
> +	}
> +
> +	switch (preempt_action) {
> +	case PREEMPT_WAKEUP_NONE:
> +		return;
> +	case PREEMPT_WAKEUP_RESCHED:
> +		goto preempt;
> +	case PREEMPT_WAKEUP_SHORT:
> +		fallthrough;
(this is redundant; adjacent case labels fall through anyway)
> +	case PREEMPT_WAKEUP_NEXT:
> +		break;
> +	}
>  
>  	/*
>  	 * If @p has become the most eligible task, force preemption.
>  	 */
> -	if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
> +	if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
>  		goto preempt;
>  
> -	if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
> +	if (sched_feat(RUN_TO_PARITY))
>  		update_protect_slice(cfs_rq, se);
>  
>  	return;
>  
>  preempt:
> -	if (do_preempt_short)
> +	if (preempt_action == PREEMPT_WAKEUP_SHORT)
>  		cancel_protect_slice(se);
>  
>  	resched_curr_lazy(rq);

Right, much better. But since I was noodling, how about something like
so on top?

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8727,23 +8727,16 @@ static void set_next_buddy(struct sched_
 }
 
 enum preempt_wakeup_action {
-	PREEMPT_WAKEUP_NONE,		/* No action on the buddy */
-	PREEMPT_WAKEUP_SHORT,		/* Override current's slice
-					 * protection to allow
-					 * preemption.
-					 */
-	PREEMPT_WAKEUP_NEXT,		/* Check next is most eligible
-					 * before rescheduling.
-					 */
-	PREEMPT_WAKEUP_RESCHED,		/* Plain reschedule */
+	PREEMPT_WAKEUP_NONE,	/* No preemption. */
+	PREEMPT_WAKEUP_SHORT,	/* Ignore slice protection. */
+	PREEMPT_WAKEUP_PICK,	/* Let __pick_eevdf() decide. */
+	PREEMPT_WAKEUP_RESCHED,	/* Force reschedule. */
 };
 
-static inline enum preempt_wakeup_action
-__do_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
-		 struct sched_entity *pse, struct sched_entity *se)
+static inline void
+set_preempt_buddy(struct rq *rq, struct cfs_rq *cfs_rq, int wake_flags,
+		  struct sched_entity *pse, struct sched_entity *se)
 {
-	bool pse_before;
-
 	/*
 	 * Ignore wakee preemption on WF_FORK as it is less likely that
 	 * there is shared data as exec often follows fork. Do not
@@ -8751,11 +8744,7 @@ __do_preempt_buddy(struct rq *rq, struct
 	 * EEVDF to forcibly queue an ineligible task.
 	 */
 	if ((wake_flags & WF_FORK) || pse->sched_delayed)
-		return PREEMPT_WAKEUP_NONE;
-
-	/* Reschedule if waker is no longer eligible. */
-	if (in_task() && !entity_eligible(cfs_rq, se))
-		return PREEMPT_WAKEUP_RESCHED;
+		return;
 
 	/*
 	 * Keep existing buddy if the deadline is sooner than pse.
@@ -8764,63 +8753,62 @@ __do_preempt_buddy(struct rq *rq, struct
 	 * obeying the deadline is more in line with EEVDF objectives.
 	 */
 	if (cfs_rq->next && entity_before(cfs_rq->next, pse))
-		return PREEMPT_WAKEUP_NEXT;
+		return;
 
 	set_next_buddy(pse);
+}
 
-	/*
-	 * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
-	 * strictly enforced because the hint is either misunderstood or
-	 * multiple tasks must be woken up.
-	 */
-	pse_before = entity_before(pse, se);
-	if (wake_flags & WF_SYNC) {
-		u64 delta = rq_clock_task(rq) - se->exec_start;
-		u64 threshold = sysctl_sched_migration_cost;
-
-		/*
-		 * WF_SYNC without WF_TTWU is not expected so warn if it
-		 * happens even though it is likely harmless.
-		 */
-		WARN_ON_ONCE(!(wake_flags & WF_TTWU));
+/*
+ * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
+ * strictly enforced because the hint is either misunderstood or
+ * multiple tasks must be woken up.
+ */
+static inline enum preempt_wakeup_action
+preempt_sync(struct rq *rq, int wake_flags,
+	     struct sched_entity *pse, struct sched_entity *se)
+{
+	u64 delta = rq_clock_task(rq) - se->exec_start;
+	u64 threshold = sysctl_sched_migration_cost;
 
-		if ((s64)delta < 0)
-			delta = 0;
+	/*
+	 * WF_SYNC without WF_TTWU is not expected so warn if it
+	 * happens even though it is likely harmless.
+	 */
+	WARN_ON_ONCE(!(wake_flags & WF_TTWU));
 
-		/*
-		 * WF_RQ_SELECTED implies the tasks are stacking on a
-		 * CPU when they could run on other CPUs. Reduce the
-		 * threshold before preemption is allowed to an
-		 * arbitrary lower value as it is more likely (but not
-		 * guaranteed) the waker requires the wakee to finish.
-		 */
-		if (wake_flags & WF_RQ_SELECTED)
-			threshold >>= 2;
+	if ((s64)delta < 0)
+		delta = 0;
 
-		/*
-		 * As WF_SYNC is not strictly obeyed, allow some runtime for
-		 * batch wakeups to be issued.
-		 */
-		if (pse_before && delta >= threshold)
-			return PREEMPT_WAKEUP_RESCHED;
+	/*
+	 * WF_RQ_SELECTED implies the tasks are stacking on a
+	 * CPU when they could run on other CPUs. Reduce the
+	 * threshold before preemption is allowed to an
+	 * arbitrary lower value as it is more likely (but not
+	 * guaranteed) the waker requires the wakee to finish.
+	 */
+	if (wake_flags & WF_RQ_SELECTED)
+		threshold >>= 2;
 
-		return PREEMPT_WAKEUP_NONE;
-	}
+	/*
+	 * As WF_SYNC is not strictly obeyed, allow some runtime for
+	 * batch wakeups to be issued.
+	 */
+	if (entity_before(pse, se) && delta >= threshold)
+		return PREEMPT_WAKEUP_RESCHED;
 
-	return PREEMPT_WAKEUP_NEXT;
+	return PREEMPT_WAKEUP_NONE;
 }
 
-
 /*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
+	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
 	struct task_struct *donor = rq->donor;
 	struct sched_entity *se = &donor->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
 	int cse_is_idle, pse_is_idle;
-	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_NONE;
 
 	if (unlikely(se == pse))
 		return;
@@ -8886,26 +8874,40 @@ static void check_preempt_wakeup_fair(st
 	 */
 	if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
 		preempt_action = PREEMPT_WAKEUP_SHORT;
-	} else {
-		/*
-		 * If @p potentially is completing work required by current then
-		 * consider preemption.
-		 */
-		preempt_action = __do_preempt_buddy(rq, cfs_rq, wake_flags,
-						      pse, se);
+		goto pick;
+
 	}
 
+	/*
+	 * If @p potentially is completing work required by current
+	 * then consider preemption.
+	 */
+	if (in_task() && !entity_eligible(cfs_rq, se)) {
+		/* Reschedule if waker is no longer eligible. */
+		preempt_action = PREEMPT_WAKEUP_RESCHED;
+		goto preempt;
+
+	} 
+
+	if (sched_feat(NEXT_BUDDY))
+		set_preempt_buddy(rq, cfs_rq, wake_flags, pse, se);
+
+	if (wake_flags & WF_SYNC)
+		preempt_action = preempt_sync(rq, wake_flags, pse, se);
+
 	switch (preempt_action) {
 	case PREEMPT_WAKEUP_NONE:
 		return;
+
 	case PREEMPT_WAKEUP_RESCHED:
 		goto preempt;
+
 	case PREEMPT_WAKEUP_SHORT:
-		fallthrough;
-	case PREEMPT_WAKEUP_NEXT:
-		break;
+	case PREEMPT_WAKEUP_PICK:
+		goto pick;
 	}
 
+pick:
 	/*
 	 * If @p has become the most eligible task, force preemption.
 	 */

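For completeness, the tail of check_preempt_wakeup_fair() with the above
applied would read something like this (reconstructed from the hunks,
completely untested):

	if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
		preempt_action = PREEMPT_WAKEUP_SHORT;
		goto pick;
	}

	/* Reschedule if waker is no longer eligible. */
	if (in_task() && !entity_eligible(cfs_rq, se)) {
		preempt_action = PREEMPT_WAKEUP_RESCHED;
		goto preempt;
	}

	if (sched_feat(NEXT_BUDDY))
		set_preempt_buddy(rq, cfs_rq, wake_flags, pse, se);

	if (wake_flags & WF_SYNC)
		preempt_action = preempt_sync(rq, wake_flags, pse, se);

	switch (preempt_action) {
	case PREEMPT_WAKEUP_NONE:
		return;

	case PREEMPT_WAKEUP_RESCHED:
		goto preempt;

	case PREEMPT_WAKEUP_SHORT:
	case PREEMPT_WAKEUP_PICK:
		goto pick;
	}

pick:
	/* If @p has become the most eligible task, force preemption. */
	if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
		goto preempt;

	if (sched_feat(RUN_TO_PARITY))
		update_protect_slice(cfs_rq, se);

	return;

preempt:
	if (preempt_action == PREEMPT_WAKEUP_SHORT)
		cancel_protect_slice(se);

	resched_curr_lazy(rq);
}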