linux-kernel - Re: [RFC PATCH 1/1] sched/eevdf: Skip eligibility check for current entity during wakeup preemption

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240424150721.GQ30852@noisy.programming.kicks-ass.net>
Date: Wed, 24 Apr 2024 17:07:21 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: K Prateek Nayak <kprateek.nayak@....com>
Cc: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
	Daniel Bristot de Oliveira <bristot@...hat.com>,
	Valentin Schneider <vschneid@...hat.com>,
	linux-kernel@...r.kernel.org,
	Tobias Huschle <huschle@...ux.ibm.com>,
	Luis Machado <luis.machado@....com>, Chen Yu <yu.c.chen@...el.com>,
	Abel Wu <wuyun.abel@...edance.com>,
	Tianchen Ding <dtcccc@...ux.alibaba.com>,
	Youssef Esmat <youssefesmat@...omium.org>,
	Xuewen Yan <xuewen.yan94@...il.com>,
	"Gautham R. Shenoy" <gautham.shenoy@....com>
Subject: Re: [RFC PATCH 1/1] sched/eevdf: Skip eligibility check for current
 entity during wakeup preemption

On Mon, Mar 25, 2024 at 11:32:26AM +0530, K Prateek Nayak wrote:
> With the curr entity's eligibility check, a wakeup preemption is very
> likely when an entity with positive lag joins the runqueue pushing the
> avg_vruntime of the runqueue backwards, making the vruntime of the
> current entity ineligible. This leads to aggressive wakeup preemption
> which was previously guarded by wakeup_granularity_ns in legacy CFS.
> Below figure depicts one such aggressive preemption scenario with EEVDF
> in DeathStarBench [1]:
> 
> 				deadline for Nginx
> 		   |
> 	+-------+  |			|
>     /-- | Nginx | -|------------------> |
>     |	+-------+  |			|
>     |		   |
>     |	-----------|-------------------------------> vruntime timeline
>     |		   \--> rq->avg_vruntime
>     |
>     | 	wakes service on the same runqueue since system is busy
>     |
>     |	+---------+|
>     \-->| Service || (service has +ve lag pushes avg_vruntime backwards)
> 	+---------+|
> 	  |	   |
>    wakeup |	+--|-----+			 |
>  preempts \---->| N|ginx | --------------------> | {deadline for Nginx}
> 		+--|-----+  			 |
> 	   (Nginx ineligible)
> 	-----------|-------------------------------> vruntime timeline
> 		   \--> rq->avg_vruntime

This graph is really hard to interpret. If you want to illustrate
avg_vruntime moves back, you should not align it. That's really
disorienting.

In both (upper and lower) nginx has the same vruntime thus *that* should
be aligned. The lower will have service placed left and with that
avg_vruntime also moves left, rendering nginx in-eligible.


> Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
> ---
>  kernel/sched/fair.c     | 24 ++++++++++++++++++++----
>  kernel/sched/features.h |  1 +
>  2 files changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6a16129f9a5c..a9b145a4eab0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -875,7 +875,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
>   *
>   * Which allows tree pruning through eligibility.
>   */
> -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
> +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool wakeup_preempt)
>  {
>  	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
>  	struct sched_entity *se = __pick_first_entity(cfs_rq);
> @@ -889,7 +889,23 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
>  	if (cfs_rq->nr_running == 1)
>  		return curr && curr->on_rq ? curr : se;
>  
> -	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
> +	if (curr && !curr->on_rq)
> +		curr = NULL;
> +
> +	/*
> +	 * When an entity with positive lag wakes up, it pushes the
> +	 * avg_vruntime of the runqueue backwards. This may causes the
> +	 * current entity to be ineligible soon into its run leading to
> +	 * wakeup preemption.
> +	 *
> +	 * To prevent such aggressive preemption of the current running
> +	 * entity during task wakeups, skip the eligibility check if the
> +	 * slice promised to the entity since its selection has not yet
> +	 * elapsed.
> +	 */
> +	if (curr &&
> +	    !(sched_feat(RUN_TO_PARITY_WAKEUP) && wakeup_preempt && curr->vlag == curr->deadline) &&
> +	    !entity_eligible(cfs_rq, curr))
>  		curr = NULL;
>  
>  	/*

So I see what you want to do, but this is highly unreadable.

I'll try something like the below on top of queue/sched/eevdf, but I
should probably first look at fixing those reported crashes on that tree
:/

---
 kernel/sched/fair.c     | 60 ++++++++++++++++++++++++++++++++++---------------
 kernel/sched/features.h | 11 +++++----
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a9206d8532f..23977ed1cb2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -855,6 +855,39 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 	return __node_2_se(left);
 }
 
+static inline bool pick_curr(struct cfs_rq *cfs_rq,
+			     struct sched_entity *curr, struct sched_entity *wakee)
+{
+	/*
+	 * Nothing to preserve...
+	 */
+	if (!curr || !sched_feat(RESPECT_SLICE))
+		return false;
+
+	/*
+	 * Allow preemption at the 0-lag point -- even if not all of the slice
+	 * is consumed. Note: placement of positive lag can push V left and render
+	 * @curr instantly ineligible irrespective the time on-cpu.
+	 */
+	if (sched_feat(RUN_TO_PARITY) && !entity_eligible(cfs_rq, curr))
+		return false;
+
+	/*
+	 * Don't preserve @curr when the @wakee has a shorter slice and earlier
+	 * deadline. IOW, explicitly allow preemption.
+	 */
+	if (sched_feat(PREEMPT_SHORT) && wakee &&
+	    wakee->slice < curr->slice &&
+	    (s64)(wakee->deadline - curr->deadline) < 0)
+		return false;
+
+	/*
+	 * Preserve @curr to allow it to finish its first slice.
+	 * See the HACK in set_next_entity().
+	 */
+	return curr->vlag == curr->deadline;
+}
+
 /*
  * Earliest Eligible Virtual Deadline First
  *
@@ -874,28 +907,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *
  * Which allows tree pruning through eligibility.
  */
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *wakee)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
 	struct sched_entity *best = NULL;
 
+	if (curr && !curr->on_rq)
+		curr = NULL;
+
 	/*
 	 * We can safely skip eligibility check if there is only one entity
 	 * in this cfs_rq, saving some cycles.
 	 */
 	if (cfs_rq->nr_running == 1)
-		return curr && curr->on_rq ? curr : se;
-
-	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
-		curr = NULL;
+		return curr ?: se;
 
 	/*
-	 * Once selected, run a task until it either becomes non-eligible or
-	 * until it gets a new slice. See the HACK in set_next_entity().
+	 * Preserve @curr to let it finish its slice.
 	 */
-	if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+	if (pick_curr(cfs_rq, curr, wakee))
 		return curr;
 
 	/* Pick the leftmost entity if it's eligible */
@@ -5507,7 +5539,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
 		return cfs_rq->next;
 	}
 
-	struct sched_entity *se = pick_eevdf(cfs_rq);
+	struct sched_entity *se = pick_eevdf(cfs_rq, NULL);
 	if (se->sched_delayed) {
 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
 		SCHED_WARN_ON(se->sched_delayed);
@@ -8548,15 +8580,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
 	cfs_rq = cfs_rq_of(se);
 	update_curr(cfs_rq);
 
-	if (sched_feat(PREEMPT_SHORT) && pse->slice < se->slice &&
-	    entity_eligible(cfs_rq, pse) &&
-	    (s64)(pse->deadline - se->deadline) < 0 &&
-	    se->vlag == se->deadline) {
-		/* negate RUN_TO_PARITY */
-		se->vlag = se->deadline - 1;
-	}
-
-	if (pick_eevdf(cfs_rq) == pse)
+	if (pick_eevdf(cfs_rq, pse) == pse)
 		goto preempt;
 
 	return;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 64ce99cf04ec..2285dc30294c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -10,12 +10,15 @@ SCHED_FEAT(PLACE_LAG, true)
  */
 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
 /*
- * Inhibit (wakeup) preemption until the current task has either matched the
- * 0-lag point or until is has exhausted it's slice.
+ * Inhibit (wakeup) preemption until the current task has exhausted its slice.
  */
-SCHED_FEAT(RUN_TO_PARITY, true)
+SCHED_FEAT(RESPECT_SLICE, true)
 /*
- * Allow tasks with a shorter slice to disregard RUN_TO_PARITY
+ * Relax RESPECT_SLICE to allow preemption once current has reached 0-lag.
+ */
+SCHED_FEAT(RUN_TO_PARITY, false)
+/*
+ * Allow tasks with a shorter slice to disregard RESPECT_SLICE
  */
 SCHED_FEAT(PREEMPT_SHORT, true)