Message-ID: <20231108095600.GM8262@noisy.programming.kicks-ass.net>
Date:   Wed, 8 Nov 2023 10:56:00 +0100
From:   Peter Zijlstra <peterz@...radead.org>
To:     Ankur Arora <ankur.a.arora@...cle.com>
Cc:     linux-kernel@...r.kernel.org, tglx@...utronix.de,
        torvalds@...ux-foundation.org, paulmck@...nel.org,
        linux-mm@...ck.org, x86@...nel.org, akpm@...ux-foundation.org,
        luto@...nel.org, bp@...en8.de, dave.hansen@...ux.intel.com,
        hpa@...or.com, mingo@...hat.com, juri.lelli@...hat.com,
        vincent.guittot@...aro.org, willy@...radead.org, mgorman@...e.de,
        jon.grimm@....com, bharata@....com, raghavendra.kt@....com,
        boris.ostrovsky@...cle.com, konrad.wilk@...cle.com,
        jgross@...e.com, andrew.cooper3@...rix.com, mingo@...nel.org,
        bristot@...nel.org, mathieu.desnoyers@...icios.com,
        geert@...ux-m68k.org, glaubitz@...sik.fu-berlin.de,
        anton.ivanov@...bridgegreys.com, mattst88@...il.com,
        krypton@...ich-teichert.org, rostedt@...dmis.org,
        David.Laight@...lab.com, richard@....at, mjguzik@...il.com
Subject: Re: [RFC PATCH 42/86] sched: force preemption on tick expiration

On Tue, Nov 07, 2023 at 01:57:28PM -0800, Ankur Arora wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4d86c618ffa2..fe7e5e9b2207 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1016,8 +1016,11 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
>   * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
>   * this is probably good enough.
>   */
> -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +static void update_deadline(struct cfs_rq *cfs_rq,
> +			    struct sched_entity *se, bool tick)
>  {
> +	struct rq *rq = rq_of(cfs_rq);
> +
>  	if ((s64)(se->vruntime - se->deadline) < 0)
>  		return;
>  
> @@ -1033,13 +1036,19 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  	 */
>  	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
>  
> +	if (cfs_rq->nr_running < 2)
> +		return;
> +
>  	/*
> -	 * The task has consumed its request, reschedule.
> +	 * The task has consumed its request, reschedule; eagerly
> +	 * if it ignored our last lazy reschedule.
>  	 */
> -	if (cfs_rq->nr_running > 1) {
> -		resched_curr(rq_of(cfs_rq));
> -		clear_buddies(cfs_rq, se);
> -	}
> +	if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
> +		__resched_curr(rq, RESCHED_eager);
> +	else
> +		resched_curr(rq);
> +
> +	clear_buddies(cfs_rq, se);
>  }
>  
>  #include "pelt.h"
> @@ -1147,7 +1156,7 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
>  /*
>   * Update the current task's runtime statistics.
>   */
> -static void update_curr(struct cfs_rq *cfs_rq)
> +static void __update_curr(struct cfs_rq *cfs_rq, bool tick)
>  {
>  	struct sched_entity *curr = cfs_rq->curr;
>  	u64 now = rq_clock_task(rq_of(cfs_rq));
> @@ -1174,7 +1183,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	schedstat_add(cfs_rq->exec_clock, delta_exec);
>  
>  	curr->vruntime += calc_delta_fair(delta_exec, curr);
> -	update_deadline(cfs_rq, curr);
> +	update_deadline(cfs_rq, curr, tick);
>  	update_min_vruntime(cfs_rq);
>  
>  	if (entity_is_task(curr)) {
> @@ -1188,6 +1197,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  	account_cfs_rq_runtime(cfs_rq, delta_exec);
>  }
>  
> +static void update_curr(struct cfs_rq *cfs_rq)
> +{
> +	__update_curr(cfs_rq, false);
> +}
> +
>  static void update_curr_fair(struct rq *rq)
>  {
>  	update_curr(cfs_rq_of(&rq->curr->se));
> @@ -5309,7 +5323,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
>  	/*
>  	 * Update run-time statistics of the 'current'.
>  	 */
> -	update_curr(cfs_rq);
> +	__update_curr(cfs_rq, true);
>  
>  	/*
>  	 * Ensure that runnable average is periodically updated.

I'm thinking this will be less of a mess if you flip it around some.

(ignore the hrtick mess, I'll try and get that cleaned up)

This way you have two distinct sites to handle the preemption: the
update_curr() one would be 'FULL ? force : lazy', while the tick one
gets the special magic bits.
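
For the update_curr() site I mean something like the below; just a
rough sketch on top of the restructuring further down, assuming
resched_curr() stays the lazy flavour in your series, and with
preempt_model_full() only standing in for whatever knob ends up
selecting full preemption:

static void update_curr(struct cfs_rq *cfs_rq)
{
	if (!__update_curr(cfs_rq))
		return;

	/* FULL ? force : lazy */
	if (preempt_model_full())
		__resched_curr(rq_of(cfs_rq), RESCHED_eager);
	else
		resched_curr(rq_of(cfs_rq));
}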

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df348aa55d3c..5399696de9e0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1016,10 +1016,10 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
  * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
  * this is probably good enough.
  */
-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if ((s64)(se->vruntime - se->deadline) < 0)
-		return;
+		return false;
 
 	/*
 	 * For EEVDF the virtual time slope is determined by w_i (iow.
@@ -1037,9 +1037,11 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * The task has consumed its request, reschedule.
 	 */
 	if (cfs_rq->nr_running > 1) {
-		resched_curr(rq_of(cfs_rq));
 		clear_buddies(cfs_rq, se);
+		return true;
 	}
+
+	return false;
 }
 
 #include "pelt.h"
@@ -1147,18 +1149,19 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq)
 /*
  * Update the current task's runtime statistics.
  */
-static void update_curr(struct cfs_rq *cfs_rq)
+static bool __update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
 	u64 now = rq_clock_task(rq_of(cfs_rq));
 	u64 delta_exec;
+	bool ret;
 
 	if (unlikely(!curr))
-		return;
+		return false;
 
 	delta_exec = now - curr->exec_start;
 	if (unlikely((s64)delta_exec <= 0))
-		return;
+		return false;
 
 	curr->exec_start = now;
 
@@ -1174,7 +1177,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	schedstat_add(cfs_rq->exec_clock, delta_exec);
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
-	update_deadline(cfs_rq, curr);
+	ret = update_deadline(cfs_rq, curr);
 	update_min_vruntime(cfs_rq);
 
 	if (entity_is_task(curr)) {
@@ -1186,6 +1189,14 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	}
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
+
+	return ret;
+}
+
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+	if (__update_curr(cfs_rq))
+		resched_curr(rq_of(cfs_rq));
 }
 
 static void update_curr_fair(struct rq *rq)
@@ -5309,7 +5320,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Update run-time statistics of the 'current'.
 	 */
-	update_curr(cfs_rq);
+	bool resched = __update_curr(cfs_rq);
 
 	/*
 	 * Ensure that runnable average is periodically updated.
@@ -5317,22 +5328,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	update_load_avg(cfs_rq, curr, UPDATE_TG);
 	update_cfs_group(curr);
 
-#ifdef CONFIG_SCHED_HRTICK
-	/*
-	 * queued ticks are scheduled to match the slice, so don't bother
-	 * validating it and just reschedule.
-	 */
-	if (queued) {
-		resched_curr(rq_of(cfs_rq));
-		return;
-	}
-	/*
-	 * don't let the period tick interfere with the hrtick preemption
-	 */
-	if (!sched_feat(DOUBLE_TICK) &&
-			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
-		return;
-#endif
+	return resched;
 }
 
 
@@ -12387,12 +12383,16 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
+	bool resched = false;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		entity_tick(cfs_rq, se, queued);
+		resched |= entity_tick(cfs_rq, se, queued);
 	}
 
+	if (resched)
+		resched_curr(rq);
+
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
 

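The tick site then gets the magic bits; again only a sketch, reusing
__resched_curr()/RESCHED_eager and TIF_NEED_RESCHED_LAZY from your
patch, and entity_tick() would obviously need to return bool for this
to work:

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;
	bool resched = false;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		resched |= entity_tick(cfs_rq, se, queued);
	}

	if (resched) {
		/*
		 * Reschedule eagerly if the previous lazy reschedule
		 * was ignored, otherwise stay lazy.
		 */
		if (test_tsk_thread_flag(curr, TIF_NEED_RESCHED_LAZY))
			__resched_curr(rq, RESCHED_eager);
		else
			resched_curr(rq);
	}

	/* numa balancing etc. unchanged from here on */

That keeps the lazy/eager policy at the two call sites instead of
burying it inside update_deadline().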