Message-Id: <20231107215742.363031-45-ankur.a.arora@oracle.com>
Date:   Tue,  7 Nov 2023 13:57:30 -0800
From:   Ankur Arora <ankur.a.arora@...cle.com>
To:     linux-kernel@...r.kernel.org
Cc:     tglx@...utronix.de, peterz@...radead.org,
        torvalds@...ux-foundation.org, paulmck@...nel.org,
        linux-mm@...ck.org, x86@...nel.org, akpm@...ux-foundation.org,
        luto@...nel.org, bp@...en8.de, dave.hansen@...ux.intel.com,
        hpa@...or.com, mingo@...hat.com, juri.lelli@...hat.com,
        vincent.guittot@...aro.org, willy@...radead.org, mgorman@...e.de,
        jon.grimm@....com, bharata@....com, raghavendra.kt@....com,
        boris.ostrovsky@...cle.com, konrad.wilk@...cle.com,
        jgross@...e.com, andrew.cooper3@...rix.com, mingo@...nel.org,
        bristot@...nel.org, mathieu.desnoyers@...icios.com,
        geert@...ux-m68k.org, glaubitz@...sik.fu-berlin.de,
        anton.ivanov@...bridgegreys.com, mattst88@...il.com,
        krypton@...ich-teichert.org, rostedt@...dmis.org,
        David.Laight@...LAB.COM, richard@....at, mjguzik@...il.com,
        Ankur Arora <ankur.a.arora@...cle.com>
Subject: [RFC PATCH 44/86] sched: voluntary preemption

The no-preemption model allows tasks to run to completion in kernel
context. For the voluntary preemption model, additionally allow
preemption by higher scheduling classes.

To do this, resched_curr() now takes a parameter specifying whether
the resched is on behalf of a scheduling class above that of the
runqueue's current task, and reschedules eagerly if so.

Also define the scheduler feature PREEMPT_PRIORITY, which can be used
to toggle the voluntary preemption model at runtime.

TODO: Both RT and deadline work, but I'm almost certainly not doing
all the right things for either.

Signed-off-by: Ankur Arora <ankur.a.arora@...cle.com>
---
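For reference, the gist of the resched_curr() change, distilled from
the kernel/sched/core.c hunk below (resched_t, RESCHED_lazy,
RESCHED_eager and __resched_curr() already exist at this point in the
series). Sketch only; the real function keeps all of its existing
context checks:

	void resched_curr(struct rq *rq, bool above)
	{
		resched_t rs = RESCHED_lazy;

		/* ... existing checks that may already pick RESCHED_eager ... */

		/*
		 * Under the voluntary model, a resched requested on behalf
		 * of a scheduling class above rq->curr is honoured eagerly;
		 * everything else stays lazy and runs to completion.
		 */
		if (sched_feat(PREEMPT_PRIORITY) && above)
			rs = RESCHED_eager;

		__resched_curr(rq, rs);
	}

Callers pass above == true where the resched is (or may be) on behalf
of a higher scheduling class (e.g. check_preempt_curr(), resched_cpu(),
the idle class) and false otherwise. Like the other entries in
kernel/sched/features.h, PREEMPT_PRIORITY should be toggleable at
runtime via the sched features debugfs knob (with CONFIG_SCHED_DEBUG).
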
 kernel/Kconfig.preempt    | 19 ++++++-------------
 kernel/sched/core.c       | 28 +++++++++++++++++-----------
 kernel/sched/core_sched.c |  2 +-
 kernel/sched/deadline.c   | 22 +++++++++++-----------
 kernel/sched/fair.c       | 18 +++++++++---------
 kernel/sched/features.h   |  5 +++++
 kernel/sched/idle.c       |  2 +-
 kernel/sched/rt.c         | 26 +++++++++++++-------------
 kernel/sched/sched.h      |  2 +-
 9 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 074fe5e253b5..e16114b679e3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -20,23 +20,16 @@ config PREEMPT_NONE
 	  at runtime.
 
 config PREEMPT_VOLUNTARY
-	bool "Voluntary Kernel Preemption (Desktop)"
+	bool "Voluntary Kernel Preemption"
 	depends on !ARCH_NO_PREEMPT
 	select PREEMPTION
 	help
-	  This option reduces the latency of the kernel by adding more
-	  "explicit preemption points" to the kernel code. These new
-	  preemption points have been selected to reduce the maximum
-	  latency of rescheduling, providing faster application reactions,
-	  at the cost of slightly lower throughput.
+	  This option reduces the latency of the kernel by allowing
+	  processes in higher scheduling policy classes to preempt
+	  ones lower down.
 
-	  This allows reaction to interactive events by allowing a
-	  low priority process to voluntarily preempt itself even if it
-	  is in kernel mode executing a system call. This allows
-	  applications to run more 'smoothly' even when the system is
-	  under load.
-
-	  Select this if you are building a kernel for a desktop system.
+	  Higher priority processes do not preempt lower priority ones
+	  in the same scheduling policy class.
 
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a50a64255c6..3fa78e8afb7d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -256,7 +256,7 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
 	    rq->core->core_forceidle_count && rq->curr == rq->idle)
-		resched_curr(rq);
+		resched_curr(rq, false);
 }
 
 static int sched_task_is_throttled(struct task_struct *p, int cpu)
@@ -1074,9 +1074,12 @@ void __resched_curr(struct rq *rq, resched_t rs)
  *
  *  - in userspace: run to completion semantics are only for kernel tasks
  *
- * Otherwise (regardless of priority), run to completion.
+ *  - running under voluntary preemption (sched_feat(PREEMPT_PRIORITY))
+ *    and a task from a sched_class above wants the CPU
+ *
+ * Otherwise, run to completion.
  */
-void resched_curr(struct rq *rq)
+void resched_curr(struct rq *rq, bool above)
 {
 	resched_t rs = RESCHED_lazy;
 	int context;
@@ -1112,6 +1115,9 @@ void resched_curr(struct rq *rq)
 		goto resched;
 	}
 
+	if (sched_feat(PREEMPT_PRIORITY) && above)
+		rs = RESCHED_eager;
+
 resched:
 	__resched_curr(rq, rs);
 }
@@ -1123,7 +1129,7 @@ void resched_cpu(int cpu)
 
 	raw_spin_rq_lock_irqsave(rq, flags);
 	if (cpu_online(cpu) || cpu == smp_processor_id())
-		resched_curr(rq);
+		resched_curr(rq, true);
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 }
 
@@ -2277,7 +2283,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	if (p->sched_class == rq->curr->sched_class)
 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 	else if (sched_class_above(p->sched_class, rq->curr->sched_class))
-		resched_curr(rq);
+		resched_curr(rq, true);
 
 	/*
 	 * A queue event has occurred, and we're going to schedule.  In
@@ -2764,7 +2770,7 @@ int push_cpu_stop(void *arg)
 		deactivate_task(rq, p, 0);
 		set_task_cpu(p, lowest_rq->cpu);
 		activate_task(lowest_rq, p, 0);
-		resched_curr(lowest_rq);
+		resched_curr(lowest_rq, true);
 	}
 
 	double_unlock_balance(rq, lowest_rq);
@@ -3999,7 +4005,7 @@ void wake_up_if_idle(int cpu)
 	if (is_idle_task(rcu_dereference(rq->curr))) {
 		guard(rq_lock_irqsave)(rq);
 		if (is_idle_task(rq->curr))
-			resched_curr(rq);
+			resched_curr(rq, true);
 	}
 }
 
@@ -6333,7 +6339,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 			continue;
 		}
 
-		resched_curr(rq_i);
+		resched_curr(rq_i, false);
 	}
 
 out_set_next:
@@ -6388,7 +6394,7 @@ static bool try_steal_cookie(int this, int that)
 		set_task_cpu(p, this);
 		activate_task(dst, p, 0);
 
-		resched_curr(dst);
+		resched_curr(dst, false);
 
 		success = true;
 		break;
@@ -8743,7 +8749,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)
 		 * fairness.
 		 */
 		if (preempt && rq != p_rq)
-			resched_curr(p_rq);
+			resched_curr(p_rq, true);
 	}
 
 out_unlock:
@@ -10300,7 +10306,7 @@ void sched_move_task(struct task_struct *tsk)
 		 * throttled one but it's still the running task. Trigger a
 		 * resched to make sure that task can still run.
 		 */
-		resched_curr(rq);
+		resched_curr(rq, true);
 	}
 
 unlock:
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index a57fd8f27498..32f234f2a210 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -89,7 +89,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
 	 * next scheduling edge, rather than always forcing a reschedule here.
 	 */
 	if (task_on_cpu(rq, p))
-		resched_curr(rq);
+		resched_curr(rq, false);
 
 	task_rq_unlock(rq, p, &rf);
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e6815c3bd2f0..ecb47b5e9588 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1177,7 +1177,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	if (dl_task(rq->curr))
 		check_preempt_curr_dl(rq, p, 0);
 	else
-		resched_curr(rq);
+		resched_curr(rq, false);
 
 #ifdef CONFIG_SMP
 	/*
@@ -1367,7 +1367,7 @@ static void update_curr_dl(struct rq *rq)
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
-			resched_curr(rq);
+			resched_curr(rq, false);
 	}
 
 	/*
@@ -1914,7 +1914,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	    cpudl_find(&rq->rd->cpudl, p, NULL))
 		return;
 
-	resched_curr(rq);
+	resched_curr(rq, false);
 }
 
 static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
@@ -1943,7 +1943,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 				  int flags)
 {
 	if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
-		resched_curr(rq);
+		resched_curr(rq, false);
 		return;
 	}
 
@@ -2307,7 +2307,7 @@ static int push_dl_task(struct rq *rq)
 	if (dl_task(rq->curr) &&
 	    dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
 	    rq->curr->nr_cpus_allowed > 1) {
-		resched_curr(rq);
+		resched_curr(rq, false);
 		return 0;
 	}
 
@@ -2353,7 +2353,7 @@ static int push_dl_task(struct rq *rq)
 	activate_task(later_rq, next_task, 0);
 	ret = 1;
 
-	resched_curr(later_rq);
+	resched_curr(later_rq, false);
 
 	double_unlock_balance(rq, later_rq);
 
@@ -2457,7 +2457,7 @@ static void pull_dl_task(struct rq *this_rq)
 	}
 
 	if (resched)
-		resched_curr(this_rq);
+		resched_curr(this_rq, false);
 }
 
 /*
@@ -2654,7 +2654,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 		if (dl_task(rq->curr))
 			check_preempt_curr_dl(rq, p, 0);
 		else
-			resched_curr(rq);
+			resched_curr(rq, false);
 	} else {
 		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 	}
@@ -2687,7 +2687,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 		 * runqueue.
 		 */
 		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
-			resched_curr(rq);
+			resched_curr(rq, false);
 	} else {
 		/*
 		 * Current may not be deadline in case p was throttled but we
@@ -2697,14 +2697,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 		 */
 		if (!dl_task(rq->curr) ||
 		    dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
-			resched_curr(rq);
+			resched_curr(rq, false);
 	}
 #else
 	/*
 	 * We don't know if p has a earlier or later deadline, so let's blindly
 	 * set a (maybe not needed) rescheduling point.
 	 */
-	resched_curr(rq);
+	resched_curr(rq, false);
 #endif
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe7e5e9b2207..448fe36e7bbb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1046,7 +1046,7 @@ static void update_deadline(struct cfs_rq *cfs_rq,
 	if (tick && test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
 		__resched_curr(rq, RESCHED_eager);
 	else
-		resched_curr(rq);
+		resched_curr(rq, false);
 
 	clear_buddies(cfs_rq, se);
 }
@@ -5337,7 +5337,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr(rq_of(cfs_rq), false);
 		return;
 	}
 	/*
@@ -5483,7 +5483,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	 * hierarchy can be throttled
 	 */
 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_curr(rq_of(cfs_rq));
+		resched_curr(rq_of(cfs_rq), false);
 }
 
 static __always_inline
@@ -5743,7 +5743,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/* Determine whether we need to wake up potentially idle CPU: */
 	if (rq->curr == rq->idle && rq->cfs.nr_running)
-		resched_curr(rq);
+		resched_curr(rq, false);
 }
 
 #ifdef CONFIG_SMP
@@ -6448,7 +6448,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 
 		if (delta < 0) {
 			if (task_current(rq, p))
-				resched_curr(rq);
+				resched_curr(rq, false);
 			return;
 		}
 		hrtick_start(rq, delta);
@@ -8143,7 +8143,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	return;
 
 preempt:
-	resched_curr(rq);
+	resched_curr(rq, false);
 }
 
 #ifdef CONFIG_SMP
@@ -12294,7 +12294,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
 	 */
 	if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
 	    __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
-		resched_curr(rq);
+		resched_curr(rq, false);
 }
 
 /*
@@ -12459,7 +12459,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 	 */
 	if (task_current(rq, p)) {
 		if (p->prio > oldprio)
-			resched_curr(rq);
+			resched_curr(rq, false);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
@@ -12561,7 +12561,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 		 * if we can still preempt the current task.
 		 */
 		if (task_current(rq, p))
-			resched_curr(rq);
+			resched_curr(rq, false);
 		else
 			check_preempt_curr(rq, p, 0);
 	}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 9b4c2967b2b7..9bf30732b03f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -92,6 +92,11 @@ SCHED_FEAT(HZ_BW, true)
 
 #if defined(CONFIG_PREEMPT)
 SCHED_FEAT(FORCE_PREEMPT, true)
+SCHED_FEAT(PREEMPT_PRIORITY, true)
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+SCHED_FEAT(FORCE_PREEMPT, false)
+SCHED_FEAT(PREEMPT_PRIORITY, true)
 #else
 SCHED_FEAT(FORCE_PREEMPT, false)
+SCHED_FEAT(PREEMPT_PRIORITY, false)
 #endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index eacd204e2879..3ef039869be9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -403,7 +403,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  */
 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
 {
-	resched_curr(rq);
+	resched_curr(rq, true);
 }
 
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5fdb93f1b87e..8d87e42d30d8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -589,7 +589,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 			enqueue_rt_entity(rt_se, 0);
 
 		if (rt_rq->highest_prio.curr < curr->prio)
-			resched_curr(rq);
+			resched_curr(rq, false);
 	}
 }
 
@@ -682,7 +682,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 		return;
 
 	enqueue_top_rt_rq(rt_rq);
-	resched_curr(rq);
+	resched_curr(rq, false);
 }
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -1076,7 +1076,7 @@ static void update_curr_rt(struct rq *rq)
 			rt_rq->rt_time += delta_exec;
 			exceeded = sched_rt_runtime_exceeded(rt_rq);
 			if (exceeded)
-				resched_curr(rq);
+				resched_curr(rq, false);
 			raw_spin_unlock(&rt_rq->rt_runtime_lock);
 			if (exceeded)
 				do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
@@ -1691,7 +1691,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 * to try and push the current task away:
 	 */
 	requeue_task_rt(rq, p, 1);
-	resched_curr(rq);
+	resched_curr(rq, false);
 }
 
 static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
@@ -1718,7 +1718,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->prio < rq->curr->prio) {
-		resched_curr(rq);
+		resched_curr(rq, false);
 		return;
 	}
 
@@ -2074,7 +2074,7 @@ static int push_rt_task(struct rq *rq, bool pull)
 	 * just reschedule current.
 	 */
 	if (unlikely(next_task->prio < rq->curr->prio)) {
-		resched_curr(rq);
+		resched_curr(rq, false);
 		return 0;
 	}
 
@@ -2162,7 +2162,7 @@ static int push_rt_task(struct rq *rq, bool pull)
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
-	resched_curr(lowest_rq);
+	resched_curr(lowest_rq, false);
 	ret = 1;
 
 	double_unlock_balance(rq, lowest_rq);
@@ -2456,7 +2456,7 @@ static void pull_rt_task(struct rq *this_rq)
 	}
 
 	if (resched)
-		resched_curr(this_rq);
+		resched_curr(this_rq, false);
 }
 
 /*
@@ -2555,7 +2555,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 			rt_queue_push_tasks(rq);
 #endif /* CONFIG_SMP */
 		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
-			resched_curr(rq);
+			resched_curr(rq, false);
 	}
 }
 
@@ -2583,11 +2583,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 		 * then reschedule.
 		 */
 		if (p->prio > rq->rt.highest_prio.curr)
-			resched_curr(rq);
+			resched_curr(rq, false);
 #else
 		/* For UP simply resched on drop of prio */
 		if (oldprio < p->prio)
-			resched_curr(rq);
+			resched_curr(rq, false);
 #endif /* CONFIG_SMP */
 	} else {
 		/*
@@ -2596,7 +2596,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 		 * then reschedule.
 		 */
 		if (p->prio < rq->curr->prio)
-			resched_curr(rq);
+			resched_curr(rq, false);
 	}
 }
 
@@ -2668,7 +2668,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 			if (test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED_LAZY))
 				__resched_curr(rq, RESCHED_eager);
 			else
-				resched_curr(rq);
+				resched_curr(rq, false);
 
 			return;
 		}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e29a8897f573..9a745dd7482f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2435,7 +2435,7 @@ extern void init_sched_fair_class(void);
 extern void reweight_task(struct task_struct *p, int prio);
 
 extern void __resched_curr(struct rq *rq, resched_t rs);
-extern void resched_curr(struct rq *rq);
+extern void resched_curr(struct rq *rq, bool above);
 extern void resched_cpu(int cpu);
 
 extern struct rt_bandwidth def_rt_bandwidth;
-- 
2.31.1
