Message-ID: <20240624110624.GJ31592@noisy.programming.kicks-ass.net>
Date: Mon, 24 Jun 2024 13:06:24 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Tejun Heo <tj@...nel.org>
Cc: torvalds@...ux-foundation.org, mingo@...hat.com, juri.lelli@...hat.com,
	vincent.guittot@...aro.org, dietmar.eggemann@....com,
	rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
	bristot@...hat.com, vschneid@...hat.com, ast@...nel.org,
	daniel@...earbox.net, andrii@...nel.org, martin.lau@...nel.org,
	joshdon@...gle.com, brho@...gle.com, pjt@...gle.com,
	derkling@...gle.com, haoluo@...gle.com, dvernet@...a.com,
	dschatzberg@...a.com, dskarlat@...cmu.edu, riel@...riel.com,
	changwoo@...lia.com, himadrics@...ia.fr, memxor@...il.com,
	andrea.righi@...onical.com, joel@...lfernandes.org,
	linux-kernel@...r.kernel.org, bpf@...r.kernel.org,
	kernel-team@...a.com
Subject: Re: [PATCH 05/39] sched: Add sched_class->switching_to() and expose
 check_class_changing/changed()

On Wed, May 01, 2024 at 05:09:40AM -1000, Tejun Heo wrote:
> When a task switches to a new sched_class, the prev and new classes are
> notified through ->switched_from() and ->switched_to(), respectively, after
> the switching is done.
> 
> A new BPF extensible sched_class will have callbacks that allow the BPF
> scheduler to keep track of relevant task states (like priority and cpumask).
> Those callbacks aren't called while a task is on a different sched_class.
> When a task comes back, we want to tell the BPF progs the up-to-date state
> before the task gets enqueued, so we need a hook which is called before the
> switching is committed.
> 
> This patch adds ->switching_to() which is called during sched_class switch
> through check_class_changing() before the task is restored. Also, this patch
> exposes check_class_changing/changed() in kernel/sched/sched.h. They will be
> used by the new BPF extensible sched_class to implement implicit sched_class
> switching which is used e.g. when falling back to CFS when the BPF scheduler
> fails or unloads.
> 
> This is a prep patch and doesn't cause any behavior changes. The new
> operation and exposed functions aren't used yet.
> 
> v2: Improve patch description w/ details on planned use.
> 
> Signed-off-by: Tejun Heo <tj@...nel.org>
> Reviewed-by: David Vernet <dvernet@...a.com>
> Acked-by: Josh Don <joshdon@...gle.com>
> Acked-by: Hao Luo <haoluo@...gle.com>
> Acked-by: Barret Rhoden <brho@...gle.com>
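
Based on the changelog, the new hook presumably sits next to the existing
check_class_changed() along these lines (a sketch reconstructed from the
description above, not the trimmed patch body verbatim):

	/* Sketch from the changelog, not the patch verbatim: invoked before
	 * the class switch is committed, so the incoming class can act on
	 * the task before it is restored. */
	void check_class_changing(struct rq *rq, struct task_struct *p,
				  const struct sched_class *prev_class)
	{
		if (prev_class != p->sched_class && p->sched_class->switching_to)
			p->sched_class->switching_to(rq, p);
	}

	/* Pre-existing counterpart, invoked after the switch is done. */
	void check_class_changed(struct rq *rq, struct task_struct *p,
				 const struct sched_class *prev_class,
				 int oldprio)
	{
		if (prev_class != p->sched_class) {
			if (prev_class->switched_from)
				prev_class->switched_from(rq, p);

			p->sched_class->switched_to(rq, p);
		} else if (oldprio != p->prio || dl_task(p)) {
			p->sched_class->prio_changed(rq, p, oldprio);
		}
	}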

> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 8e23f19e8096..99e292368d11 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2301,6 +2301,7 @@ struct sched_class {
>  	 * cannot assume the switched_from/switched_to pair is serialized by
>  	 * rq->lock. They are however serialized by p->pi_lock.
>  	 */
> +	void (*switching_to) (struct rq *this_rq, struct task_struct *task);
>  	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
>  	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
>  	void (*reweight_task)(struct rq *this_rq, struct task_struct *task,

So I *think* that I can handle all the current cases in
sched_class::{en,de}queue_task() if we add {EN,DE}QUEUE_CLASS flags.

Would that work for the BPF thing as well?

Something like the very much incomplete patch below... It would allow
removing all these switch{ed,ing}_{to,from}() things entirely, instead of
adding yet more.
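
Concretely, each class (the BPF one included) could then observe the
transition in its own {en,de}queue methods. A hypothetical sketch, all
names made up:

	/* Hypothetical consumer of the new flags, not part of the patch below. */
	static void enqueue_task_foo(struct rq *rq, struct task_struct *p, int flags)
	{
		if (flags & ENQUEUE_CLASS) {
			/* p is just entering this class; do whatever
			 * ->switching_to()/->switched_to() used to do. */
		}
		/* ... regular enqueue work ... */
	}

	static void dequeue_task_foo(struct rq *rq, struct task_struct *p, int flags)
	{
		/* ... regular dequeue work ... */
		if (flags & DEQUEUE_CLASS) {
			/* p is leaving this class; do whatever
			 * ->switched_from() used to do. */
		}
	}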

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0935f9d4bb7b..da54c9f8f78d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6864,15 +6864,22 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-void __setscheduler_prio(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(int prio)
 {
+	const struct sched_class *class;
+
 	if (dl_prio(prio))
-		p->sched_class = &dl_sched_class;
+		class = &dl_sched_class;
 	else if (rt_prio(prio))
-		p->sched_class = &rt_sched_class;
+		class = &rt_sched_class;
 	else
-		p->sched_class = &fair_sched_class;
+		class = &fair_sched_class;
 
+	return class;
+}
+
+void __setscheduler_prio(struct task_struct *p, int prio)
+{
 	p->prio = prio;
 }
 
@@ -6919,7 +6926,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
 	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *class;
 	struct rq_flags rf;
 	struct rq *rq;
 
@@ -6977,6 +6984,10 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
+	class = __setscheduler_class(prio);
+	if (prev_class != class)
+		queue_flag |= DEQUEUE_CLASS;
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -7014,6 +7025,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 			p->rt.timeout = 0;
 	}
 
+	p->sched_class = class;
 	__setscheduler_prio(p, prio);
 
 	if (queued)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62fd8bc6fd08..a03995d81c75 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2251,6 +2251,7 @@ extern const u32		sched_prio_to_wmult[40];
 #define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
 #define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
 #define DEQUEUE_MIGRATING	0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_CLASS		0x200 /* Matches ENQUEUE_CLASS */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
@@ -2266,6 +2267,7 @@ extern const u32		sched_prio_to_wmult[40];
 #endif
 #define ENQUEUE_INITIAL		0x80
 #define ENQUEUE_MIGRATING	0x100
+#define ENQUEUE_CLASS		0x200
 
 #define RETRY_TASK		((void *)-1UL)
 
@@ -3603,6 +3605,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+extern const struct sched_class *__setscheduler_class(int prio);
 extern void __setscheduler_prio(struct task_struct *p, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ae1b42775ef9..dc104d996204 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -612,7 +612,7 @@ int __sched_setscheduler(struct task_struct *p,
 {
 	int oldpolicy = -1, policy = attr->sched_policy;
 	int retval, oldprio, newprio, queued, running;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *class;
 	struct balance_callback *head;
 	struct rq_flags rf;
 	int reset_on_fork;
@@ -783,6 +783,12 @@ int __sched_setscheduler(struct task_struct *p,
 			queue_flags &= ~DEQUEUE_MOVE;
 	}
 
+	class = prev_class = p->sched_class;
+	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS))
+		class = __setscheduler_class(newprio);
+	if (prev_class != class)
+		queue_flags |= DEQUEUE_CLASS;
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -790,10 +796,9 @@ int __sched_setscheduler(struct task_struct *p,
 	if (running)
 		put_prev_task(rq, p);
 
-	prev_class = p->sched_class;
-
 	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
 		__setscheduler_params(p, attr);
+		p->sched_class = class;
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
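
The "Matches ENQUEUE_*" comments are what make a single flag word serve
both halves: the same queue_flag(s) value is passed to dequeue_task() and,
once p->sched_class has been updated, to enqueue_task(), so the old class
sees DEQUEUE_CLASS on the way out and the new class sees ENQUEUE_CLASS on
the way in. Roughly, for rt_mutex_setprio() (a sketch of the surrounding,
otherwise unchanged code):

	if (queued)
		dequeue_task(rq, p, queue_flag);	/* old class, DEQUEUE_CLASS */
	if (running)
		put_prev_task(rq, p);

	p->sched_class = class;
	__setscheduler_prio(p, prio);

	if (queued)
		enqueue_task(rq, p, queue_flag);	/* new class, ENQUEUE_CLASS */
	if (running)
		set_next_task(rq, p);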
