Message-ID: <20250616163728.GB1613633@noisy.programming.kicks-ass.net>
Date: Mon, 16 Jun 2025 18:37:28 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Vincent Guittot <vincent.guittot@...aro.org>
Cc: mingo@...hat.com, juri.lelli@...hat.com, dietmar.eggemann@....com,
	rostedt@...dmis.org, bsegall@...gle.com, mgorman@...e.de,
	vschneid@...hat.com, clm@...a.com, linux-kernel@...r.kernel.org
Subject: Re: [RFC][PATCH 5/5] sched: Add ttwu_queue support for delayed tasks

On Mon, Jun 16, 2025 at 02:01:25PM +0200, Peter Zijlstra wrote:
> On Fri, Jun 06, 2025 at 05:03:36PM +0200, Vincent Guittot wrote:
> > On Tue, 20 May 2025 at 12:18, Peter Zijlstra <peterz@...radead.org> wrote:
> > >
> > > One of the things lost with introduction of DELAY_DEQUEUE is the
> > > ability of TTWU to move those tasks around on wakeup, since they're
> > > on_rq, and as such, need to be woken in-place.
> > 
> > I was thinking that you would call select_task_rq() somewhere in the
> > wake up path of delayed entity to get a chance to migrate it which was
> > one reason for the perf regression (and which would have also been
> > useful for EAS case) but IIUC, 
> 
> FWIW, the trivial form of all this is something like the below. The
> problem is that performance sucks :/ For me it is worse than not doing
> it. 

And because I was poking at the thing, I had to try the complicated
version again... This seems to survive long enough for a few benchmark
runs, and it's not bad.

It very much burns after a while, though :-( So I'll have to poke more at
this. Clearly I'm missing something (again!).

---
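A quick annotated walk-through of the new ttwu_delayed() before the raw
hunks, for readability; the diff below is authoritative, and the extra
comments here are explanatory glosses only.

static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags)
{
	int cpu = task_cpu(p);

	/*
	 * On-rq entities can get migrated -- even sched_delayed ones. If the
	 * task has moved since it was queued on this CPU's wakelist, chase
	 * after it by re-queueing on its current CPU.
	 */
	if (unlikely(cpu_of(rq) != cpu)) {
		__ttwu_queue_wakelist(p, cpu, wake_flags | WF_DELAYED);
		return 1;
	}

	/*
	 * Complete the delayed dequeue on the task's own rq, turning it back
	 * into an ordinary blocked task...
	 */
	if (task_on_rq_queued(p))
		dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP | DEQUEUE_DELAYED);

	/*
	 * ...so select_task_rq() can pick a CPU again, restoring the wakeup
	 * migration that DELAY_DEQUEUE lost.
	 */
	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
	if (!ttwu_do_migrate(p, cpu))
		return 0;	/* stays put; sched_ttwu_pending() activates it here */

	/* Migrated: shoot it to the other CPU's wakelist. */
	wake_flags |= WF_MIGRATED;
	__ttwu_queue_wakelist(p, cpu, wake_flags);
	return 1;
}
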
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -994,6 +994,7 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+	unsigned			sched_remote_delayed:1;
 #ifdef CONFIG_RT_MUTEXES
 	unsigned			sched_rt_mutex:1;
 #endif
Index: linux-2.6/kernel/sched/core.c
===================================================================
--- linux-2.6.orig/kernel/sched/core.c
+++ linux-2.6/kernel/sched/core.c
@@ -3844,6 +3849,50 @@ static int ttwu_runnable(struct task_str
 }
 
 #ifdef CONFIG_SMP
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags);
+
+static inline bool ttwu_do_migrate(struct task_struct *p, int cpu)
+{
+	if (task_cpu(p) == cpu)
+		return false;
+
+	if (p->in_iowait) {
+		delayacct_blkio_end(p);
+		atomic_dec(&task_rq(p)->nr_iowait);
+	}
+
+	psi_ttwu_dequeue(p);
+	set_task_cpu(p, cpu);
+	return true;
+}
+
+static int ttwu_delayed(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+	int cpu = task_cpu(p);
+
+	/*
+	 * Notably it is possible for on-rq entities to get migrated -- even
+	 * sched_delayed ones.
+	 */
+	if (unlikely(cpu_of(rq) != cpu)) {
+		/* chase after it */
+		__ttwu_queue_wakelist(p, cpu, wake_flags | WF_DELAYED);
+		return 1;
+	}
+
+	if (task_on_rq_queued(p))
+		dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
+	cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+	if (!ttwu_do_migrate(p, cpu))
+		return 0;
+
+	wake_flags |= WF_MIGRATED;
+	/* shoot it to the other CPU */
+	__ttwu_queue_wakelist(p, cpu, wake_flags);
+	return 1;
+}
+
 void sched_ttwu_pending(void *arg)
 {
 	struct llist_node *llist = arg;
@@ -3857,39 +3906,12 @@ void sched_ttwu_pending(void *arg)
 	update_rq_clock(rq);
 
 	llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
-		struct rq *p_rq = task_rq(p);
-		int ret;
-
-		/*
-		 * This is the ttwu_runnable() case. Notably it is possible for
-		 * on-rq entities to get migrated -- even sched_delayed ones.
-		 */
-		if (unlikely(p_rq != rq)) {
-			rq_unlock(rq, &guard.rf);
-			p_rq = __task_rq_lock(p, &guard.rf);
-		}
-
-		ret = __ttwu_runnable(p_rq, p, WF_TTWU);
-
-		if (unlikely(p_rq != rq)) {
-			if (!ret)
-				set_task_cpu(p, cpu_of(rq));
-
-			__task_rq_unlock(p_rq, &guard.rf);
-			rq_lock(rq, &guard.rf);
-			update_rq_clock(rq);
-		}
-
-		if (ret)
-			continue;
-
-		/*
-		 * This is the 'normal' case where the task is blocked.
-		 */
-
 		if (WARN_ON_ONCE(p->on_cpu))
 			smp_cond_load_acquire(&p->on_cpu, !VAL);
 
+		if (p->sched_remote_delayed && ttwu_delayed(rq, p, WF_TTWU))
+			continue;
+
 		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &guard.rf);
 	}
 
@@ -3933,6 +3955,7 @@ static void __ttwu_queue_wakelist(struct
 	struct rq *rq = cpu_rq(cpu);
 
 	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+	p->sched_remote_delayed = !!(wake_flags & WF_DELAYED);
 
 	WRITE_ONCE(rq->ttwu_pending, 1);
 	__smp_call_single_queue(cpu, &p->wake_entry.llist);
@@ -4371,17 +4394,8 @@ int try_to_wake_up(struct task_struct *p
 		 * their previous state and preserve Program Order.
 		 */
 		smp_cond_load_acquire(&p->on_cpu, !VAL);
-
-		if (task_cpu(p) != cpu) {
-			if (p->in_iowait) {
-				delayacct_blkio_end(p);
-				atomic_dec(&task_rq(p)->nr_iowait);
-			}
-
+		if (ttwu_do_migrate(p, cpu))
 			wake_flags |= WF_MIGRATED;
-			psi_ttwu_dequeue(p);
-			set_task_cpu(p, cpu);
-		}
 #else
 		cpu = task_cpu(p);
 #endif /* CONFIG_SMP */
