linux-kernel - [RFC][PATCH 1/2] sched: Avoid placing RT threads on cores handling long softirqs

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220811015758.3001641-2-jstultz@google.com>
Date:   Thu, 11 Aug 2022 01:57:57 +0000
From:   John Stultz <jstultz@...gle.com>
To:     LKML <linux-kernel@...r.kernel.org>
Cc:     "Connor O'Brien" <connoro@...gle.com>,
        John Dias <joaodias@...gle.com>, Rick Yiu <rickyiu@...gle.com>,
        John Kacur <jkacur@...hat.com>,
        Qais Yousef <qais.yousef@....com>,
        Chris Redpath <chris.redpath@....com>,
        Abhijeet Dharmapurikar <adharmap@...cinc.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Ingo Molnar <mingo@...hat.com>,
        Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Thomas Gleixner <tglx@...utronix.de>, kernel-team@...roid.com,
        "J . Avila" <elavila@...gle.com>, John Stultz <jstultz@...gle.com>
Subject: [RFC][PATCH 1/2] sched: Avoid placing RT threads on cores handling
 long softirqs

From: Connor O'Brien <connoro@...gle.com>

In certain audio use cases, scheduling RT threads on cores that
are handling softirqs can lead to glitches. Prevent this
behavior in cases where the softirq is likely to take a long
time. To avoid unnecessary migrations, the old behavior is
preserved for RCU, SCHED and TIMER irqs which are expected to be
relatively quick.

This patch reworks and combines two related changes originally
by John Dias <joaodias@...gle.com>

Cc: John Dias <joaodias@...gle.com>
Cc: Connor O'Brien <connoro@...gle.com>
Cc: Rick Yiu <rickyiu@...gle.com>
Cc: John Kacur <jkacur@...hat.com>
Cc: Qais Yousef <qais.yousef@....com>
Cc: Chris Redpath <chris.redpath@....com>,
Cc: Abhijeet Dharmapurikar <adharmap@...cinc.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: Juri Lelli <juri.lelli@...hat.com>
Cc: Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: kernel-team@...roid.com
Signed-off-by: John Dias <joaodias@...gle.com>
[elavila: Port to mainline, amend commit text]
Signed-off-by: J. Avila <elavila@...gle.com>
[connoro: Reworked, simplified, and merged two patches together]
Signed-off-by: Connor O'Brien <connoro@...gle.com>
[jstultz: Further simplified and fixed issues, reworded commit
 message]
Signed-off-by: John Stultz <jstultz@...gle.com>
---
 include/linux/interrupt.h |  7 +++++
 init/Kconfig              | 11 +++++++
 kernel/sched/rt.c         | 65 +++++++++++++++++++++++++++++++++------
 kernel/softirq.c          |  9 ++++++
 4 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a92bce40b04b..bac9da05b9c8 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -571,6 +571,12 @@ enum
  * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
  */
 #define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(RCU_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ))
+/* Softirq's where the handling might be long: */
+#define LONG_SOFTIRQ_MASK ((1 << NET_TX_SOFTIRQ)       | \
+			   (1 << NET_RX_SOFTIRQ)       | \
+			   (1 << BLOCK_SOFTIRQ)        | \
+			   (1 << IRQ_POLL_SOFTIRQ) | \
+			   (1 << TASKLET_SOFTIRQ))
 
 /* map softirq index to softirq name. update 'softirq_to_name' in
  * kernel/softirq.c when adding a new softirq.
@@ -606,6 +612,7 @@ extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
 
 DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
+DECLARE_PER_CPU(u32, active_softirqs);
 
 static inline struct task_struct *this_cpu_ksoftirqd(void)
 {
diff --git a/init/Kconfig b/init/Kconfig
index c7900e8975f1..b0dde2771314 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1278,6 +1278,17 @@ config SCHED_AUTOGROUP
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
+config RT_SOFTIRQ_OPTIMIZATION
+       bool "Improve RT scheduling during long softirq execution"
+       depends on ARM64
+       depends on SMP
+       default n
+       help
+         Enable an optimization which tries to avoid placing RT tasks on CPUs
+	 occupied by nonpreemptible tasks, such as a long softirq or CPUs
+	 which may soon block preemptions, such as a CPU running a ksoftirq
+	 thread which handles slow softirqs.
+
 config SYSFS_DEPRECATED
 	bool "Enable deprecated sysfs features to support old userspace tools"
 	depends on SYSFS
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8c9ed9664840..22f491c8af3a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1596,12 +1596,50 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
+#ifdef CONFIG_RT_SOFTIRQ_OPTIMIZATION
+/*
+ * Return whether the task on the given cpu is currently non-preemptible
+ * while handling a potentially long softirq, or if the task is likely
+ * to block preemptions soon because it is a ksoftirq thread that is
+ * handling slow softirq.
+ */
+static bool task_may_preempt(struct task_struct *task, int cpu)
+{
+	u32 softirqs = per_cpu(active_softirqs, cpu) |
+		       per_cpu(irq_stat, cpu).__softirq_pending;
+
+	struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
+	struct task_struct *curr;
+	struct rq *rq = cpu_rq(cpu);
+	int ret;
+
+	rcu_read_lock();
+	curr = READ_ONCE(rq->curr); /* unlocked access */
+	ret = !((softirqs & LONG_SOFTIRQ_MASK) &&
+		 (curr == cpu_ksoftirqd ||
+		  task_thread_info(curr)->preempt_count & SOFTIRQ_MASK));
+	rcu_read_unlock();
+	return ret;
+}
+#else
+static bool task_may_preempt(struct task_struct *task, int cpu)
+{
+	return true;
+}
+#endif /* CONFIG_RT_SOFTIRQ_OPTIMIZATION */
+
+bool rt_task_fits_capacity_and_may_preempt(struct task_struct *p, int cpu)
+{
+	return task_may_preempt(p, cpu) && rt_task_fits_capacity(p, cpu);
+}
+
 static int
 select_task_rq_rt(struct task_struct *p, int cpu, int flags)
 {
 	struct task_struct *curr;
 	struct rq *rq;
 	bool test;
+	bool may_not_preempt;
 
 	/* For anything but wake ups, just return the task_cpu */
 	if (!(flags & (WF_TTWU | WF_FORK)))
@@ -1613,7 +1651,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
 	curr = READ_ONCE(rq->curr); /* unlocked access */
 
 	/*
-	 * If the current task on @p's runqueue is an RT task, then
+	 * If the current task on @p's runqueue is a softirq task,
+	 * it may run without preemption for a time that is
+	 * ill-suited for a waiting RT task. Therefore, try to
+	 * wake this RT task on another runqueue.
+	 *
+	 * Also, if the current task on @p's runqueue is an RT task, then
 	 * try to see if we can wake this RT task up on another
 	 * runqueue. Otherwise simply start this RT task
 	 * on its current runqueue.
@@ -1638,9 +1681,10 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
 	 * requirement of the task - which is only important on heterogeneous
 	 * systems like big.LITTLE.
 	 */
-	test = curr &&
-	       unlikely(rt_task(curr)) &&
-	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+	may_not_preempt = !task_may_preempt(curr, cpu);
+	test = (curr && (may_not_preempt ||
+			 (unlikely(rt_task(curr)) &&
+			  (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio))));
 
 	if (test || !rt_task_fits_capacity(p, cpu)) {
 		int target = find_lowest_rq(p);
@@ -1653,11 +1697,14 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
 			goto out_unlock;
 
 		/*
-		 * Don't bother moving it if the destination CPU is
+		 * If cpu is non-preemptible, prefer remote cpu
+		 * even if it's running a higher-prio task.
+		 * Otherwise: Don't bother moving it if the destination CPU is
 		 * not running a lower priority task.
 		 */
 		if (target != -1 &&
-		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
+		    (may_not_preempt ||
+		     p->prio < cpu_rq(target)->rt.highest_prio.curr))
 			cpu = target;
 	}
 
@@ -1898,11 +1945,11 @@ static int find_lowest_rq(struct task_struct *task)
 
 		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
 					  task, lowest_mask,
-					  rt_task_fits_capacity);
+					  rt_task_fits_capacity_and_may_preempt);
 	} else {
 
-		ret = cpupri_find(&task_rq(task)->rd->cpupri,
-				  task, lowest_mask);
+		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
+					  task, lowest_mask, task_may_preempt);
 	}
 
 	if (!ret)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9f0aef8aa9ff..92fa82400a80 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -60,6 +60,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 
 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
+/*
+ * active_softirqs -- per cpu, a mask of softirqs that are being handled,
+ * with the expectation that approximate answers are acceptable and therefore
+ * no synchronization.
+ */
+DEFINE_PER_CPU(u32, active_softirqs);
+
 const char * const softirq_to_name[NR_SOFTIRQS] = {
 	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
 	"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -551,6 +558,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
 restart:
 	/* Reset the pending bitmask before enabling irqs */
 	set_softirq_pending(0);
+	__this_cpu_write(active_softirqs, pending);
 
 	local_irq_enable();
 
@@ -580,6 +588,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
 		pending >>= softirq_bit;
 	}
 
+	__this_cpu_write(active_softirqs, 0);
 	if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
 	    __this_cpu_read(ksoftirqd) == current)
 		rcu_softirq_qs();
-- 
2.37.1.559.g78731f0fdb-goog