Date: Wed, 20 Dec 2023 00:42:24 +0100
From: Frederic Weisbecker <frederic@...nel.org>
To: Hillf Danton <hdanton@...a.com>
Cc: LKML <linux-kernel@...r.kernel.org>, Boqun Feng <boqun.feng@...il.com>,
	Joel Fernandes <joel@...lfernandes.org>,
	Neeraj Upadhyay <neeraj.upadhyay@....com>,
	Uladzislau Rezki <urezki@...il.com>,
	"Paul E . McKenney" <paulmck@...nel.org>,
	Thomas Gleixner <tglx@...utronix.de>,
	Peter Zijlstra <peterz@...radead.org>
Subject: Re: [PATCH 2/3] rcu: Defer RCU kthreads wakeup when CPU is dying

On Tue, Dec 19, 2023 at 12:42:47PM +0800, Hillf Danton wrote:
> On Tue, 19 Dec 2023 00:19:15 +0100 Frederic Weisbecker <frederic@...nel.org> wrote:
> > +static void swake_up_one_online(struct swait_queue_head *wqh)
> > +{
> > +	int cpu = get_cpu();
> > +
> > +	/*
> > +	 * If called from rcutree_report_cpu_dead(), a wake up
> > +	 * is dangerous that late in the CPU-down hotplug process. The
> > +	 * scheduler might queue an ignored hrtimer. Defer the wake up
> > +	 * to an online CPU instead.
> > +	 */
> 
> But why would the scheduler have any interest in selecting a dying CPU
> to add an hrtimer on in the first place?

So indeed that timer could be unpinned. But we tried that and it's not
enough. If we want to make the hrtimer and nohz infrastructure aware that
the current CPU is offline when it queues an hrtimer, we must face the
ugliness below. And it's still hacky, because we must also find an online
target whose earliest deadline is below or equal to that of the scheduler
hrtimer we are trying to enqueue. That requires even more ugliness, which
isn't handled below.
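
To make that missing piece concrete: the extra ugliness in
switch_hrtimer_base() would look roughly like the sketch below. This is
illustrative only and not part of the diff that follows; it reuses
retrigger_next_event() the same way the hrtimers_cpu_dying() hunk does,
and glosses over the locking and context problems of sending an IPI from
there:

	/*
	 * Sketch: if this CPU is offline and the migrated timer becomes
	 * the earliest event on the remote base, the remote CPU must be
	 * kicked to reprogram its clock event device, otherwise the
	 * timer expires late. Doing that while holding both base locks
	 * is part of the extra ugliness.
	 */
	if (!this_cpu_base->online && hrtimer_check_target(timer, new_base))
		smp_call_function_single(new_cpu_base->cpu,
					 retrigger_next_event, NULL, 0);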

So for now I assume that queuing a timer after hrtimers_cpu_dying() has
run is unreasonable and that RCU is the only candidate trying it. If more
cases get reported, we shall see...

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index f2044d5a652b..9eac39fad31c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -234,6 +234,7 @@ struct hrtimer_cpu_base {
 	struct hrtimer			*next_timer;
 	ktime_t				softirq_expires_next;
 	struct hrtimer			*softirq_next_timer;
+	int online;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 } ____cacheline_aligned;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..83c75768f290 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1088,7 +1088,7 @@ int get_nohz_timer_target(void)
 	struct sched_domain *sd;
 	const struct cpumask *hk_mask;
 
-	if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
+	if (housekeeping_cpu(cpu, HK_TYPE_TIMER) && cpu_online(cpu)) {
 		if (!idle_cpu(cpu))
 			return cpu;
 		default_cpu = cpu;
@@ -1109,7 +1109,8 @@ int get_nohz_timer_target(void)
 	}
 
 	if (default_cpu == -1)
-		default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
+		default_cpu = cpumask_any_and(housekeeping_cpumask(HK_TYPE_TIMER),
+					      cpu_online_mask);
 
 	return default_cpu;
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6aaf0a3d6081..26cb9455272a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -118,7 +118,7 @@ static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		 */
 		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
 		hrtimer_start_expires(&rt_b->rt_period_timer,
-				      HRTIMER_MODE_ABS_PINNED_HARD);
+				      HRTIMER_MODE_ABS_HARD);
 	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 760793998cdd..82f9ace2e4fd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -36,6 +36,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/deadline.h>
+#include <linux/sched/isolation.h>
 #include <linux/sched/nohz.h>
 #include <linux/sched/debug.h>
 #include <linux/timer.h>
@@ -206,6 +207,12 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 	if (static_branch_likely(&timers_migration_enabled) && !pinned)
 		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+#else
+	if (!base->online) {
+		int cpu = cpumask_any_and(housekeeping_cpumask(HK_TYPE_TIMER),
+					  cpu_online_mask);
+		base = &per_cpu(hrtimer_bases, cpu);
+	}
 #endif
 	return base;
 }
@@ -254,7 +261,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		raw_spin_lock(&new_base->cpu_base->lock);
 
 		if (new_cpu_base != this_cpu_base &&
-		    hrtimer_check_target(timer, new_base)) {
+		    hrtimer_check_target(timer, new_base) &&
+		    /*
+		     * Crude hack and buggy: if this CPU is offline and
+		     * the timer is the earliest on the remote target, the timer
+		     * will expire late...
+		     */
+		    this_cpu_base->online) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
 			new_cpu_base = this_cpu_base;
@@ -2183,6 +2196,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	cpu_base->softirq_next_timer = NULL;
 	cpu_base->expires_next = KTIME_MAX;
 	cpu_base->softirq_expires_next = KTIME_MAX;
+	cpu_base->online = 1;
 	hrtimer_cpu_base_init_expiry_lock(cpu_base);
 	return 0;
 }
@@ -2248,7 +2262,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 	__hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
 	/* Tell the other CPU to retrigger the next event */
 	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
-
+	old_base->online = 0;
 	raw_spin_unlock(&new_base->lock);
 	raw_spin_unlock(&old_base->lock);
 

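For reference, the swake_up_one_online() quoted at the top defers the
wake up by bouncing it to an online CPU through an IPI. A minimal sketch
of that shape (reconstructed for illustration; the IPI helper name and
the housekeeping type are approximate, see the posted patch for the
exact code):

static void swake_up_one_online_ipi(void *arg)
{
	struct swait_queue_head *wqh = arg;

	swake_up_one(wqh);
}

static void swake_up_one_online(struct swait_queue_head *wqh)
{
	int cpu = get_cpu();

	/* See the comment quoted above: no wake up from a dying CPU. */
	if (unlikely(cpu_is_offline(cpu))) {
		/* Pick any online housekeeping CPU and wake up from there. */
		int target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
					     cpu_online_mask);

		smp_call_function_single(target, swake_up_one_online_ipi,
					 wqh, 0);
		put_cpu();
	} else {
		put_cpu();
		swake_up_one(wqh);
	}
}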