[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8734hjausb.ffs@tglx>
Date: Thu, 16 Jan 2025 11:59:48 +0100
From: Thomas Gleixner <tglx@...utronix.de>
To: Frederic Weisbecker <frederic@...nel.org>
Cc: LKML <linux-kernel@...r.kernel.org>, Frederic Weisbecker
<frederic@...nel.org>, vlad.wing@...il.com, rcu@...r.kernel.org,
boqun.feng@...il.com, joel@...lfernandes.org, neeraj.upadhyay@....com,
urezki@...il.com, qiang.zhang1211@...il.com, Cheng-Jui.Wang@...iatek.com,
leitao@...ian.org, kernel-team@...a.com, Usama Arif
<usamaarif642@...il.com>, paulmck@...nel.org, Anna-Maria Behnsen
<anna-maria@...utronix.de>
Subject: Re: [PATCH 1/3 v3] hrtimers: Force migrate away hrtimers queued
after CPUHP_AP_HRTIMERS_DYING
On Tue, Dec 31 2024 at 18:07, Frederic Weisbecker wrote:
> hrtimers are migrated away from the dying CPU to any online target at
> the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
> handling tasks involved in the CPU hotplug forward progress.
>
> However wake ups can still be performed by the outgoing CPU after
> CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers
> being armed. Depending on several considerations (crystal ball
> power management based election, earliest timer already enqueued, timer
> migration enabled or not), the target may eventually be the current
> CPU even if offline. If that happens, the timer is eventually ignored.
>
> The most notable example is RCU which had to deal with each and every one of
> those wake-ups by deferring them to an online CPU, along with related
> workarounds:
>
> _ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
> _ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
> _ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
>
> The problem isn't confined to RCU though as the stop machine kthread
> (which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
> and performs a wake up that eventually arms the deadline server timer:
Where does it report the completion?
> WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
> CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
> Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
> RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
> Call Trace:
> ? __warn+0xcf/0x1b0
> ? hrtimer_start_range_ns+0x289/0x2d0
> ? hrtimer_start_range_ns+0x186/0x2d0
> start_dl_timer+0xfc/0x150
> enqueue_dl_entity+0x367/0x640
> dl_server_start+0x53/0xa0
> enqueue_task_fair+0x363/0x460
> enqueue_task+0x3c/0x200
> ttwu_do_activate+0x94/0x240
> try_to_wake_up+0x315/0x600
> complete+0x4b/0x80
You trimmed the backtrace at the wrong end. You left all the useless
register gunk around, but removed the call chain leading up to the
completion. :)
I assume it's the complete in stomp_machine() ....
> Instead of providing yet another bandaid to work around the situation,
> fix it from hrtimers infrastructure instead: always migrate away a
> timer to an online target whenever it is enqueued from an offline CPU.
> +/*
> + * If the current CPU is offline and timers have been already
> + * migrated away, make sure not to enqueue locally and perform
> + * a remote retrigger as a last resort.
> + */
> +static void enqueue_hrtimer_offline(struct hrtimer *timer,
> + struct hrtimer_clock_base *base,
> + const enum hrtimer_mode mode)
> +{
> +#ifdef CONFIG_HOTPLUG_CPU
> + struct hrtimer_cpu_base *new_cpu_base, *old_cpu_base, *this_cpu_base;
> + struct hrtimer_clock_base *new_base;
> + int cpu;
> +
> + old_cpu_base = base->cpu_base;
> + this_cpu_base = this_cpu_ptr(&hrtimer_bases);
> +
> + if (old_cpu_base == this_cpu_base || !old_cpu_base->online) {
> + WARN_ON_ONCE(hrtimer_callback_running(timer));
> + cpu = cpumask_any_and(cpu_online_mask,
> + housekeeping_cpumask(HK_TYPE_TIMER));
> + new_cpu_base = &per_cpu(hrtimer_bases, cpu);
> + new_base = &new_cpu_base->clock_base[base->index];
> + WRITE_ONCE(timer->base, &migration_base);
> + raw_spin_unlock(&old_cpu_base->lock);
> + raw_spin_lock(&new_cpu_base->lock);
> + WRITE_ONCE(timer->base, new_base);
> + } else {
> + new_base = base;
> + new_cpu_base = new_base->cpu_base;
> + cpu = new_cpu_base->cpu;
> + }
> +
> + if (enqueue_hrtimer(timer, new_base, mode))
> + smp_call_function_single_async(cpu, &new_cpu_base->csd);
Duh. This reimplementation of switch_hrtimer_base() is really awful. We
can be smarter than that. Untested patch below.
Thanks,
tglx
---
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
ktime_t softirq_expires_next;
struct hrtimer *softirq_next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ call_single_data_t csd;
} ____cacheline_aligned;
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base,
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -208,6 +211,13 @@ struct hrtimer_cpu_base *get_target_base
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ if (unlikely(!base->online)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+#endif
return base;
}
@@ -254,7 +264,7 @@ switch_hrtimer_base(struct hrtimer *time
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
+ if (new_cpu_base != this_cpu_base && this_cpu_base->online &&
hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
@@ -716,8 +726,6 @@ static inline int hrtimer_is_hres_enable
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1206,6 +1214,7 @@ static int __hrtimer_start_range_ns(stru
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1217,10 +1226,16 @@ static int __hrtimer_start_range_ns(stru
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+ * Don't force local queuing if this enqueue happens on an unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1249,8 +1264,17 @@ static int __hrtimer_start_range_ns(stru
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ if (likely(this_cpu_base->online))
+ return first;
+
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
Powered by blists - more mailing lists