Currently an hrtimer callback function cannot free its own timer because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK after it. Freeing the timer would result in a clear use-after-free. Solve this by using a scheme similar to regular timers; track the current running timer in hrtimer_clock_base::running. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) --- include/linux/hrtimer.h | 47 ++++++----------- kernel/time/hrtimer.c | 130 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 122 insertions(+), 55 deletions(-) --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -53,34 +53,27 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree - * 0x02 callback function running - * 0x04 timer is migrated to another cpu + * 0x02 timer is migrated to another cpu * - * Special cases: - * 0x03 callback function running and enqueued - * (was requeued on another CPU) - * 0x05 timer was migrated on CPU hotunplug + * The callback state is not part of the timer->state because clearing it would + * mean touching the timer after the callback, this makes it impossible to free + * the timer from the callback function. * - * The "callback function running and enqueued" status is only possible on - * SMP. It happens for example when a posix timer expired and the callback + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. We have to preserve the callback running state, - * as otherwise the timer could be removed before the softirq code finishes the - * the handling of the timer. - * - * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This - * also affects HRTIMER_STATE_MIGRATE where the preservation is not - * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is - * enqueued on the new cpu. + * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 -#define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_MIGRATE 0x04 +#define HRTIMER_STATE_MIGRATE 0x02 /** * struct hrtimer - the basic hrtimer structure @@ -167,6 +160,8 @@ enum hrtimer_base_type { * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers + * @seq: seqcount around __run_hrtimer + * @running: pointer to the currently running hrtimer * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events @@ -188,6 +183,8 @@ enum hrtimer_base_type { */ struct hrtimer_cpu_base { raw_spinlock_t lock; + seqcount_t seq; + struct hrtimer *running; unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; @@ -395,15 +392,7 @@ extern ktime_t hrtimer_get_remaining(con extern u64 hrtimer_get_next_event(void); -/* - * A timer is active, when it is enqueued into the rbtree or the - * callback function is running or it's in the state of being migrated - * to another cpu. - */ -static inline int hrtimer_active(const struct hrtimer *timer) -{ - return timer->state != HRTIMER_STATE_INACTIVE; -} +extern bool hrtimer_active(const struct hrtimer *timer); /* * Helper function to check, whether the timer is on one of the queues @@ -419,7 +408,7 @@ static inline int hrtimer_is_queued(stru */ static inline int hrtimer_callback_running(struct hrtimer *timer) { - return timer->state & HRTIMER_STATE_CALLBACK; + return timer->base->cpu_base->running == timer; } /* Forward a hrtimer so it expires after now: */ --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -67,6 +67,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { @@ -111,6 +112,19 @@ static inline int hrtimer_clockid_to_bas #ifdef CONFIG_SMP /* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .seq = SEQCNT_ZERO(migration_cpu_base), +}; + +static struct hrtimer_clock_base migration_base = { + .cpu_base = &migration_cpu_base, +}; + +/* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. @@ -119,8 +133,8 @@ static inline int hrtimer_clockid_to_bas * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, @@ -130,7 +144,7 @@ struct hrtimer_clock_base *lock_hrtimer_ for (;;) { base = timer->base; - if (likely(base != NULL)) { + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; @@ -194,8 +208,8 @@ switch_hrtimer_base(struct hrtimer *time if (unlikely(hrtimer_callback_running(timer))) return base; - /* See the comment in lock_timer_base() */ - timer->base = NULL; + /* See the comment in lock_hrtimer_base() */ + timer->base = &migration_base; raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -840,11 +854,7 @@ static int enqueue_hrtimer(struct hrtime base->cpu_base->active_bases |= 1 << base->index; - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; + timer->state = HRTIMER_STATE_ENQUEUED; return timerqueue_add(&base->active, &timer->node); } @@ -894,7 +904,6 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (hrtimer_is_queued(timer)) { - unsigned long state; int reprogram; /* @@ -908,13 +917,8 @@ remove_hrtimer(struct hrtimer *timer, st debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. - */ - state = timer->state & HRTIMER_STATE_CALLBACK; - __remove_hrtimer(timer, base, state, reprogram); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, reprogram); return 1; } return 0; @@ -1114,6 +1118,70 @@ void hrtimer_init(struct hrtimer *timer, } EXPORT_SYMBOL_GPL(hrtimer_init); +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. + */ +bool hrtimer_active(const struct hrtimer *timer) +{ + struct hrtimer_cpu_base *cpu_base; + unsigned int seq; + bool active; + + do { + active = false; + cpu_base = READ_ONCE(timer->base->cpu_base); + seq = raw_read_seqcount(&cpu_base->seq); + + if (timer->state != HRTIMER_STATE_INACTIVE || + cpu_base->running == timer) + active = true; + + } while (read_seqcount_retry(&cpu_base->seq, seq) || + cpu_base != READ_ONCE(timer->base->cpu_base)); + + return active; +} +EXPORT_SYMBOL_GPL(hrtimer_active); + +/* + * __run_hrtimer() hrtimer_active() + * + * [S] cpu_base->running = timer [R] timer->base->cpu_base + * [S] seq++ [R] seq + * WMB RMB + * [S] timer->state = INACTIVE + * [R] timer->state + * fn(); [R] cpu_base->running + * + * [S] timer->state = ENQUEUED (optional) + * WMB RMB + * [S] seq++ [R] seq + * [S] cpu_base->running = NULL [R] timer->base->cpu_base + * + * + * The WMBs+seq on the __run_hrtimer() split the thing into 3 distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * Note: this is unusual seqlock usage in that we do not have the odd/even + * thing, all we really care about is both reads happening in the same section + * of __run_hrtimer(). + * + * Note: both seq increments are not fully store ordered and this is fine, if + * for example the first seq increment is observed before the ->running store + * the section edge shifts slightly, but its a safe shift because here ->state + * is still ENQUEUED. + */ + static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now) @@ -1121,10 +1189,17 @@ static void __run_hrtimer(struct hrtimer enum hrtimer_restart (*fn)(struct hrtimer *); int restart; - WARN_ON(!irqs_disabled()); + lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + cpu_base->running = timer; + + /* + * separate the ->running assignment from the ->state assignment + */ + raw_write_seqcount_begin(&cpu_base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1140,7 +1215,7 @@ static void __run_hrtimer(struct hrtimer raw_spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * Note: We clear the running state after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * @@ -1152,9 +1227,13 @@ static void __run_hrtimer(struct hrtimer !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + /* + * separate the ->running assignment from the ->state assignment + */ + raw_write_seqcount_end(&cpu_base->seq); - timer->state &= ~HRTIMER_STATE_CALLBACK; + WARN_ON_ONCE(cpu_base->running != timer); + cpu_base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) @@ -1523,11 +1602,10 @@ static void migrate_hrtimer_list(struct * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. + * + * Sets timer->state = HRTIMER_STATE_ENQUEUED. */ enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; } } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/