Currently we switch to the stable sched_clock path as soon as we guess that the TSC is usable, and then switch back to the unstable path if, during SMP bringup, the TSC turns out not to be stable after all. Delay switching to the stable path until after SMP bringup is complete. This way we avoid switching during the window in which we detect the worst of the TSC offences. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> --- include/linux/sched.h | 5 +++++ init/main.c | 1 - kernel/sched/clock.c | 50 ++++++++++++++++++++++---------------------------- kernel/sched/core.c | 4 ++++ 4 files changed, 31 insertions(+), 29 deletions(-) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2499,6 +2499,10 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init_late(void) +{ +} + static inline void sched_clock_tick(void) { } @@ -2521,6 +2525,7 @@ static inline u64 local_clock(void) return sched_clock(); } #else +extern void sched_clock_init_late(void); /* * Architectures can set this to 1 if they have specified * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, --- a/init/main.c +++ b/init/main.c @@ -616,7 +616,6 @@ asmlinkage __visible void __init start_k numa_policy_init(); if (late_time_init) late_time_init(); - sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -77,6 +77,11 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); static int __sched_clock_stable_early; @@ -96,12 +101,18 @@ void set_sched_clock_stable(void) { __sched_clock_stable_early = 1; - smp_mb(); /* matches sched_clock_init() */ - - if (!sched_clock_running) - return; + smp_mb(); /* matches sched_clock_init_late() */ - __set_sched_clock_stable(); + /* + * This 
really should only be called early (before + * sched_clock_init_late()) when guestimating our sched_clock() is + * solid. + * + * After that we test stability and we can negate our guess using + * clear_sched_clock_stable, possibly from a watchdog. + */ + if (WARN_ON_ONCE(sched_clock_running == 2)) + __set_sched_clock_stable(); } static void __clear_sched_clock_stable(struct work_struct *work) @@ -117,12 +128,10 @@ void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; - smp_mb(); /* matches sched_clock_init() */ - - if (!sched_clock_running) - return; + smp_mb(); /* matches sched_clock_init_late() */ - schedule_work(&sched_clock_work); + if (sched_clock_running == 2) + schedule_work(&sched_clock_work); } struct sched_clock_data { @@ -143,20 +152,9 @@ static inline struct sched_clock_data *c return &per_cpu(sched_clock_data, cpu); } -void sched_clock_init(void) +void sched_clock_init_late(void) { - u64 ktime_now = ktime_to_ns(ktime_get()); - int cpu; - - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } - - sched_clock_running = 1; + sched_clock_running = 2; /* * Ensure that it is impossible to not do a static_key update. 
@@ -362,11 +360,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeu #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - u64 sched_clock_cpu(int cpu) { if (unlikely(!sched_clock_running)) @@ -374,6 +367,7 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7486,6 +7486,7 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); + sched_clock_init_late(); sched_smp_initialized = true; } @@ -7501,6 +7502,7 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); + sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -7544,6 +7546,8 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; + sched_clock_init(); + for (i = 0; i < WAIT_TABLE_SIZE; i++) init_waitqueue_head(bit_wait_table + i);