[PATCH -v2] x86: Make calibrate_delay run in parallel. On a 4096-CPU machine, we noticed that bringing up the CPUs took 318 seconds. By specifying lpj=, we reduced that to 75 seconds. Andi Kleen suggested we rework the calibrate_delay calls to run in parallel. -v2: from Yinghai: two paths, one for the initial boot CPUs and one for hotplug CPUs. Initial path: after all CPUs boot up and enter idle, use smp_call_function_many to let every AP call __calibrate_delay. We cannot put that calibrate_delay after local_irq_enable in start_secondary(): at that time the CPU could be involved with perf_event with nmi_watchdog enabled, which would cause strange calibration results. Add __calibrate_delay instead of changing calibrate_delay all over. Use cpu_calibrated_delay_mask instead... Use print_lpj to make the printed line complete. Signed-off-by: Robin Holt To: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Yinghai Lu --- arch/x86/include/asm/cpumask.h | 1 arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/smpboot.c | 58 ++++++++++++++++++++++++++++++++++------- include/linux/delay.h | 1 init/calibrate.c | 44 +++++++++++++++---------------- 5 files changed, 75 insertions(+), 31 deletions(-) -- Index: linux-2.6/arch/x86/include/asm/cpumask.h =================================================================== --- linux-2.6.orig/arch/x86/include/asm/cpumask.h +++ linux-2.6/arch/x86/include/asm/cpumask.h @@ -6,6 +6,7 @@ extern cpumask_var_t cpu_callin_mask; extern cpumask_var_t cpu_callout_mask; extern cpumask_var_t cpu_initialized_mask; +extern cpumask_var_t cpu_calibrated_delay_mask; extern cpumask_var_t cpu_sibling_setup_mask; extern void setup_cpu_local_masks(void); Index: linux-2.6/arch/x86/kernel/cpu/common.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/cpu/common.c +++ linux-2.6/arch/x86/kernel/cpu/common.c @@ -45,6 +45,7 @@ cpumask_var_t cpu_initialized_mask; cpumask_var_t cpu_callout_mask; cpumask_var_t 
cpu_callin_mask; +cpumask_var_t cpu_calibrated_delay_mask; /* representing cpus for which sibling maps can be computed */ cpumask_var_t cpu_sibling_setup_mask; @@ -55,6 +56,7 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_initialized_mask); alloc_bootmem_cpumask_var(&cpu_callin_mask); alloc_bootmem_cpumask_var(&cpu_callout_mask); + alloc_bootmem_cpumask_var(&cpu_calibrated_delay_mask); alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } Index: linux-2.6/arch/x86/kernel/smpboot.c =================================================================== --- linux-2.6.orig/arch/x86/kernel/smpboot.c +++ linux-2.6/arch/x86/kernel/smpboot.c @@ -52,6 +52,7 @@ #include #include +#include #include #include #include @@ -207,15 +208,7 @@ static void __cpuinit smp_callin(void) * Need to setup vector mappings before we enable interrupts. */ setup_vector_irq(smp_processor_id()); - /* - * Get our bogomips. - * - * Need to enable IRQs because it can take longer and then - * the NMI watchdog might kill us. 
- */ - local_irq_enable(); - calibrate_delay(); - local_irq_disable(); + pr_debug("Stack at about %p\n", &cpuid); /* @@ -1037,6 +1030,8 @@ void __init native_smp_prepare_cpus(unsi } set_cpu_sibling_map(0); + /* already called earlier for boot cpu */ + cpumask_set_cpu(0, cpu_calibrated_delay_mask); if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1125,8 +1120,53 @@ void __init native_smp_prepare_boot_cpu( per_cpu(cpu_state, me) = CPU_ONLINE; } +static void __cpuinit calibrate_delay_fn(void *info) +{ + int cpu = smp_processor_id(); + + cpu_data(cpu).loops_per_jiffy = __calibrate_delay(cpu, loops_per_jiffy); + cpumask_set_cpu(cpu, cpu_calibrated_delay_mask); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int __cpuinit +cal_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, calibrate_delay_fn, NULL, 1); + break; + } + + return NOTIFY_OK; +} + +static __cpuinitdata struct notifier_block __cpuinitdata cal_cpu_nfb = { + .notifier_call = cal_cpu_callback +}; + +static void __init register_cal_cpu_nfb(void) +{ + register_cpu_notifier(&cal_cpu_nfb); +} +#else +static void __init register_cal_cpu_nfb(void) +{ +} +#endif + void __init native_smp_cpus_done(unsigned int max_cpus) { + smp_call_function_many(cpu_online_mask, calibrate_delay_fn, NULL, 0); + while (cpumask_weight(cpu_calibrated_delay_mask) != num_online_cpus()) { + cpu_relax(); + touch_nmi_watchdog(); + } + register_cal_cpu_nfb(); + pr_debug("Boot done.\n"); impress_friends(); Index: linux-2.6/include/linux/delay.h =================================================================== --- linux-2.6.orig/include/linux/delay.h +++ linux-2.6/include/linux/delay.h @@ -43,6 +43,7 @@ static inline void ndelay(unsigned long extern unsigned long lpj_fine; void calibrate_delay(void); +unsigned long __calibrate_delay(int cpu, unsigned long lpj); void 
msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); void usleep_range(unsigned long min, unsigned long max); Index: linux-2.6/init/calibrate.c =================================================================== --- linux-2.6.orig/init/calibrate.c +++ linux-2.6/init/calibrate.c @@ -183,32 +183,32 @@ recalibrate: return lpj; } -void __cpuinit calibrate_delay(void) +static void __cpuinit print_lpj(int cpu, char *str, unsigned long lpj) { - static bool printed; + pr_info("CPU%d: Calibrating delay%s" + "%lu.%02lu BogoMIPS (lpj=%lu)\n", cpu, str, + lpj/(500000/HZ), (lpj/(5000/HZ)) % 100, lpj); +} +unsigned long __cpuinit __calibrate_delay(int cpu, unsigned long lpj) +{ if (preset_lpj) { - loops_per_jiffy = preset_lpj; - if (!printed) - pr_info("Calibrating delay loop (skipped) " - "preset value.. "); - } else if ((!printed) && lpj_fine) { - loops_per_jiffy = lpj_fine; - pr_info("Calibrating delay loop (skipped), " - "value calculated using timer frequency.. "); - } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - if (!printed) - pr_info("Calibrating delay using timer " - "specific routine.. "); + lpj = preset_lpj; + print_lpj(cpu, " loop (skipped) preset value..", lpj); + } else if ((cpu == 0) && lpj_fine) { + lpj = lpj_fine; + print_lpj(cpu, " loop (skipped), value calculated using timer frequency.. ", lpj); + } else if ((lpj = calibrate_delay_direct()) != 0) { + print_lpj(cpu, " using timer specific routine.. ", lpj); } else { - if (!printed) - pr_info("Calibrating delay loop... "); - loops_per_jiffy = calibrate_delay_converge(); + lpj = calibrate_delay_converge(); + print_lpj(cpu, " loop ... ", lpj); } - if (!printed) - pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); - printed = true; + return lpj; +} + +void __cpuinit calibrate_delay(void) +{ + loops_per_jiffy = __calibrate_delay(0, loops_per_jiffy); }