From: Shaohui Zheng When hot-plugging a CPU with the emulator, we use a logical CPU to emulate the CPU hotplug process. On CPUs that support SMT, some logical CPUs are in the same socket, but with the emulator they may be located in different NUMA nodes. This misleads the scheduler into building an incorrect scheduling-domain hierarchy, and it causes the following call trace when the scheduling domains are rebalanced: divide error: 0000 [#1] SMP last sysfs file: /sys/devices/system/cpu/cpu8/online CPU 0 Modules linked in: fbcon tileblit font bitblit softcursor radeon ttm drm_kms_helper e1000e usbhid via_rhine mii drm i2c_algo_bit igb dca Pid: 0, comm: swapper Not tainted 2.6.32hpe #78 X8DTN RIP: 0010:[] [] find_busiest_group+0x6c5/0xa10 RSP: 0018:ffff880028203c30 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000015ac0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff880277e8cfa0 RDI: 0000000000000000 RBP: ffff880028203dc0 R08: ffff880277e8cfa0 R09: 0000000000000040 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff880028200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 00007f16cfc85770 CR3: 0000000001001000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffffffff81822000, task ffffffff8184a600) Stack: ffff880028203d60 ffff880028203cd0 ffff8801c204ff08 ffff880028203e38 <0> 0101ffff81018c59 ffff880028203e44 00000001810806bd ffff8801c204fe00 <0> 0000000528200000 ffffffff00000000 0000000000000018 0000000000015ac0 Call Trace: [] ? tick_dev_program_event+0x40/0xd0 [] rebalance_domains+0x17c/0x570 [] ? read_tsc+0x9/0x20 [] ? tick_dev_program_event+0x40/0xd0 [] run_rebalance_domains+0xbd/0xf0 [] __do_softirq+0xaf/0x1e0 [] ? 
handle_IRQ_event+0x58/0x160 [] call_softirq+0x1c/0x30 [] do_softirq+0x65/0xa0 [] irq_exit+0x7d/0x90 [] do_IRQ+0x70/0xe0 [] ret_from_intr+0x0/0x11 [] ? acpi_idle_enter_bm+0x281/0x2b5 [] ? acpi_idle_enter_bm+0x27a/0x2b5 [] ? cpuidle_idle_call+0x9f/0x130 [] ? cpu_idle+0xab/0x100 [] ? rest_init+0x66/0x70 [] ? start_kernel+0x3e3/0x3ef [] ? x86_64_start_reservations+0x125/0x129 [] ? x86_64_start_kernel+0xfa/0x109 Code: 00 00 e9 4c fb ff ff 0f 1f 80 00 00 00 00 48 8b b5 d8 fe ff ff 48 8b 45 a8 4d 29 ef 8b 56 08 48 c1 e0 0a 49 89 f0 48 89 d7 31 d2 <48> f7 f7 31 d2 48 89 45 a0 8b 76 08 4c 89 f0 48 c1 e0 0a 48 f7 RIP [] find_busiest_group+0x6c5/0xa10 RSP Solution: We put the logical CPU into a fake CPU socket and assign it a unique phys_proc_id. Each fake socket contains only that one logical CPU. This fixes the above bug. CC: Sam Ravnborg Signed-off-by: Haicheng Li Signed-off-by: Shaohui Zheng --- Index: linux-hpe4/arch/x86/include/asm/processor.h =================================================================== --- linux-hpe4.orig/arch/x86/include/asm/processor.h 2010-11-17 09:00:51.354100239 +0800 +++ linux-hpe4/arch/x86/include/asm/processor.h 2010-11-17 09:01:10.222837594 +0800 @@ -113,6 +113,15 @@ /* Index into per_cpu list: */ u16 cpu_index; #endif + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + /* + * Use a logical cpu to emulate a physical cpu's hotplug. We put the + * logical cpu into a fake socket, assign a fake physical id to it, + * and create a fake core. 
+ */ + __u8 cpu_probe_on; /* A flag to enable cpu probe/release */ +#endif } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 Index: linux-hpe4/arch/x86/kernel/smpboot.c =================================================================== --- linux-hpe4.orig/arch/x86/kernel/smpboot.c 2010-11-17 09:01:10.202837209 +0800 +++ linux-hpe4/arch/x86/kernel/smpboot.c 2010-11-17 09:01:10.222837594 +0800 @@ -97,6 +97,7 @@ */ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE void cpu_hotplug_driver_lock() { mutex_lock(&x86_cpu_hotplug_driver_mutex); @@ -106,6 +107,7 @@ { mutex_unlock(&x86_cpu_hotplug_driver_mutex); } +#endif #else static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; @@ -198,6 +200,8 @@ { int cpuid, phys_id; unsigned long timeout; + u8 cpu_probe_on = 0; + struct cpuinfo_x86 *c; /* * If waken up by an INIT in an 82489DX configuration @@ -277,7 +281,20 @@ /* * Save our processor parameters */ + c = &cpu_data(cpuid); +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + cpu_probe_on = c->cpu_probe_on; + phys_id = c->phys_proc_id; +#endif + smp_store_cpu_info(cpuid); +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + if (cpu_probe_on) { + c->phys_proc_id = phys_id; /* restore the fake phys_proc_id */ + c->cpu_core_id = 0; /* force the logical cpu to core 0 */ + c->cpu_probe_on = cpu_probe_on; + } +#endif notify_cpu_starting(cpuid); @@ -400,6 +417,11 @@ { int i; struct cpuinfo_x86 *c = &cpu_data(cpu); + int cpu_probe_on = 0; + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + cpu_probe_on = c->cpu_probe_on; +#endif cpumask_set_cpu(cpu, cpu_sibling_setup_mask); @@ -431,7 +453,8 @@ for_each_cpu(i, cpu_sibling_setup_mask) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && + cpu_probe_on == 0) { cpumask_set_cpu(i, c->llc_shared_map); cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); } Index: 
linux-hpe4/arch/x86/kernel/topology.c =================================================================== --- linux-hpe4.orig/arch/x86/kernel/topology.c 2010-11-17 09:01:10.202837209 +0800 +++ linux-hpe4/arch/x86/kernel/topology.c 2010-11-17 09:01:10.222837594 +0800 @@ -70,6 +70,36 @@ } EXPORT_SYMBOL(arch_unregister_cpu); +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE +/* + * Put the logical cpu into a new socket, and encapsulate it into core 0. + */ +static void fake_cpu_socket_info(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + int i, phys_id = 0; + + /* calculate the max phys_id */ + for_each_present_cpu(i) { + struct cpuinfo_x86 *c = &cpu_data(i); + if (phys_id < c->phys_proc_id) + phys_id = c->phys_proc_id; + } + + c->phys_proc_id = phys_id + 1; /* pick an unused phys_proc_id */ + c->cpu_core_id = 0; /* always put the logical cpu to core 0 */ + c->cpu_probe_on = 1; +} + +static void clear_cpu_socket_info(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + c->phys_proc_id = 0; + c->cpu_core_id = 0; + c->cpu_probe_on = 0; +} + + ssize_t arch_cpu_probe(const char *buf, size_t count) { int nid = 0; @@ -109,6 +139,7 @@ /* register cpu */ arch_register_cpu_node(selected, nid); acpi_map_lsapic_emu(selected, nid); + fake_cpu_socket_info(selected); return count; } @@ -132,10 +163,13 @@ arch_unregister_cpu(cpu); acpi_unmap_lsapic(cpu); + clear_cpu_socket_info(cpu); + set_cpu_present(cpu, true); return count; } EXPORT_SYMBOL(arch_cpu_release); +#endif CONFIG_ARCH_CPU_PROBE_RELEASE #else /* CONFIG_HOTPLUG_CPU */ -- Thanks & Regards, Shaohui -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/