lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1338836326.28766.3.camel@sbsiddha-desk.sc.intel.com>
Date:	Mon, 04 Jun 2012 11:58:46 -0700
From:	Suresh Siddha <suresh.b.siddha@...el.com>
To:	Fenghua Yu <fenghua.yu@...el.com>
Cc:	Ingo Molnar <mingo@...e.hu>, Thomas Gleixner <tglx@...utronix.de>,
	H Peter Anvin <hpa@...or.com>,
	Tony Luck <tony.luck@...el.com>,
	Asit K Mallick <asit.k.mallick@...el.com>,
	Arjan Dan De Ven <arjan@...ux.intel.com>,
	linux-kernel <linux-kernel@...r.kernel.org>,
	x86 <x86@...nel.org>, linux-pm <linux-pm@...r.kernel.org>
Subject: Re: [PATCH 3/6] x86/smpboot.c: Wake up offline CPU via mwait or nmi

On Mon, 2012-06-04 at 11:17 -0700, Fenghua Yu wrote:
> From: Fenghua Yu <fenghua.yu@...el.com>
> 
> wakeup_secondary_cpu_via_soft() is defined to wake up offline CPU via mwait if
> the CPU is in mwait or via nmi if the CPU is in hlt.
> 
> A CPU boots up by INIT, INIT, STARTUP sequence when it boots up for the first
> time during boot time or hot plug.

I think this breaks suspend/resume as the cpu state gets lost.

Have you tried suspend/resume?

> 
> Signed-off-by: Fenghua Yu <fenghua.yu@...el.com>
> ---
>  arch/x86/include/asm/apic.h |    5 +-
>  arch/x86/kernel/smpboot.c   |  187 ++++++++++++++++++++++++++++++++++++------
>  2 files changed, 164 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
> index eaff479..cad00b1 100644
> --- a/arch/x86/include/asm/apic.h
> +++ b/arch/x86/include/asm/apic.h
> @@ -425,7 +425,10 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[];
>  #ifdef CONFIG_SMP
>  extern atomic_t init_deasserted;
>  extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
> -#endif
> +extern int wakeup_secondary_cpu_via_soft(int apicid, unsigned long start_eip);
> +#else /* CONFIG_SMP */
> +#define wakeup_secondary_cpu_via_soft	NULL
> +#endif /* CONFIG_SMP */
>  
>  #ifdef CONFIG_X86_LOCAL_APIC
>  
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index fd019d7..109df30 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -472,13 +472,8 @@ void __inquire_remote_apic(int apicid)
>  	}
>  }
>  
> -/*
> - * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
> - * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
> - * won't ... remember to clear down the APIC, etc later.
> - */
> -int __cpuinit
> -wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> +static int __cpuinit
> +_wakeup_secondary_cpu_via_nmi(int apicid, int dest_mode)
>  {
>  	unsigned long send_status, accept_status = 0;
>  	int maxlvt;
> @@ -486,7 +481,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
>  	/* Target chip */
>  	/* Boot on the stack */
>  	/* Kick the second */
> -	apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
> +	apic_icr_write(APIC_DM_NMI | dest_mode, apicid);
>  
>  	pr_debug("Waiting for send to finish...\n");
>  	send_status = safe_apic_wait_icr_idle();
> @@ -511,6 +506,47 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
>  	return (send_status | accept_status);
>  }
>  
> +/*
> + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
> + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
> + * won't ... remember to clear down the APIC, etc later.
> + */
> +int __cpuinit
> +wakeup_secondary_cpu_via_nmi_phys(int phys_apicid, unsigned long start_eip)
> +{
> +	return _wakeup_secondary_cpu_via_nmi(phys_apicid, APIC_DEST_PHYSICAL);
> +}
> +
> +int __cpuinit
> +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
> +{
> +	return _wakeup_secondary_cpu_via_nmi(logical_apicid, APIC_DEST_LOGICAL);
> +}
> +
> +DEFINE_PER_CPU(int, cpu_dead) = { 0 };
> +#define CPU_DEAD_TRIGGER	1
> +#define CPU_DEAD_MWAIT		2
> +#define CPU_DEAD_HLT		4
> +
> +static int wakeup_secondary_cpu_via_mwait(int cpu)
> +{
> +	per_cpu(cpu_dead, cpu) |= CPU_DEAD_TRIGGER;
> +	return 0;
> +}
> +
> +static int wakeup_cpu_nmi(unsigned int cmd, struct pt_regs *regs)
> +{
> +	int cpu = smp_processor_id();
> +	int *cpu_dead_ptr;
> +
> +	cpu_dead_ptr = &per_cpu(cpu_dead, cpu);
> +	if (!cpu_online(cpu) && (*cpu_dead_ptr & CPU_DEAD_HLT) &&
> +	    (*cpu_dead_ptr & CPU_DEAD_TRIGGER))
> +		return NMI_HANDLED;
> +
> +	return NMI_DONE;
> +}
> +
>  static int __cpuinit
>  wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
>  {
> @@ -626,6 +662,52 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
>  	return (send_status | accept_status);
>  }
>  
> +/*
> + * Kick a cpu.
> + *
> + * If the CPU is in mwait, wake it up by mwait method. Otherwise, if the CPU is
> + * in halt, wake it up by NMI. If none of above exists, wake it up by INIT boot
> + * APIC message.
> + *
> + * When the CPU first time boots up, i.e. cpu_dead is 0, it's waken up by INIT
> + * boot APIC message.
> + *
> + * At this point, the CPU should be in a fixed dead state. So we don't consider
> + * racy condition here.
> + */
> +int __cpuinit
> +wakeup_secondary_cpu_via_soft(int apicid, unsigned long start_eip)
> +{
> +	int cpu;
> +	int boot_error = 0;
> +	/* start_ip had better be page-aligned! */
> +	unsigned long start_ip = real_mode_header->trampoline_start;
> +
> +	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
> +		if (apicid == apic->cpu_present_to_apicid(cpu))
> +			break;
> +
> +	if (cpu >= nr_cpu_ids)
> +		return -EINVAL;
> +
> +	if (per_cpu(cpu_dead, cpu) & CPU_DEAD_MWAIT) {
> +		boot_error = wakeup_secondary_cpu_via_mwait(cpu);
> +	} else if (per_cpu(cpu_dead, cpu) & CPU_DEAD_HLT) {
> +		int *cpu_dead_ptr;
> +
> +		cpu_dead_ptr = &per_cpu(cpu_dead, cpu);
> +		*cpu_dead_ptr |= CPU_DEAD_TRIGGER;
> +
> +		boot_error = wakeup_secondary_cpu_via_nmi_phys(apicid,
> +								start_ip);
> +		if (boot_error)
> +			*cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
> +	} else
> +		boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
> +
> +	return boot_error;
> +}
> +
>  /* reduce the number of lines printed when booting a large cpu count system */
>  static void __cpuinit announce_cpu(int cpu, int apicid)
>  {
> @@ -778,6 +860,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
>  		 */
>  		smpboot_restore_warm_reset_vector();
>  	}
> +
>  	return boot_error;
>  }
>  
> @@ -977,6 +1060,20 @@ static void __init smp_cpu_index_default(void)
>  	}
>  }
>  
> +static bool mwait_supported(void)
> +{
> +	struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
> +
> +	if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
> +		return false;
> +	if (!this_cpu_has(X86_FEATURE_CLFLSH))
> +		return false;
> +	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * Prepare for SMP bootup.  The MP table or ACPI has been read
>   * earlier.  Just do some sanity checking here and enable APIC mode.
> @@ -1051,6 +1148,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
>  		uv_system_init();
>  
>  	set_mtrr_aps_delayed_init();
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +	if (!mwait_supported())
> +		register_nmi_handler(NMI_LOCAL, wakeup_cpu_nmi, 0, "wake_cpu");
> +#endif
>  out:
>  	preempt_enable();
>  }
> @@ -1111,6 +1213,12 @@ static int __init _setup_possible_cpus(char *str)
>  }
>  early_param("possible_cpus", _setup_possible_cpus);
>  
> +static int __init setup_wakeup_cpu_via_init(char *str)
> +{
> +	apic->wakeup_secondary_cpu = NULL;
> +	return 0;
> +}
> +__setup("wakeup_cpu_via_init", setup_wakeup_cpu_via_init);
>  
>  /*
>   * cpu_possible_mask should be static, it cannot change as cpu's
> @@ -1286,6 +1394,28 @@ void play_dead_common(void)
>  	local_irq_disable();
>  }
>  
> +static bool wakeup_cpu(int *trigger)
> +{
> +	unsigned int timeout;
> +
> +	/*
> +	 * Wait up to 1 seconds to check if CPU wakeup trigger is set in
> +	 * cpu_dead by either memory write or NMI.
> +	 * If there is no CPU wakeup trigger, go back to sleep.
> +	 */
> +	for (timeout = 0; timeout < 1000000; timeout++) {
> +		/*
> +		 * Check if CPU0 wakeup NMI is issued and handled.
> +		 */
> +		if (*trigger & CPU_DEAD_TRIGGER)
> +			return true;
> +
> +		udelay(1);
> +	}
> +
> +	return false;
> +}
> +
>  /*
>   * We need to flush the caches before going to sleep, lest we have
>   * dirty data in our caches when we come back up.
> @@ -1296,14 +1426,9 @@ static inline void mwait_play_dead(void)
>  	unsigned int highest_cstate = 0;
>  	unsigned int highest_subcstate = 0;
>  	int i;
> -	void *mwait_ptr;
> -	struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
> +	int *cpu_dead_ptr;
>  
> -	if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
> -		return;
> -	if (!this_cpu_has(X86_FEATURE_CLFLSH))
> -		return;
> -	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> +	if (!mwait_supported())
>  		return;
>  
>  	eax = CPUID_MWAIT_LEAF;
> @@ -1328,16 +1453,10 @@ static inline void mwait_play_dead(void)
>  			(highest_subcstate - 1);
>  	}
>  
> -	/*
> -	 * This should be a memory location in a cache line which is
> -	 * unlikely to be touched by other processors.  The actual
> -	 * content is immaterial as it is not actually modified in any way.
> -	 */
> -	mwait_ptr = &current_thread_info()->flags;
> -
> -	wbinvd();
> -
> +	cpu_dead_ptr = &per_cpu(cpu_dead, smp_processor_id());
> +	*cpu_dead_ptr = CPU_DEAD_MWAIT;
>  	while (1) {
> +		*cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
>  		/*
>  		 * The CLFLUSH is a workaround for erratum AAI65 for
>  		 * the Xeon 7400 series.  It's not clear it is actually
> @@ -1345,20 +1464,34 @@ static inline void mwait_play_dead(void)
>  		 * The WBINVD is insufficient due to the spurious-wakeup
>  		 * case where we return around the loop.
>  		 */
> -		clflush(mwait_ptr);
> -		__monitor(mwait_ptr, 0, 0);
> +		wbinvd();
> +		clflush(cpu_dead_ptr);
> +		__monitor(cpu_dead_ptr, 0, 0);
>  		mb();
> -		__mwait(eax, 0);
> +		if ((*cpu_dead_ptr & CPU_DEAD_TRIGGER) == 0)
> +			__mwait(eax, 0);
> +
> +		/* Waken up by another CPU. */
> +		if (wakeup_cpu(cpu_dead_ptr))
> +			start_cpu();
>  	}
>  }
>  
>  static inline void hlt_play_dead(void)
>  {
> +	int *cpu_dead_ptr;
> +
>  	if (__this_cpu_read(cpu_info.x86) >= 4)
>  		wbinvd();
>  
> +	cpu_dead_ptr = &per_cpu(cpu_dead, smp_processor_id());
> +	*cpu_dead_ptr = CPU_DEAD_HLT;
>  	while (1) {
> +		*cpu_dead_ptr &= ~CPU_DEAD_TRIGGER;
>  		native_halt();
> +		/* If NMI wants to wake up me, I'll start. */
> +		if (wakeup_cpu(cpu_dead_ptr))
> +			start_cpu();
>  	}
>  }
>  


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ