linux-kernel - Re: [boot crash] Re: [tip:x86/mce3] x86, mce: use 64bit machine check code on 32bit

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87ljlismux.fsf@basil.nowhere.org>
Date:	Mon, 17 Aug 2009 13:25:58 +0200
From:	Andi Kleen <andi@...stfloor.org>
To:	Ingo Molnar <mingo@...e.hu>
Cc:	linux-kernel@...r.kernel.org, mingo@...hat.com, hpa@...or.com,
	seto.hidetoshi@...fujitsu.com, ak@...ux.intel.com,
	tglx@...utronix.de, Yinghai Lu <yinghai@...nel.org>,
	Huang@...stfloor.org, "Ying <ying"@firstfloor.org,
	linux-tip-commits@...r.kernel.org
Subject: Re: [boot crash] Re: [tip:x86/mce3] x86, mce: use 64bit machine check code on 32bit

Ingo Molnar <mingo@...e.hu> writes:

Weird, the original mail didn't make it through; I only saw the replies.

>> all quirks.
>
> This commit causes a new regression, it broke the bootup on one of 
> my -tip testsystems, an older, Pentium-M based HP laptop (HP 
> OmniBook 6000 EA).
>
> The symptom is that the bootup hard-hangs after MCE init:
>
>  [    0.022996] Mount-cache hash table entries: 512
>  [    0.024996] Initializing cgroup subsys debug
>  [    0.025996] Initializing cgroup subsys cpuacct
>  [    0.026995] Initializing cgroup subsys devices
>  [    0.027995] Initializing cgroup subsys freezer
>  [    0.028995] mce: CPU supports 5 MCE banks

Thanks for testing. 

I assume the system boots with CONFIG_X86_NEW_MCE disabled and machine checks
enabled, correct? That is, you never booted with mce=off or a similar option
on older kernels.

First, please test with the patch I posted at

http://article.gmane.org/gmane.linux.kernel/875563

I don't see that one in tip. 

If that doesn't help, please boot with the appended debug patch and post the console
log again; then we will hopefully see where it hangs.

-Andi

commit 09f099eafbff70ecf55f7f111d2fb497ddb9a915
Author: Andi Kleen <ak@...ux.intel.com>
Date:   Mon Aug 17 13:15:50 2009 +0200

    Debug patch: trace mce init
    
    Signed-off-by: Andi Kleen <ak@...ux.intel.com>

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1cfb623..bfaed40 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -45,6 +45,8 @@
 
 #include "mce-internal.h"
 
+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 {
@@ -1196,6 +1198,8 @@ static int mce_cap_init(void)
 	if (cap & MCG_SER_P)
 		mce_ser = 1;
 
+	D;
+
 	return 0;
 }
 
@@ -1209,20 +1213,30 @@ static void mce_init(void)
 	 * Log the machine checks left over from the previous reset.
 	 */
 	bitmap_fill(all_banks, MAX_NR_BANKS);
+	D;
 	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
 
+	D;
+
 	set_in_cr4(X86_CR4_MCE);
 
+	D;
+
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	D;
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+	D;
 
 	for (i = 0; i < banks; i++) {
 		if (skip_bank_init(i))
 			continue;
+		printk("init bank %d\n", i);
 		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
+
+	D;
 }
 
 /* Add per CPU specific workarounds here */
@@ -1319,9 +1333,12 @@ static void mce_init_timer(void)
 	*n = check_interval * HZ;
 	if (!*n)
 		return;
+
+	D;
 	setup_timer(t, mcheck_timer, smp_processor_id());
 	t->expires = round_jiffies(jiffies + *n);
 	add_timer_on(t, smp_processor_id());
+	D;
 }
 
 /*
@@ -1340,15 +1357,21 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 
 	if (mce_cap_init() < 0) {
 		mce_disabled = 1;
+		D;
 		return;
 	}
+	D;
 	mce_cpu_quirks(c);
+	D;
 
 	machine_check_vector = do_machine_check;
 
 	mce_init();
+	D;
 	mce_cpu_features(c);
+	D;
 	mce_init_timer();
+	D;
 	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0..0d6aeab 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -13,6 +13,8 @@
 #include <asm/msr.h>
 #include <asm/mce.h>
 
+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
 /*
  * Support for Intel Correct Machine Check Interrupts. This allows
  * the CPU to raise an interrupt when a corrected machine check happened.
@@ -207,6 +209,8 @@ static void intel_init_cmci(void)
 	if (!cmci_supported(&banks))
 		return;
 
+	D;
+
 	mce_threshold_vector = intel_threshold_interrupt;
 	cmci_discover(banks, 1);
 	/*
@@ -217,10 +221,15 @@ static void intel_init_cmci(void)
 	 */
 	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
 	cmci_recheck();
+
+	D;
 }
 
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
+	D;
 	intel_init_thermal(c);
+	D;
 	intel_init_cmci();
+	D;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index bff8dd1..b4c6ca0 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -31,6 +31,8 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 
+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL		(300 * HZ)
 
@@ -236,10 +238,14 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	int tm2 = 0;
 	u32 l, h;
 
+	D;
+
 	/* Thermal monitoring depends on ACPI and clock modulation*/
 	if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
 		return;
 
+	D;
+
 	/*
 	 * First check if its enabled already, in which case there might
 	 * be some SMM goo which handles it, so we can't even put a handler
@@ -253,6 +259,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 		return;
 	}
 
+	D;
+
+
 	if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
 		tm2 = 1;
 
@@ -264,6 +273,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 		return;
 	}
 
+	D;
+
 	/* We'll mask the thermal vector in the lapic till we're ready: */
 	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
 	apic_write(APIC_LVTTHMR, h);
@@ -286,4 +297,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 
 	/* enable thermal throttle processing */
 	atomic_set(&therm_throt_en, 1);
+
+	D;
 }


-- 
ak@...ux.intel.com -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/