[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87ljlismux.fsf@basil.nowhere.org>
Date: Mon, 17 Aug 2009 13:25:58 +0200
From: Andi Kleen <andi@...stfloor.org>
To: Ingo Molnar <mingo@...e.hu>
Cc: linux-kernel@...r.kernel.org, mingo@...hat.com, hpa@...or.com,
seto.hidetoshi@...fujitsu.com, ak@...ux.intel.com,
tglx@...utronix.de, Yinghai Lu <yinghai@...nel.org>,
Huang@...stfloor.org, "Ying <ying"@firstfloor.org,
linux-tip-commits@...r.kernel.org
Subject: Re: [boot crash] Re: [tip:x86/mce3] x86, mce: use 64bit machine check code on 32bit
Ingo Molnar <mingo@...e.hu> writes:
Weird, the original mail didn't make it through; I only saw the replies.
>> all quirks.
>
> This commit causes a new regression, it broke the bootup on one of
> my -tip testsystems, an older, Pentium-M based HP laptop (HP
> OmniBook 6000 EA).
>
> The symptom is that the bootup hard-hangs after MCE init:
>
> [ 0.022996] Mount-cache hash table entries: 512
> [ 0.024996] Initializing cgroup subsys debug
> [ 0.025996] Initializing cgroup subsys cpuacct
> [ 0.026995] Initializing cgroup subsys devices
> [ 0.027995] Initializing cgroup subsys freezer
> [ 0.028995] mce: CPU supports 5 MCE banks
Thanks for testing.
I assume the system boots with CONFIG_X86_NEW_MCE disabled and machine checks
enabled, correct? That is, you never booted with mce=off or a similar option
on older kernels.
First, please test with the patch I posted at
http://article.gmane.org/gmane.linux.kernel/875563
I don't see that one in tip.
If that doesn't help, please boot with the appended debug patch and post the console
log again; then we will hopefully see where it hangs.
-Andi
commit 09f099eafbff70ecf55f7f111d2fb497ddb9a915
Author: Andi Kleen <ak@...ux.intel.com>
Date: Mon Aug 17 13:15:50 2009 +0200
Debug patch: trace mce init
Signed-off-by: Andi Kleen <ak@...ux.intel.com>
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1cfb623..bfaed40 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -45,6 +45,8 @@
#include "mce-internal.h"
+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
@@ -1196,6 +1198,8 @@ static int mce_cap_init(void)
if (cap & MCG_SER_P)
mce_ser = 1;
+ D;
+
return 0;
}
@@ -1209,20 +1213,30 @@ static void mce_init(void)
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
+ D;
machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+ D;
+
set_in_cr4(X86_CR4_MCE);
+ D;
+
rdmsrl(MSR_IA32_MCG_CAP, cap);
+ D;
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+ D;
for (i = 0; i < banks; i++) {
if (skip_bank_init(i))
continue;
+ printk("init bank %d\n", i);
wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
+
+ D;
}
/* Add per CPU specific workarounds here */
@@ -1319,9 +1333,12 @@ static void mce_init_timer(void)
*n = check_interval * HZ;
if (!*n)
return;
+
+ D;
setup_timer(t, mcheck_timer, smp_processor_id());
t->expires = round_jiffies(jiffies + *n);
add_timer_on(t, smp_processor_id());
+ D;
}
/*
@@ -1340,15 +1357,21 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
if (mce_cap_init() < 0) {
mce_disabled = 1;
+ D;
return;
}
+ D;
mce_cpu_quirks(c);
+ D;
machine_check_vector = do_machine_check;
mce_init();
+ D;
mce_cpu_features(c);
+ D;
mce_init_timer();
+ D;
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0..0d6aeab 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -13,6 +13,8 @@
#include <asm/msr.h>
#include <asm/mce.h>
+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
@@ -207,6 +209,8 @@ static void intel_init_cmci(void)
if (!cmci_supported(&banks))
return;
+ D;
+
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks, 1);
/*
@@ -217,10 +221,15 @@ static void intel_init_cmci(void)
*/
apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
cmci_recheck();
+
+ D;
}
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
+ D;
intel_init_thermal(c);
+ D;
intel_init_cmci();
+ D;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index bff8dd1..b4c6ca0 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -31,6 +31,8 @@
#include <asm/mce.h>
#include <asm/msr.h>
+#define D printk("%s:%d\n", __FILE__, __LINE__)
+
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
@@ -236,10 +238,14 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
int tm2 = 0;
u32 l, h;
+ D;
+
/* Thermal monitoring depends on ACPI and clock modulation*/
if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
return;
+ D;
+
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
@@ -253,6 +259,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
+ D;
+
+
if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
tm2 = 1;
@@ -264,6 +273,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
+ D;
+
/* We'll mask the thermal vector in the lapic till we're ready: */
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
apic_write(APIC_LVTTHMR, h);
@@ -286,4 +297,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
+
+ D;
}
--
ak@...ux.intel.com -- Speaking for myself only.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists