[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YKtbBXZGpVZS1M4R@zn.tnic>
Date: Mon, 24 May 2021 09:51:33 +0200
From: Borislav Petkov <bp@...e.de>
To: James Feeney <james@...ealm.net>
Cc: linux-smp@...r.kernel.org, Jens Axboe <axboe@...nel.dk>,
lkml <linux-kernel@...r.kernel.org>
Subject: Re: linux 5.12 - fails to boot - soft lockup - CPU#0 stuck for 23s!
- RIP smp_call_function_single
On Sun, May 23, 2021 at 05:02:01PM -0600, James Feeney wrote:
> Ha! Yes, your patch *is* the culprit. You don't trust git bisect?
Well, git-bisect can very easily veer off into the fields if the
bisector makes a mistake. I'm not saying you have made a mistake but I
have made that mistake a bunch of times and have seen others do it too
so it is very easy to get lost.
And with my patch simply moving the LVT THMR read back in the boot
order, I was very sceptical: how can that even be?!
But...
> "lvtthmr_init: 0x200" != "lvtthmr_init: 0x10200" != "lvtthmr_init: 0x10000"
... *this* is a good catch, I *think* I know what happens and the next
patch will confirm my theory, see the end of the mail. Remove all diffs
you have on top of your 5.12 kernel, apply the one below, do the exact
same exercise with it and send me one full dmesg pls.
> System Management is *hard*, because it must build upon someone else's
> undocumented buggy software. Thank Intel.
No, SMM is shit. Complain to the OEMs about it.
However, this time I think it is *we* who are not doing stuff as
correctly as we should but let's see your results first.
Thx.
---
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ddfb3cad8dff..5ac8b827bc12 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -296,6 +296,12 @@ struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+extern void mcheck_intel_therm_init(void);
+#else
+static inline void mcheck_intel_therm_init(void) { }
+#endif
+
/*
* Enumerate new IP types and HWID values in AMD processors which support
* Scalable MCA.
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4a39fb429f15..f21009786877 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1162,6 +1162,10 @@ void clear_local_APIC(void)
#ifdef CONFIG_X86_THERMAL_VECTOR
if (maxlvt >= 5) {
v = apic_read(APIC_LVTTHMR);
+
+ pr_info("%s: CPU%d, maxlvt: %d, APIC_LVTTHMR: 0x%x, masking LVT\n",
+ __func__, smp_processor_id(), maxlvt, v);
+
apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
}
#endif
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index bf7fe87a7e88..ded20b8612fe 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2190,6 +2190,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
+ mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c
index f8e882592ba5..0ebd2386839f 100644
--- a/drivers/thermal/intel/therm_throt.c
+++ b/drivers/thermal/intel/therm_throt.c
@@ -621,19 +621,30 @@ bool x86_thermal_enabled(void)
return atomic_read(&therm_throt_en);
}
+void __init mcheck_intel_therm_init(void)
+{
+ /*
+ * This function is only called on boot CPU. Save the init thermal
+ * LVT value on BSP and use that value to restore APs' thermal LVT
+ * entry BIOS programmed later
+ */
+ if (intel_thermal_supported(&boot_cpu_data)) {
+ lvtthmr_init = apic_read(APIC_LVTTHMR);
+ pr_info("%s: lvtthmr_init: 0x%x\n", __func__, lvtthmr_init);
+ } else {
+ pr_info("%s: !intel_thermal_supported\n", __func__);
+ }
+}
+
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
int tm2 = 0;
- u32 l, h;
+ u32 l, h, tmp = -1;
if (!intel_thermal_supported(c))
return;
- /* On the BSP? */
- if (c == &boot_cpu_data)
- lvtthmr_init = apic_read(APIC_LVTTHMR);
-
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
@@ -652,13 +663,17 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
* BIOS has programmed on AP based on BSP's info we saved since BIOS
* is always setting the same value for all threads/cores.
*/
- if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
+ if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) {
apic_write(APIC_LVTTHMR, lvtthmr_init);
+ tmp = apic_read(APIC_LVTTHMR);
+ }
+ pr_info("%s: CPU%d, lvtthmr_init: 0x%x, read: 0x%x, misc_enable (low): 0x%x\n",
+ __func__, cpu, lvtthmr_init, tmp, l);
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
if (system_state == SYSTEM_BOOTING)
- pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
+ pr_info("CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
--
Regards/Gruss,
Boris.
SUSE Software Solutions Germany GmbH, GF: Felix Imendörffer, HRB 36809, AG Nürnberg
Powered by blists - more mailing lists