[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250624-wip-mca-updates-v4-14-236dd74f645f@amd.com>
Date: Tue, 24 Jun 2025 14:16:09 +0000
From: Yazen Ghannam <yazen.ghannam@....com>
To: <x86@...nel.org>, Tony Luck <tony.luck@...el.com>, "Rafael J. Wysocki"
<rafael@...nel.org>, Len Brown <lenb@...nel.org>
CC: <linux-kernel@...r.kernel.org>, <linux-edac@...r.kernel.org>,
<Smita.KoralahalliChannabasappa@....com>, Qiuxu Zhuo <qiuxu.zhuo@...el.com>,
<linux-acpi@...r.kernel.org>, Yazen Ghannam <yazen.ghannam@....com>
Subject: [PATCH v4 14/22] x86/mce: Separate global and per-CPU quirks
Many quirks are global configuration settings and a handful apply to
each CPU.
Move the per-CPU quirks to vendor init to execute them on each online
CPU. Set the global quirks during BSP-only init so they're only executed
once and early.
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@...el.com>
Tested-by: Tony Luck <tony.luck@...el.com>
Reviewed-by: Tony Luck <tony.luck@...el.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@....com>
---
Notes:
Link:
https://lore.kernel.org/r/20250415-wip-mca-updates-v3-10-8ffd9eb4aa56@amd.com
v3->v4:
* Add newline in mce_amd_feature_init().
* Remove __mcheck_cpu_apply_quirks().
* Update code comment ref. __mcheck_cpu_apply_quirks().
v2->v3:
* Update code comment.
* Add tags from Qiuxu and Tony.
v1->v2:
* New in v2.
arch/x86/kernel/cpu/mce/amd.c | 24 +++++++++++++
arch/x86/kernel/cpu/mce/core.c | 79 +++++++++++------------------------------
arch/x86/kernel/cpu/mce/intel.c | 18 ++++++++++
3 files changed, 62 insertions(+), 59 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index d55c4b5f5354..28dc36cdd137 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -648,6 +648,28 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
wrmsrq(MSR_K7_HWCR, hwcr);
}
+static void amd_apply_quirks(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -655,6 +677,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
u32 low = 0, high = 0, address = 0;
int offset = -1;
+ amd_apply_quirks(c);
+
mce_flags.amd_threshold = 1;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index a723c7886eae..4f6af060a395 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1807,8 +1807,9 @@ static void __mcheck_cpu_mce_banks_init(void)
struct mce_bank *b = &mce_banks[i];
/*
- * Init them all, __mcheck_cpu_apply_quirks() is going to apply
- * the required vendor quirks before
+ * Init them all by default.
+ *
+ * The required vendor quirks will be applied before
* __mcheck_cpu_init_prepare_banks() does the final bank setup.
*/
b->ctl = -1ULL;
@@ -1884,18 +1885,6 @@ static void __mcheck_cpu_init_prepare_banks(void)
static void apply_quirks_amd(struct cpuinfo_x86 *c)
{
- struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
-
- /* This should be disabled by the BIOS, but isn't always */
- if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
- }
-
if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
@@ -1904,13 +1893,6 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
mca_cfg.bootlog = 0;
}
- /*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
- */
- if (c->x86 == 6 && this_cpu_read(mce_num_banks))
- mce_banks[0].ctl = 0;
-
/*
* overflow_recov is supported for F15h Models 00h-0fh
* even though we don't have a CPUID bit for it.
@@ -1924,23 +1906,10 @@ static void apply_quirks_amd(struct cpuinfo_x86 *c)
static void apply_quirks_intel(struct cpuinfo_x86 *c)
{
- struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
-
/* Older CPUs (prior to family 6) don't need quirks. */
if (c->x86_vfm < INTEL_PENTIUM_PRO)
return;
- /*
- * SDM documents that on family 6 bank 0 should not be written
- * because it aliases to another special BIOS controlled
- * register.
- * But it's not aliased anymore on model 0x1a+
- * Don't ignore bank 0 completely because there could be a
- * valid event later, merely don't write CTL0.
- */
- if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
- mce_banks[0].init = false;
-
/*
* All newer Intel systems support MCE broadcasting. Enable
* synchronization with a one second timeout.
@@ -1978,29 +1947,6 @@ static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
}
}
-/* Add per CPU specific workarounds here */
-static void __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
-{
- struct mca_config *cfg = &mca_cfg;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- apply_quirks_amd(c);
- break;
- case X86_VENDOR_INTEL:
- apply_quirks_intel(c);
- break;
- case X86_VENDOR_ZHAOXIN:
- apply_quirks_zhaoxin(c);
- break;
- }
-
- if (cfg->monarch_timeout < 0)
- cfg->monarch_timeout = 0;
- if (cfg->bootlog != 0)
- cfg->panic_timeout = 30;
-}
-
static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
@@ -2259,6 +2205,23 @@ void mca_bsp_init(struct cpuinfo_x86 *c)
if (cap & MCG_SER_P)
mca_cfg.ser = 1;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ apply_quirks_amd(c);
+ break;
+ case X86_VENDOR_INTEL:
+ apply_quirks_intel(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ apply_quirks_zhaoxin(c);
+ break;
+ }
+
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = 0;
+ if (mca_cfg.bootlog != 0)
+ mca_cfg.panic_timeout = 30;
}
/*
@@ -2278,8 +2241,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_cap_init();
- __mcheck_cpu_apply_quirks(c);
-
if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index efcf21e9552e..ae9417d634ac 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -468,8 +468,26 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
}
}
+static void intel_apply_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ *
+ * Older CPUs (prior to family 6) can't reach this point and already
+ * return early due to the check of __mcheck_cpu_ancient_init().
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ this_cpu_ptr(mce_banks_array)[0].init = false;
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
+ intel_apply_quirks(c);
intel_init_cmci();
intel_init_lmce();
intel_imc_init(c);
--
2.49.0
Powered by blists - more mailing lists