[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20251009132055.GA472268@yaz-khff2.amd.com>
Date: Thu, 9 Oct 2025 09:20:55 -0400
From: Yazen Ghannam <yazen.ghannam@....com>
To: Bert Karwatzki <spasswolf@....de>
Cc: Nikolay Borisov <nik.borisov@...e.com>, Borislav Petkov <bp@...en8.de>,
Tony Luck <tony.luck@...el.com>, linux-kernel@...r.kernel.org,
linux-next@...r.kernel.org, linux-edac@...r.kernel.org,
linux-acpi@...r.kernel.org, x86@...nel.org, rafael@...nel.org,
qiuxu.zhuo@...el.com, Smita.KoralahalliChannabasappa@....com
Subject: Re: spurious mce Hardware Error messages in next-20250912
On Fri, Sep 19, 2025 at 12:07:15AM +0200, Bert Karwatzki wrote:
> Am Donnerstag, dem 18.09.2025 um 17:00 -0400 schrieb Yazen Ghannam:
>
[...]
>
> [ 333.337523] [ C0] mce: DEBUG: CPU0 Bank:11 Status:0x8724aa0800000000
> [ 333.337532] [ C0] mce: DEBUG: CPU0 Bank:14 Status:0x8724a98800000000
Thanks Bert for gathering the data.
We still don't have a system that shows this behavior. But I was able to
simulate it by manually writing the register values.
Can you please try the patch below?
This adds additional checks to ignore invalid values. And it addresses
feedback from Nikolay about clearing status registers later.
If this works for you, then I can squash this into another revision of
the patch set.
Thanks,
Yazen
From 11cdf1e18faa343c1786f6ac47f663937252c4d1 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@....com>
Date: Mon, 22 Sep 2025 20:26:06 +0000
Subject: [PATCH] x86/mce: Rework DFR handling flow
Add a flag to poll for Deferred errors similar to MCP_UC for
uncorrectable errors. This will do checks specific to deferred errors
and fall back to the common UC/CE checks otherwise.
Also, clear the MCA_DESTAT register at the end of the handler rather
than the beginning.
Signed-off-by: Yazen Ghannam <yazen.ghannam@....com>
---
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mce/amd.c | 13 ++++++++----
arch/x86/kernel/cpu/mce/core.c | 36 ++++++++++++++++++++--------------
3 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1cfbfff0be3f..9652fc11860d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -299,6 +299,7 @@ enum mcp_flags {
MCP_TIMESTAMP = BIT(0), /* log time stamp */
MCP_UC = BIT(1), /* log uncorrected errors */
MCP_QUEUE_LOG = BIT(2), /* only queue to genpool */
+ MCP_DFR = BIT(3), /* log deferred errors */
};
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 9b746080351f..83fad4503b1c 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -839,7 +839,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
/* APIC interrupt handler for deferred errors */
static void amd_deferred_error_interrupt(void)
{
- machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
+ machine_check_poll(MCP_TIMESTAMP | MCP_DFR, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
}
void mce_amd_handle_storm(unsigned int bank, bool on)
@@ -865,10 +865,15 @@ void amd_clear_bank(struct mce *m)
{
amd_reset_thr_limit(m->bank);
- if (m->kflags & MCE_CHECK_DFR_REGS)
+ /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
+ if (m->status & MCI_STATUS_DEFERRED)
mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
- else
- mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+
+ /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ return;
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
}
/*
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e2d51609d2cb..960efee4be3e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -731,27 +731,26 @@ static bool smca_should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *
struct mce *m = &err->m;
/*
- * If this is a deferred error found in MCA_STATUS, then clear
- * the redundant data from the MCA_DESTAT register.
+ * If the MCA_STATUS register has a deferred error, then continue using it as
+ * the status register.
+ *
+ * MCA_DESTAT will be cleared at the end of the handler.
*/
- if (m->status & MCI_STATUS_VAL) {
- if (m->status & MCI_STATUS_DEFERRED)
- mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
-
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED))
return true;
- }
/*
- * If the MCA_DESTAT register has valid data, then use
- * it as the status register.
+ * If the MCA_DESTAT register has a deferred error, then use it instead.
+ *
+ * MCA_STATUS will not be cleared at the end of the handler.
*/
m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+ m->kflags |= MCE_CHECK_DFR_REGS;
+ return true;
+ }
- if (!(m->status & MCI_STATUS_VAL))
- return false;
-
- m->kflags |= MCE_CHECK_DFR_REGS;
- return true;
+ return false;
}
/*
@@ -780,13 +779,17 @@ static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
{
struct mce *m = &err->m;
- if (mce_flags.smca)
+ if (flags & MCP_DFR)
return smca_should_log_poll_error(flags, err);
/* If this entry is not valid, ignore it. */
if (!(m->status & MCI_STATUS_VAL))
return false;
+ /* Ignore deferred errors if not looking for them (MCP_DFR not set). */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return false;
+
/*
* If we are logging everything (at CPU online) or this
* is a corrected error, then we must log it.
@@ -1924,6 +1927,9 @@ static void __mcheck_cpu_init_prepare_banks(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
+
+ if (mce_flags.smca)
+ machine_check_poll(MCP_DFR | MCP_QUEUE_LOG, &all_banks);
}
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
--
2.51.0
Powered by blists - more mailing lists