[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240214160512.GDZczkuNJf_VVZxNeI@fat_crate.local>
Date: Wed, 14 Feb 2024 17:05:12 +0100
From: Borislav Petkov <bp@...en8.de>
To: Yazen Ghannam <yazen.ghannam@....com>
Cc: tony.luck@...el.com, linux-edac@...r.kernel.org,
linux-kernel@...r.kernel.org, avadhut.naik@....com,
john.allen@....com, muralidhara.mk@....com,
naveenkrishna.chatradhi@....com, sathyapriya.k@....com
Subject: Re: [PATCH 1/2] RAS/AMD/ATL, EDAC/amd64: Move MI300 Row Retirement
to ATL
On Wed, Feb 14, 2024 at 09:19:13AM -0500, Yazen Ghannam wrote:
> Yes, that's fine.
Easy peasy:
>From 98ecd3942837df907fbf9ceff7e23f55e55e40b2 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@....com>
Date: Tue, 13 Feb 2024 21:35:15 -0600
Subject: [PATCH] RAS/AMD/ATL: Add MI300 row retirement support
DRAM row retirement depends on model-specific information that is best
done within the AMD Address Translation Library.
Export a generic wrapper function for other modules to use. Add any
model-specific helpers here.
Signed-off-by: Yazen Ghannam <yazen.ghannam@....com>
Signed-off-by: Borislav Petkov (AMD) <bp@...en8.de>
Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com
---
drivers/ras/amd/atl/Kconfig | 1 +
drivers/ras/amd/atl/umc.c | 51 +++++++++++++++++++++++++++++++++++++
include/linux/ras.h | 2 ++
3 files changed, 54 insertions(+)
diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig
index a43513a700f1..df49c23e7f62 100644
--- a/drivers/ras/amd/atl/Kconfig
+++ b/drivers/ras/amd/atl/Kconfig
@@ -10,6 +10,7 @@
config AMD_ATL
tristate "AMD Address Translation Library"
depends on AMD_NB && X86_64 && RAS
+ depends on MEMORY_FAILURE
default N
help
This library includes support for implementation-specific
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 7e310d1dfcfc..08c6dbd44c62 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
return addr;
}
+/*
+ * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
+ * all memory within that DRAM row. This applies to the memory with a DRAM
+ * bank.
+ *
+ * To find the memory addresses, loop through permutations of the DRAM column
+ * bits and find the System Physical address of each. The column bits are used
+ * to calculate the intermediate Normalized address, so all permutations should
+ * be checked.
+ *
+ * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
+ */
+#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
+static void retire_row_mi300(struct atl_err *a_err)
+{
+ unsigned long addr;
+ struct page *p;
+ u8 col;
+
+ for (col = 0; col < MI300_NUM_COL; col++) {
+ a_err->addr &= ~MI300_UMC_MCA_COL;
+ a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
+
+ addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
+ if (IS_ERR_VALUE(addr))
+ continue;
+
+ addr = PHYS_PFN(addr);
+
+ /*
+ * Skip invalid or already poisoned pages to avoid unnecessary
+ * error messages from memory_failure().
+ */
+ p = pfn_to_online_page(addr);
+ if (!p)
+ continue;
+
+ if (PageHWPoison(p))
+ continue;
+
+ memory_failure(addr, 0);
+ }
+}
+
+void amd_retire_dram_row(struct atl_err *a_err)
+{
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+ return retire_row_mi300(a_err);
+}
+EXPORT_SYMBOL_GPL(amd_retire_dram_row);
+
static unsigned long get_addr(unsigned long addr)
{
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 09c632832bf1..a64182bc72ad 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -45,8 +45,10 @@ struct atl_err {
#if IS_ENABLED(CONFIG_AMD_ATL)
void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
void amd_atl_unregister_decoder(void);
+void amd_retire_dram_row(struct atl_err *err);
unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
#else
+static inline void amd_retire_dram_row(struct atl_err *err) { }
static inline unsigned long
amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
#endif /* CONFIG_AMD_ATL */
--
2.43.0
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
Powered by blists - more mailing lists