linux-kernel - [PATCH v7 08/12] EDAC/amd64: Add Family ops to update GPU csrow and channel info

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220203174942.31630-9-nchatrad@amd.com>
Date:   Thu, 3 Feb 2022 11:49:38 -0600
From:   Naveen Krishna Chatradhi <nchatrad@....com>
To:     <linux-edac@...r.kernel.org>, <x86@...nel.org>
CC:     <linux-kernel@...r.kernel.org>, <bp@...en8.de>, <mingo@...hat.com>,
        <mchehab@...nel.org>, <yazen.ghannam@....com>,
        Naveen Krishna Chatradhi <nchatrad@....com>,
        Muralidhara M K <muralimk@....com>
Subject: [PATCH v7 08/12] EDAC/amd64: Add Family ops to update GPU csrow and channel info

GPU node has 'X' number of PHYs and 'Y' number of channels.
This results in 'X*Y' number of instances in the Data Fabric.
Therefore the Data Fabric ID of an instance in GPU as below:
  df_inst_id = 'X' * number of channels per PHY + 'Y'

On CPUs the Data Fabric ID of an instance on a CPU is equal to the
UMC number. since the UMC number and channel are equal in CPU nodes,
the channel can be used as the Data Fabric ID of the instance.

Cc: Yazen Ghannam <yazen.ghannam@....com>
Co-developed-by: Muralidhara M K <muralimk@....com>
Signed-off-by: Muralidhara M K <muralimk@....com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@....com>
---
v1->v7:
* New change in v7

 drivers/edac/amd64_edac.c | 60 +++++++++++++++++++++++++++++++++++++--
 drivers/edac/amd64_edac.h |  2 ++
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 10efe726a959..241419a0be93 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3653,6 +3653,30 @@ static inline void decode_bus_error(int node_id, struct mce *m)
 	__log_ecc_error(mci, &err, ecc_type);
 }
 
+/*
+ * On CPUs, The Data Fabric ID of an instance is equal to the UMC number.
+ * And since the UMC number and channel are equal in CPU nodes, the channel can be used
+ * as the Data Fabric ID of the instance.
+ */
+static int f17_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+			  struct err_info *err)
+{
+	return err->channel;
+}
+
+/*
+ * A GPU node has 'X' number of PHYs and 'Y' number of channels.
+ * This results in 'X*Y' number of instances in the Data Fabric.
+ * Therefore the Data Fabric ID of an instance can be found with the following formula:
+ * df_inst_id = 'X' * number of channels per PHY + 'Y'
+ *
+ */
+static int gpu_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+			  struct err_info *err)
+{
+	return (err->csrow * pvt->channel_count / mci->nr_csrows) + err->channel;
+}
+
 /*
  * To find the UMC channel represented by this bank we need to match on its
  * instance_id. The instance_id of a bank is held in the lower 32 bits of its
@@ -3670,12 +3694,38 @@ static void update_umc_err_info(struct mce *m, struct err_info *err)
 	err->csrow = m->synd & 0x7;
 }
 
+/*
+ * The CPUs have one channel per UMC, So  UMC number is equivalent to a
+ * channel number. The GPUs have 8 channels per UMC, so the UMC number no
+ * longer works as a channel number.
+ * The channel number within a GPU UMC is given in MCA_IPID[15:12].
+ * However, the IDs are split such that two UMC values go to one UMC, and
+ * the channel numbers are split in two groups of four.
+ *
+ * Refer comment on get_umc_base_gpu() from amd64_edac.h
+ *
+ * For example,
+ * UMC0 CH[3:0] = 0x0005[3:0]000
+ * UMC0 CH[7:4] = 0x0015[3:0]000
+ * UMC1 CH[3:0] = 0x0025[3:0]000
+ * UMC1 CH[7:4] = 0x0035[3:0]000
+ */
+static void gpu_update_umc_err_info(struct mce *m, struct err_info *err)
+{
+	u8 ch = (m->ipid & GENMASK(31, 0)) >> 20;
+	u8 phy = ((m->ipid >> 12) & 0xf);
+
+	err->channel = ch % 2 ? phy + 4 : phy;
+	err->csrow = phy;
+}
+
 static void decode_umc_error(int node_id, struct mce *m)
 {
 	u8 ecc_type = (m->status >> 45) & 0x3;
 	struct mem_ctl_info *mci;
 	struct amd64_pvt *pvt;
 	struct err_info err;
+	u8 df_inst_id;
 	u64 sys_addr;
 
 	mci = edac_mc_find(node_id);
@@ -3705,7 +3755,9 @@ static void decode_umc_error(int node_id, struct mce *m)
 
 	pvt->ops->get_umc_err_info(m, &err);
 
-	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+	df_inst_id = pvt->ops->find_umc_inst_id(mci, pvt, &err);
+
+	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) {
 		err.err_code = ERR_NORM_ADDR;
 		goto log_error;
 	}
@@ -4625,6 +4677,7 @@ static void per_family_init(struct amd64_pvt *pvt)
 		pvt->ops->get_mc_regs			= __read_mc_regs_df;
 		pvt->ops->populate_csrows		= init_csrows_df;
 		pvt->ops->get_umc_err_info		= update_umc_err_info;
+		pvt->ops->find_umc_inst_id		= f17_df_inst_id;
 
 		if (pvt->fam == 0x18) {
 			pvt->ctl_name			= "F18h";
@@ -4678,6 +4731,8 @@ static void per_family_init(struct amd64_pvt *pvt)
 				pvt->ops->dump_misc_regs	= gpu_dump_misc_regs;
 				pvt->ops->get_mc_regs		= gpu_read_mc_regs;
 				pvt->ops->populate_csrows	= gpu_init_csrows;
+				pvt->ops->get_umc_err_info	= gpu_update_umc_err_info;
+				pvt->ops->find_umc_inst_id	= gpu_df_inst_id;
 				goto end_fam;
 			} else {
 				pvt->ctl_name		= "F19h_M30h";
@@ -4707,6 +4762,7 @@ static void per_family_init(struct amd64_pvt *pvt)
 		pvt->ops->get_mc_regs			= __read_mc_regs_df;
 		pvt->ops->populate_csrows		= init_csrows_df;
 		pvt->ops->get_umc_err_info		= update_umc_err_info;
+		pvt->ops->find_umc_inst_id		= f17_df_inst_id;
  end_fam:
 		break;
 
@@ -4728,7 +4784,7 @@ static void per_family_init(struct amd64_pvt *pvt)
 	}
 
 	/* ops required for families 17h and later */
-	if (pvt->fam >= 0x17 && !pvt->ops->get_umc_err_info) {
+	if (pvt->fam >= 0x17 && (!pvt->ops->get_umc_err_info || !pvt->ops->find_umc_inst_id)) {
 		edac_dbg(1, "Platform specific helper routines not defined.\n");
 		return;
 	}
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 71df08a496d2..b6177bd5d5ba 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -496,6 +496,8 @@ struct low_ops {
 	void (*setup_mci_misc_attrs)(struct mem_ctl_info *mci);
 	int  (*populate_csrows)(struct mem_ctl_info *mci);
 	void (*get_umc_err_info)(struct mce *m, struct err_info *err);
+	int  (*find_umc_inst_id)(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+				 struct err_info *err);
 };
 
 int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
-- 
2.25.1