linux-kernel - [PATCH v7 07/12] EDAC/amd64: Enumerate Aldebaran GPU nodes by adding family ops

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220203174942.31630-8-nchatrad@amd.com>
Date:   Thu, 3 Feb 2022 11:49:37 -0600
From:   Naveen Krishna Chatradhi <nchatrad@....com>
To:     <linux-edac@...r.kernel.org>, <x86@...nel.org>
CC:     <linux-kernel@...r.kernel.org>, <bp@...en8.de>, <mingo@...hat.com>,
        <mchehab@...nel.org>, <yazen.ghannam@....com>,
        Naveen Krishna Chatradhi <nchatrad@....com>,
        Muralidhara M K <muralimk@....com>
Subject: [PATCH v7 07/12] EDAC/amd64: Enumerate Aldebaran GPU nodes by adding family ops

On newer heterogeneous systems with AMD CPUs, the data fabrics of the GPUs
are connected directly via custom links.

One such system, where Aldebaran GPU nodes are connected to the
Family 19h, model 30h family of CPU nodes, the Aldebaran GPUs can report
memory errors via SMCA banks.

Aldebaran GPU support was added to DRM framework
https://lists.freedesktop.org/archives/amd-gfx/2021-February/059694.html

The GPU nodes comes with HBM2 memory in-built, ECC support is enabled by
default and the UMCs on GPU node are different from the UMCs on CPU nodes.

GPU specific ops routines are defined to extend the amd64_edac
module to enumerate HBM memory leveraging the existing edac and the
amd64 specific data structures.

The UMC Phys on GPU nodes are enumerated as csrows and the UMC
channels connected to HBM banks are enumerated as ranks.

Cc: Yazen Ghannam <yazen.ghannam@....com>
Co-developed-by: Muralidhara M K <muralimk@....com>
Signed-off-by: Muralidhara M K <muralimk@....com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@....com>
---
Link:
https://lkml.kernel.org/r/20210823185437.94417-4-nchatrad@amd.com

v6->v7:
* Added GPU specific ops function definitions, based on the refactor.

v5->v6:
* Added to support number of GPU northbridges with amd_gpu_nb_num()

v4->v5:
* Removed else condition in per_family_init for 19h family

v3->v4:
* Split "f17_addr_mask_to_cs_size" instead as did in 3rd patch earlier

v2->v3:
* Bifurcated the GPU code from v2

v1->v2:
* Restored line deletions and handled minor comments
* Modified commit message and some of the function comments
* variable df_inst_id is introduced instead of umc_num

v0->v1:
* Modifed the commit message
* Change the edac_cap
* kept sizes of both cpu and noncpu together
* return success if the !F3 condition true and remove unnecessary validation


 drivers/edac/amd64_edac.c | 285 +++++++++++++++++++++++++++++++++-----
 drivers/edac/amd64_edac.h |  21 +++
 2 files changed, 273 insertions(+), 33 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 54af7e38d26c..10efe726a959 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1012,6 +1012,12 @@ static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr)
 /* Protect the PCI config register pairs used for DF indirect access. */
 static DEFINE_MUTEX(df_indirect_mutex);
 
+/* Total number of northbridges if in case of heterogeneous systems */
+static int amd_total_nb_num(void)
+{
+	return amd_nb_num() + amd_gpu_nb_num();
+}
+
 /*
  * Data Fabric Indirect Access uses FICAA/FICAD.
  *
@@ -1031,7 +1037,7 @@ static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *l
 	u32 ficaa;
 	int err = -ENODEV;
 
-	if (node >= amd_nb_num())
+	if (node >= amd_total_nb_num())
 		goto out;
 
 	F4 = node_to_amd_nb(node)->link;
@@ -1732,6 +1738,11 @@ static unsigned long f17_determine_edac_cap(struct amd64_pvt *pvt)
 	return edac_cap;
 }
 
+static unsigned long gpu_determine_edac_cap(struct amd64_pvt *pvt)
+{
+	return EDAC_FLAG_EC;
+}
+
 static void debug_display_dimm_sizes(struct amd64_pvt *, u8);
 
 static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan)
@@ -1814,6 +1825,25 @@ static int f17_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt)
 	return cs_mode;
 }
 
+static int gpu_get_cs_mode(int dimm, u8 ctrl, struct amd64_pvt *pvt)
+{
+	return CS_EVEN_PRIMARY | CS_ODD_PRIMARY;
+}
+
+static void gpu_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl)
+{
+	int size, cs = 0, cs_mode;
+
+	edac_printk(KERN_DEBUG, EDAC_MC, "UMC%d chip selects:\n", ctrl);
+
+	cs_mode = CS_EVEN_PRIMARY | CS_ODD_PRIMARY;
+
+	for_each_chip_select(cs, ctrl, pvt) {
+		size = pvt->ops->dbam_to_cs(pvt, ctrl, cs_mode, cs);
+		amd64_info(EDAC_MC ": %d: %5dMB\n", cs, size);
+	}
+}
+
 static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl)
 {
 	int dimm, size0, size1, cs0, cs1, cs_mode;
@@ -1835,6 +1865,27 @@ static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl)
 	}
 }
 
+static void gpu_dump_misc_regs(struct amd64_pvt *pvt)
+{
+	struct amd64_umc *umc;
+	u32 i, umc_base;
+
+	for_each_umc(i) {
+		umc_base = gpu_get_umc_base(i, 0);
+		umc = &pvt->umc[i];
+
+		edac_dbg(1, "UMC%d UMC cfg: 0x%x\n", i, umc->umc_cfg);
+		edac_dbg(1, "UMC%d SDP ctrl: 0x%x\n", i, umc->sdp_ctrl);
+		edac_dbg(1, "UMC%d ECC ctrl: 0x%x\n", i, umc->ecc_ctrl);
+		edac_dbg(1, "UMC%d All HBMs support ECC: yes\n", i);
+
+		gpu_debug_display_dimm_sizes(pvt, i);
+	}
+
+	edac_dbg(1, "F0x104 (DRAM Hole Address): 0x%08x, base: 0x%08x\n",
+		 pvt->dhar, dhar_base(pvt));
+}
+
 static void __dump_misc_regs_df(struct amd64_pvt *pvt)
 {
 	struct amd64_umc *umc;
@@ -1973,6 +2024,43 @@ static void default_prep_chip_selects(struct amd64_pvt *pvt)
 	pvt->csels[1].m_cnt = 4;
 }
 
+static void gpu_prep_chip_selects(struct amd64_pvt *pvt)
+{
+	int umc;
+
+	for_each_umc(umc) {
+		pvt->csels[umc].b_cnt = 8;
+		pvt->csels[umc].m_cnt = 8;
+	}
+}
+
+static void gpu_read_umc_base_mask(struct amd64_pvt *pvt)
+{
+	u32 base_reg, mask_reg;
+	u32 *base, *mask;
+	int umc, cs;
+
+	for_each_umc(umc) {
+		for_each_chip_select(cs, umc, pvt) {
+			base_reg = gpu_get_umc_base(umc, cs) + UMCCH_BASE_ADDR;
+			base = &pvt->csels[umc].csbases[cs];
+
+			if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) {
+				edac_dbg(0, "  DCSB%d[%d]=0x%08x reg: 0x%x\n",
+					 umc, cs, *base, base_reg);
+			}
+
+			mask_reg = gpu_get_umc_base(umc, cs) + UMCCH_ADDR_MASK;
+			mask = &pvt->csels[umc].csmasks[cs];
+
+			if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) {
+				edac_dbg(0, "  DCSM%d[%d]=0x%08x reg: 0x%x\n",
+					 umc, cs, *mask, mask_reg);
+			}
+		}
+	}
+}
+
 static void read_umc_base_mask(struct amd64_pvt *pvt)
 {
 	u32 umc_base_reg, umc_base_reg_sec;
@@ -2172,6 +2260,11 @@ static void determine_memory_type(struct amd64_pvt *pvt)
 	pvt->dram_type = (pvt->dclr0 & BIT(16)) ? MEM_DDR3 : MEM_RDDR3;
 }
 
+static void gpu_determine_memory_type(struct amd64_pvt *pvt)
+{
+	pvt->dram_type = MEM_HBM2;
+}
+
 /* Get the number of DCT channels the memory controller is using. */
 static int k8_early_channel_count(struct amd64_pvt *pvt)
 {
@@ -2504,6 +2597,19 @@ static int f17_early_channel_count(struct amd64_pvt *pvt)
 	return channels;
 }
 
+static int gpu_early_channel_count(struct amd64_pvt *pvt)
+{
+	int i, channels = 0;
+
+	/* The memory channels in case of GPUs are fully populated */
+	for_each_umc(i)
+		channels += pvt->csels[i].b_cnt;
+
+	amd64_info("MCT channel count: %d\n", channels);
+
+	return channels;
+}
+
 static int ddr3_cs_size(unsigned i, bool dct_width)
 {
 	unsigned shift = 0;
@@ -2631,11 +2737,46 @@ static int f16_dbam_to_chip_select(struct amd64_pvt *pvt, u8 dct,
 		return ddr3_cs_size(cs_mode, false);
 }
 
+static int __addr_mask_to_cs_size(u32 addr_mask_orig, unsigned int cs_mode,
+				  int csrow_nr, int dimm)
+{
+	u32 msb, weight, num_zero_bits;
+	u32 addr_mask_deinterleaved;
+	int size = 0;
+
+	/*
+	 * The number of zero bits in the mask is equal to the number of bits
+	 * in a full mask minus the number of bits in the current mask.
+	 *
+	 * The MSB is the number of bits in the full mask because BIT[0] is
+	 * always 0.
+	 *
+	 * In the special 3 Rank interleaving case, a single bit is flipped
+	 * without swapping with the most significant bit. This can be handled
+	 * by keeping the MSB where it is and ignoring the single zero bit.
+	 */
+	msb = fls(addr_mask_orig) - 1;
+	weight = hweight_long(addr_mask_orig);
+	num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE);
+
+	/* Take the number of zero bits off from the top of the mask. */
+	addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1);
+
+	edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm);
+	edac_dbg(1, "  Original AddrMask: 0x%x\n", addr_mask_orig);
+	edac_dbg(1, "  Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved);
+
+	/* Register [31:1] = Address [39:9]. Size is in kBs here. */
+	size = (addr_mask_deinterleaved >> 2) + 1;
+
+	/* Return size in MBs. */
+	return size >> 10;
+}
+
 static int f17_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc,
 				    unsigned int cs_mode, int csrow_nr)
 {
-	u32 addr_mask_orig, addr_mask_deinterleaved;
-	u32 msb, weight, num_zero_bits;
+	u32 addr_mask_orig;
 	int cs_mask_nr = csrow_nr;
 	int dimm, size = 0;
 
@@ -2680,33 +2821,15 @@ static int f17_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc,
 	else
 		addr_mask_orig = pvt->csels[umc].csmasks[cs_mask_nr];
 
-	/*
-	 * The number of zero bits in the mask is equal to the number of bits
-	 * in a full mask minus the number of bits in the current mask.
-	 *
-	 * The MSB is the number of bits in the full mask because BIT[0] is
-	 * always 0.
-	 *
-	 * In the special 3 Rank interleaving case, a single bit is flipped
-	 * without swapping with the most significant bit. This can be handled
-	 * by keeping the MSB where it is and ignoring the single zero bit.
-	 */
-	msb = fls(addr_mask_orig) - 1;
-	weight = hweight_long(addr_mask_orig);
-	num_zero_bits = msb - weight - !!(cs_mode & CS_3R_INTERLEAVE);
-
-	/* Take the number of zero bits off from the top of the mask. */
-	addr_mask_deinterleaved = GENMASK_ULL(msb - num_zero_bits, 1);
-
-	edac_dbg(1, "CS%d DIMM%d AddrMasks:\n", csrow_nr, dimm);
-	edac_dbg(1, "  Original AddrMask: 0x%x\n", addr_mask_orig);
-	edac_dbg(1, "  Deinterleaved AddrMask: 0x%x\n", addr_mask_deinterleaved);
+	return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, cs_mask_nr, dimm);
+}
 
-	/* Register [31:1] = Address [39:9]. Size is in kBs here. */
-	size = (addr_mask_deinterleaved >> 2) + 1;
+static int gpu_addr_mask_to_cs_size(struct amd64_pvt *pvt, u8 umc,
+				    unsigned int cs_mode, int csrow_nr)
+{
+	u32 addr_mask_orig = pvt->csels[umc].csmasks[csrow_nr];
 
-	/* Return size in MBs. */
-	return size >> 10;
+	return __addr_mask_to_cs_size(addr_mask_orig, cs_mode, csrow_nr, csrow_nr >> 1);
 }
 
 static void read_dram_ctl_register(struct amd64_pvt *pvt)
@@ -3703,6 +3826,11 @@ static void f17_determine_ecc_sym_sz(struct amd64_pvt *pvt)
 	}
 }
 
+/* ECC symbol size is not available on Aldebaran nodes */
+static void gpu_determine_ecc_sym_sz(struct amd64_pvt *pvt)
+{
+}
+
 static void read_top_mem_registers(struct amd64_pvt *pvt)
 {
 	u64 msr_val;
@@ -3724,6 +3852,25 @@ static void read_top_mem_registers(struct amd64_pvt *pvt)
 	}
 }
 
+static void gpu_read_mc_regs(struct amd64_pvt *pvt)
+{
+	u8 nid = pvt->mc_node_id;
+	struct amd64_umc *umc;
+	u32 i, umc_base;
+
+	/* Read registers from each UMC */
+	for_each_umc(i) {
+		umc_base = gpu_get_umc_base(i, 0);
+		umc = &pvt->umc[i];
+
+		amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg);
+		amd_smn_read(nid, umc_base + UMCCH_SDP_CTRL, &umc->sdp_ctrl);
+		amd_smn_read(nid, umc_base + UMCCH_ECC_CTRL, &umc->ecc_ctrl);
+	}
+
+	amd64_read_pci_cfg(pvt->F0, DF_DHAR, &pvt->dhar);
+}
+
 /*
  * Retrieve the hardware registers of the memory controller.
  */
@@ -3850,6 +3997,35 @@ static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig)
 	return nr_pages;
 }
 
+static int gpu_init_csrows(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	struct dimm_info *dimm;
+	int empty = 1;
+	u8 umc, cs;
+
+	for_each_umc(umc) {
+		for_each_chip_select(cs, umc, pvt) {
+			if (!csrow_enabled(cs, umc, pvt))
+				continue;
+
+			empty = 0;
+			dimm = mci->csrows[umc]->channels[cs]->dimm;
+
+			edac_dbg(1, "MC node: %d, csrow: %d\n",
+				 pvt->mc_node_id, cs);
+
+			dimm->nr_pages = get_csrow_nr_pages(pvt, umc, cs);
+			dimm->mtype = pvt->dram_type;
+			dimm->edac_mode = EDAC_SECDED;
+			dimm->dtype = DEV_X16;
+			dimm->grain = 64;
+		}
+	}
+
+	return empty;
+}
+
 static int init_csrows_df(struct mem_ctl_info *mci)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
@@ -4190,6 +4366,12 @@ static bool f17_check_ecc_enabled(struct amd64_pvt *pvt)
 		return true;
 }
 
+/* ECC is enabled by default on GPU nodes */
+static bool gpu_check_ecc_enabled(struct amd64_pvt *pvt)
+{
+	return true;
+}
+
 static inline void
 f1x_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt)
 {
@@ -4231,6 +4413,12 @@ f17_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt)
 	}
 }
 
+static inline void
+gpu_determine_edac_ctl_cap(struct mem_ctl_info *mci, struct amd64_pvt *pvt)
+{
+	mci->edac_ctl_cap |= EDAC_FLAG_SECDED;
+}
+
 static void f1x_setup_mci_misc_attrs(struct mem_ctl_info *mci)
 {
 	struct amd64_pvt *pvt = mci->pvt_info;
@@ -4251,6 +4439,22 @@ static void f1x_setup_mci_misc_attrs(struct mem_ctl_info *mci)
 	mci->get_sdram_scrub_rate = get_scrub_rate;
 }
 
+static void gpu_setup_mci_misc_attrs(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	mci->mtype_cap		= MEM_FLAG_HBM2;
+	mci->edac_ctl_cap	= EDAC_FLAG_NONE;
+
+	pvt->ops->determine_edac_ctl_cap(mci, pvt);
+
+	mci->edac_cap		= pvt->ops->determine_edac_cap(pvt);
+	mci->mod_name		= EDAC_MOD_STR;
+	mci->ctl_name		= pvt->ctl_name;
+	mci->dev_name		= pci_name(pvt->F3);
+	mci->ctl_page_to_phys	= NULL;
+}
+
 /*
  * returns a pointer to the family descriptor on success, NULL otherwise.
  */
@@ -4460,6 +4664,20 @@ static void per_family_init(struct amd64_pvt *pvt)
 				pvt->f0_id		= PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F0;
 				pvt->f6_id		= PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F6;
 				pvt->max_mcs		= 4;
+				pvt->ops->early_channel_count	= gpu_early_channel_count;
+				pvt->ops->dbam_to_cs		= gpu_addr_mask_to_cs_size;
+				pvt->ops->prep_chip_selects	= gpu_prep_chip_selects;
+				pvt->ops->determine_memory_type	= gpu_determine_memory_type;
+				pvt->ops->determine_ecc_sym_sz	= gpu_determine_ecc_sym_sz;
+				pvt->ops->determine_edac_ctl_cap = gpu_determine_edac_ctl_cap;
+				pvt->ops->determine_edac_cap	= gpu_determine_edac_cap;
+				pvt->ops->setup_mci_misc_attrs	= gpu_setup_mci_misc_attrs;
+				pvt->ops->get_cs_mode		= gpu_get_cs_mode;
+				pvt->ops->ecc_enabled		= gpu_check_ecc_enabled;
+				pvt->ops->get_base_mask		= gpu_read_umc_base_mask;
+				pvt->ops->dump_misc_regs	= gpu_dump_misc_regs;
+				pvt->ops->get_mc_regs		= gpu_read_mc_regs;
+				pvt->ops->populate_csrows	= gpu_init_csrows;
 				goto end_fam;
 			} else {
 				pvt->ctl_name		= "F19h_M30h";
@@ -4581,9 +4799,10 @@ static int init_one_instance(struct amd64_pvt *pvt)
 	if (pvt->channel_count < 0)
 		return ret;
 
+	/* Define layers for CPU and GPU nodes */
 	ret = -ENOMEM;
 	layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
-	layers[0].size = pvt->csels[0].b_cnt;
+	layers[0].size = amd_gpu_nb_num() ? pvt->max_mcs : pvt->csels[0].b_cnt;
 	layers[0].is_virt_csrow = true;
 	layers[1].type = EDAC_MC_LAYER_CHANNEL;
 
@@ -4592,7 +4811,7 @@ static int init_one_instance(struct amd64_pvt *pvt)
 	 * only one channel. Also, this simplifies handling later for the price
 	 * of a couple of KBs tops.
 	 */
-	layers[1].size = pvt->max_mcs;
+	layers[1].size = amd_gpu_nb_num() ? pvt->csels[0].b_cnt : pvt->max_mcs;
 	layers[1].is_virt_csrow = false;
 
 	mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0);
@@ -4786,7 +5005,7 @@ static int __init amd64_edac_init(void)
 	opstate_init();
 
 	err = -ENOMEM;
-	ecc_stngs = kcalloc(amd_nb_num(), sizeof(ecc_stngs[0]), GFP_KERNEL);
+	ecc_stngs = kcalloc(amd_total_nb_num(), sizeof(ecc_stngs[0]), GFP_KERNEL);
 	if (!ecc_stngs)
 		goto err_free;
 
@@ -4794,7 +5013,7 @@ static int __init amd64_edac_init(void)
 	if (!msrs)
 		goto err_free;
 
-	for (i = 0; i < amd_nb_num(); i++) {
+	for (i = 0; i < amd_total_nb_num(); i++) {
 		err = probe_one_instance(i);
 		if (err) {
 			/* unwind properly */
@@ -4852,7 +5071,7 @@ static void __exit amd64_edac_exit(void)
 	else
 		amd_unregister_ecc_decoder(decode_bus_error);
 
-	for (i = 0; i < amd_nb_num(); i++)
+	for (i = 0; i < amd_total_nb_num(); i++)
 		remove_one_instance(i);
 
 	kfree(ecc_stngs);
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 66f7b5d45a37..71df08a496d2 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -653,3 +653,24 @@ static inline u32 dct_sel_baseaddr(struct amd64_pvt *pvt)
  * - GPU UMCs use 1 chip select. So we say UMC = EDAC CSROW.
  * - GPU UMCs use 8 channels. So we say UMC Channel = EDAC Channel.
  */
+static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
+{
+	/*
+	 * On CPUs, there is one channel per UMC, so UMC numbering equals
+	 * channel numbering. On GPUs, there are eight channels per UMC,
+	 * so the channel numbering is different from UMC numbering.
+	 *
+	 * On CPU nodes channels are selected in 6th nibble
+	 * UMC chY[3:0]= [(chY*2 + 1) : (chY*2)]50000;
+	 *
+	 * On GPU nodes channels are selected in 3rd nibble
+	 * HBM chX[3:0]= [Y  ]5X[3:0]000;
+	 * HBM chX[7:4]= [Y+1]5X[3:0]000
+	 */
+	umc *= 2;
+
+	if (channel >= 4)
+		umc++;
+
+	return 0x50000 + (umc << 20) + ((channel % 4) << 12);
+}
-- 
2.25.1