linux-kernel - Re: [PATCH 7/7] EDAC/amd64: Add Error address conversion for UMC

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <013c3631-2cb0-9a5e-1d65-6d085725b3ea@amd.com>
Date:   Fri, 21 Jul 2023 10:49:43 -0400
From:   Yazen Ghannam <yazen.ghannam@....com>
To:     Muralidhara M K <muralimk@....com>, linux-edac@...r.kernel.org,
        x86@...nel.org
Cc:     yazen.ghannam@....com, linux-kernel@...r.kernel.org, bp@...en8.de,
        mingo@...hat.com, mchehab@...nel.org, nchatrad@....com,
        Muralidhara M K <muralidhara.mk@....com>
Subject: Re: [PATCH 7/7] EDAC/amd64: Add Error address conversion for UMC

On 7/20/2023 8:54 AM, Muralidhara M K wrote:
> From: Muralidhara M K <muralidhara.mk@....com>
> 
> The reported MCA address is a DRAM address, which needs to be converted
> to a normalized address before Data Fabric address translation.
> 
> Some AMD systems have on-chip memory capable of OnDie ECC support.
> Unlike MI200’s UMC ECC, the OnDie-ECC error address reported to MCA is
> a DRAM decoded address (PC/SID/Bank/ROW/COL) instead of a normalized
> address. This is due to the implementation difference between HBM3
> ODECC and HBM2 host ECC: OnDie-ECC address reporting is done in the
> back-end of the UMC, which no longer has the normalized address at
> that point.
> So software needs to convert the reported MCA Error Address back to
> a normalized address.
> 
> Signed-off-by: Muralidhara M K <muralidhara.mk@....com>
> ---
>   drivers/edac/amd64_edac.c | 160 ++++++++++++++++++++++++++++++++++++++
>   1 file changed, 160 insertions(+)
> 
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index 74b2b47cc22a..304d104c25d8 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -3076,6 +3076,159 @@ static void umc_get_err_info(struct mce *m, struct err_info *err)
>   	err->csrow = m->synd & 0x7;
>   }
>   
> +static bool internal_bit_wise_xor(u32 inp)
> +{
> +	bool tmp = 0;
> +
> +	for (int i = 0; i < 32; i++)
> +		tmp = tmp ^ ((inp >> i) & 0x1);
> +
> +	return tmp;
> +}
> +
> +/* mapping of MCA error address to normalized address */
> +static const u8 umc_mca2na_mapping[] = {
> +	0,  5,  6,  8,  9,  14, 12, 13,
> +	10, 11, 15, 16, 17, 18, 19, 20,
> +	21, 22, 23, 24, 25, 26, 27, 28,
> +	7,  29, 30,
> +};
> +
> +/*
> + * Read AMD PPR UMC::AddrHashBank and
> + * UMC::CH::AddrHashPC/PC2 register fields
> + */
> +static struct {
> +	u32 xor_enable	:1;
> +	u32 col_xor	:13;
> +	u32 row_xor	:18;
> +} addr_hash_pc, addr_hash_bank[4];
> +
> +static struct {
> +	u32 bank_xor	:6;
> +} addr_hash_pc2;
> +
> +/*
> + * The location of bank, column and row are fixed.
> + * location of column bit must be NA[5].
> + * Row bits are always placed in a contiguous stretch of NA above the
> + * column and bank bits.
> + * Bits below the row bits can be either column or bank in any order,
> + * with the exception that NA[5] must be a column bit.
> + * Stack ID(SID) bits are placed in the MSB position of the NA.
> + */
> +static int umc_ondie_addr_to_normaddr(u64 mca_addr, u16 nid)
> +{
> +	u32 bank[4], bank_hash[4], pc_hash;
> +	u32 col, row, rawbank = 0, pc;
> +	int i, temp = 0;
> +	u64 mca2na;
> +
> +	u32 gpu_umc_base = 0x90000;
> +
> +	/*
> +	 * The calculation below maps the ondie error address to a
> +	 * normalized address. The logged ondie MCA address format is
> +	 * BEQ_MCA_RdDatAddr[27:0] =
> +	 *	{SID[1:0],PC[0],row[14:0],bank[3:0],col[4:0],1'b0}
> +	 * The conversion mappings are:
> +	 *
> +	 * Normalized location	  ondie MCA error Address
> +	 * ===================	  ======================
> +	 * NA[4]		  = 1'b0
> +	 * NA[5]	= col[0]  = BEQ_MCA_RdDatAddr[1]
> +	 * NA[6]	= col[1]  = BEQ_MCA_RdDatAddr[2]
> +	 * NA[8]	= col[2]  = BEQ_MCA_RdDatAddr[3]
> +	 * NA[9]	= col[3]  = BEQ_MCA_RdDatAddr[4]
> +	 * NA[14]	= col[4]  = BEQ_MCA_RdDatAddr[5]
> +	 * NA[12]	= bank[0] = BEQ_MCA_RdDatAddr[6]
> +	 * NA[13]	= bank[1] = BEQ_MCA_RdDatAddr[7]
> +	 * NA[10]	= bank[2] = BEQ_MCA_RdDatAddr[8]
> +	 * NA[11]	= bank[3] = BEQ_MCA_RdDatAddr[9]
> +	 *
> +	 * The low 12 row bits start at bit 10 of the MCA address:
> +	 * NA[15..26] = row[0..11]  = BEQ_MCA_RdDatAddr[10..21]
> +	 *
> +	 * The high 2 row bits start at bit 22 of the MCA address:
> +	 * NA[27..28] = row[12..13] = BEQ_MCA_RdDatAddr[22..23]
> +	 *
> +	 * NA[7]	= PC[0]   = BEQ_MCA_RdDatAddr[25]
> +	 * NA[29]	= sid[0]  = bank[4] = BEQ_MCA_RdDatAddr[26]
> +	 * NA[30]	= sid[1]  = bank[5] = BEQ_MCA_RdDatAddr[27]
> +	 * Basically, each MCA address bit is moved to the NA bit position
> +	 * listed in the table umc_mca2na_mapping[].
> +	 *
> +	 * XORs need to be applied based on the hash settings below.
> +	 */
> +
> +	/* Calculate column and row */
> +	col = FIELD_GET(GENMASK(5, 1), mca_addr);
> +	row = FIELD_GET(GENMASK(23, 10), mca_addr);
> +
> +	/* Apply hashing on below banks for bank calculation */
> +	for (i = 0; i < 4; i++)
> +		bank_hash[i] = (mca_addr >> (6 + i)) & 0x1;
> +
> +	/* bank hash algorithm */
> +	for (i = 0; i < 4; i++) {
> +		/* Read AMD PPR UMC::AddrHashBank register*/
> +		if (!amd_smn_read(nid, gpu_umc_base + 0xC8 + (i * 4), &temp)) {
> +			addr_hash_bank[i].xor_enable = temp & 1;
> +			addr_hash_bank[i].col_xor = FIELD_GET(GENMASK(13, 1), temp);
> +			addr_hash_bank[i].row_xor = FIELD_GET(GENMASK(31, 14), temp);
> +			/* bank hash selection */
> +			bank[i] = bank_hash[i] ^ (addr_hash_bank[i].xor_enable &
> +				  (internal_bit_wise_xor(col & addr_hash_bank[i].col_xor) ^
> +				  internal_bit_wise_xor(row & addr_hash_bank[i].row_xor)));
> +		}
> +	}
> +
> +	/* To apply hash on pc bit */
> +	pc_hash = (mca_addr >> 25) & 0x1;
> +
> +	/* Read AMD PPR UMC::CH::AddrHashPC register */
> +	if (!amd_smn_read(nid, gpu_umc_base + 0xE0, &temp)) {
> +		addr_hash_pc.xor_enable = temp & 1;
> +		addr_hash_pc.col_xor = FIELD_GET(GENMASK(13, 1), temp);
> +		addr_hash_pc.row_xor = FIELD_GET(GENMASK(31, 14), temp);
> +	}
> +	/* Read AMD PPR UMC::CH::AddrHashPC2 register*/
> +	if (!amd_smn_read(nid, gpu_umc_base + 0xE4, &temp))
> +		addr_hash_pc2.bank_xor = FIELD_GET(GENMASK(5, 0), temp);
> +
> +	/* Calculate bank value from bank[0..3], bank[4] and bank[5] */
> +	for (i = 0; i < 4; i++)
> +		rawbank |= (bank[i] & 1) << i;
> +
> +	rawbank |= (mca_addr >> 22) & 0x30;
> +
> +	/* pseudochannel(pc) hash selection */
> +	pc = pc_hash ^ (addr_hash_pc.xor_enable &
> +		(internal_bit_wise_xor(col & addr_hash_pc.col_xor) ^
> +		internal_bit_wise_xor(row & addr_hash_pc.row_xor) ^
> +		internal_bit_wise_xor(rawbank & addr_hash_pc2.bank_xor)));
> +
> +	/* Mask b'25(pc_bit) and b'[9:6](bank) */
> +	mca_addr &= ~0x20003c0ULL;
> +
> +	for (i = 0; i < 4; i++)
> +		mca_addr |= (bank[i] << (6 + i));
> +
> +	 mca_addr |= (pc << 25);
> +
> +	/* NA[4..0] is fixed */
> +	mca2na = 0x0;
> +	/* convert mca error address to normalized address */
> +	for (i = 1; i < ARRAY_SIZE(umc_mca2na_mapping); i++)
> +		mca2na |= ((mca_addr >> i) & 0x1) << umc_mca2na_mapping[i];
> +
> +	mca_addr = mca2na;
> +	pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", mca_addr);
> +	pr_emerg(HW_ERR "Error hit on Bank: %d Row: %d Column: %d\n", rawbank, row, col);
> +
> +	return mca_addr;
> +}
> +
>   static void decode_umc_error(int node_id, struct mce *m)
>   {
>   	u8 ecc_type = (m->status >> 45) & 0x3;
> @@ -3115,6 +3268,13 @@ static void decode_umc_error(int node_id, struct mce *m)
>   	pvt->ops->get_err_info(m, &err);
>   	df_inst_id = pvt->ops->get_inst_id(mci, pvt, &err);
>   
> +	/*
> +	 * The reported MCA address (Error Addr) is a DRAM decoded address,
> +	 * which needs to be converted to a normalized address before DF
> +	 * address translation.
> +	 */
> +	if (pvt->fam == 0x19 && (pvt->model >= 0x90 && pvt->model <= 0x9f))
> +		m->addr = umc_ondie_addr_to_normaddr(m->addr, pvt->mc_node_id);
> +
>   	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) {
>   		err.err_code = ERR_NORM_ADDR;
>   		goto log_error;

Same comment as previous patch. Leave this until address translation 
updates.

Furthermore, I'm not sure if overwriting m->addr is still a good idea, 
since we'd like to keep the original error information for other uses.

Thanks,
Yazen