lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4406491a-d14c-4796-9a8a-7a4a3f99104d@amd.com>
Date: Wed, 6 Aug 2025 13:21:44 -0500
From: "Naik, Avadhut" <avadnaik@....com>
To: Yazen Ghannam <yazen.ghannam@....com>, Avadhut Naik <avadhut.naik@....com>
Cc: linux-edac@...r.kernel.org, bp@...en8.de, john.allen@....com,
 linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/2] RAS/AMD/ATL: Translate UMC normalized address to DRAM
 address using PRM

Hi,

On 7/28/2025 09:40, Yazen Ghannam wrote:
> On Thu, Jul 17, 2025 at 04:48:42PM +0000, Avadhut Naik wrote:
>> Modern AMD SOCs like Zen5 provide UEFI PRM module that implements various
> 
> Minor nit: Zen5 is a core revision and doesn't represent an SoC.
> 
> You could say "Recent AMD systems...". A platform (FW/OS) interface like
> PRM doesn't depend on hardware revisions.
> 
Will do!

>> address translation PRM handlers.[1] These handlers can be invoked by the
>> OS or hypervisor at runtime to perform address translations.
>>
>> On AMD's Zen-based SOCs, Unified Memory Controller (UMC) relative
>> "normalized" address is reported through MCA_ADDR of UMC SMCA bank type
>> on occurrence of a DRAM ECC error. This address must be converted into
>> system physical address and DRAM address to export additional information
>> about the error.
>>
>> Add support to convert normalized address into DRAM address through the
>> appropriate PRM handler. Support for obtaining the system physical address
> 
> It's not necessary to mention that the SPA translation already exists.
>
Okay!
 
>> already exists. Instead of logging the translated DRAM address locally,
>> register the translating function when the Address Translation library is
>> initialized. Modules like amd64_edac can then invoke the PRM handler to
>> add the DRAM address to their error records. Additionally, it can also be
>> exported through the RAS tracepont.
>>
>> [1] AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh ACPI v6.5 Porting Guide, Chapter 22
> 
> Could this be a link?
> https://www.kernel.org/doc/html/latest/process/maintainer-tip.html#links-to-documentation
> 
Will take a look.
>>
>> Signed-off-by: Avadhut Naik <avadhut.naik@....com>
>> ---
>>  drivers/ras/amd/atl/core.c     |  3 ++-
>>  drivers/ras/amd/atl/internal.h |  9 +++++++++
>>  drivers/ras/amd/atl/prm.c      | 36 ++++++++++++++++++++++++++++++----
>>  drivers/ras/amd/atl/umc.c      | 12 ++++++++++++
>>  drivers/ras/ras.c              | 18 +++++++++++++++--
>>  include/linux/ras.h            | 19 +++++++++++++++++-
>>  6 files changed, 89 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c
>> index 4197e10993ac..ca1646d030ca 100644
>> --- a/drivers/ras/amd/atl/core.c
>> +++ b/drivers/ras/amd/atl/core.c
>> @@ -207,7 +207,8 @@ static int __init amd_atl_init(void)
>>  
>>  	/* Increment this module's recount so that it can't be easily unloaded. */
>>  	__module_get(THIS_MODULE);
>> -	amd_atl_register_decoder(convert_umc_mca_addr_to_sys_addr);
>> +	amd_atl_register_decoder(convert_umc_mca_addr_to_sys_addr,
>> +				 convert_umc_mca_addr_to_dram_addr);
>>  
>>  	pr_info("AMD Address Translation Library initialized\n");
>>  	return 0;
>> diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
>> index 2b6279d32774..53095310438c 100644
>> --- a/drivers/ras/amd/atl/internal.h
>> +++ b/drivers/ras/amd/atl/internal.h
>> @@ -279,18 +279,27 @@ int dehash_address(struct addr_ctx *ctx);
>>  
>>  unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr);
>>  unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
>> +int convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr);
>>  
>>  u64 add_base_and_hole(struct addr_ctx *ctx, u64 addr);
>>  u64 remove_base_and_hole(struct addr_ctx *ctx, u64 addr);
>>  
>>  #ifdef CONFIG_AMD_ATL_PRM
>>  unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr);
>> +int prm_umc_norm_to_dram_addr(u8 socket_id, u64 bank_id,
>> +			      unsigned long addr, struct dram_addr *dram_addr);
>>  #else
>>  static inline unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id,
>>  						     unsigned long addr)
>>  {
>>         return -ENODEV;
>>  }
>> +
>> +static inline int prm_umc_norm_to_dram_addr(u8 socket_id, u64 bank_id,
>> +					    unsigned long addr, struct dram_addr *dram_addr)
>> +{
>> +	return -ENODEV;
>> +}
>>  #endif
>>  
>>  /*
>> diff --git a/drivers/ras/amd/atl/prm.c b/drivers/ras/amd/atl/prm.c
>> index 0931a20d213b..9bbaf8c85da0 100644
>> --- a/drivers/ras/amd/atl/prm.c
>> +++ b/drivers/ras/amd/atl/prm.c
>> @@ -19,10 +19,11 @@
>>  #include <linux/prmt.h>
>>  
>>  /*
>> - * PRM parameter buffer - normalized to system physical address, as described
>> - * in the "PRM Parameter Buffer" section of the AMD ACPI Porting Guide.
>> + * PRM parameter buffer - normalized to system physical address and normalized
>> + * to DRAM address, as described in the "PRM Parameter Buffer" section of the
>> + * AMD ACPI Porting Guide.
>>   */
>> -struct norm_to_sys_param_buf {
>> +struct prm_parameter_buffer {
>>  	u64 norm_addr;
>>  	u8 socket;
>>  	u64 bank_id;
>> @@ -33,9 +34,13 @@ static const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D,
>>  						 0x92, 0xCD, 0x2B, 0x56, 0xF1,
>>  						 0x2B, 0xEB, 0xA6);
>>  
>> +static const guid_t norm_to_dram_guid = GUID_INIT(0x7626C6AE, 0xF973, 0x429C,
>> +						 0xA9, 0x1C, 0x10, 0x7D, 0x7B,
>> +						 0xE2, 0x98, 0xB0);
>> +
>>  unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 bank_id, unsigned long addr)
>>  {
>> -	struct norm_to_sys_param_buf p_buf;
>> +	struct prm_parameter_buffer p_buf;
>>  	unsigned long ret_addr;
>>  	int ret;
>>  
>> @@ -55,3 +60,26 @@ unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 bank_id, unsigned long
>>  
>>  	return ret;
>>  }
>> +
>> +int prm_umc_norm_to_dram_addr(u8 socket_id, u64 bank_id,
>> +			      unsigned long addr, struct dram_addr *dram_addr)
>> +{
>> +	struct prm_parameter_buffer p_buf;
>> +	int ret;
>> +
>> +	p_buf.norm_addr	= addr;
>> +	p_buf.socket	= socket_id;
>> +	p_buf.bank_id	= bank_id;
>> +	p_buf.out_buf	= dram_addr;
>> +
>> +	ret = acpi_call_prm_handler(norm_to_dram_guid, &p_buf);
>> +	if (!ret)
>> +		return ret;
>> +
>> +	if (ret == -ENODEV)
>> +		pr_debug("PRM module/handler not available.\n");
>> +	else
>> +		pr_notice_once("PRM DRAM Address Translation failed.\n");
>> +
>> +	return ret;
>> +}
>> diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
>> index 6e072b7667e9..df6accae8929 100644
>> --- a/drivers/ras/amd/atl/umc.c
>> +++ b/drivers/ras/amd/atl/umc.c
>> @@ -427,3 +427,15 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
>>  
>>  	return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr);
>>  }
>> +
>> +int convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr)
>> +{
>> +	u8 socket_id = topology_physical_package_id(err->cpu);
>> +	unsigned long addr = get_addr(err->addr);
>> +	u64 bank_id = err->ipid;
>> +	int ret;
>> +
>> +	ret = prm_umc_norm_to_dram_addr(socket_id, bank_id, addr, dram_addr);
>> +
>> +	return ret;
> 
> The 'ret' variable is not necessary. Just return the function call.
> 
> You can go even further and not use any new variables. Not sure how
> it'll be aesthetically, but that was my first thought.
> 
It may not look good aesthetically if we remove all variables.
Will remove ret though.

>> +}
>> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
>> index a6e4792a1b2e..cae6388d41be 100644
>> --- a/drivers/ras/ras.c
>> +++ b/drivers/ras/ras.c
>> @@ -19,15 +19,20 @@
>>   */
>>  static unsigned long (*amd_atl_umc_na_to_spa)(struct atl_err *err);
>>  
>> -void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *))
>> +static int (*amd_atl_umc_na_to_dram_addr)(struct atl_err *err, struct dram_addr *dram_addr);
>> +
>> +void amd_atl_register_decoder(unsigned long (*f1)(struct atl_err *),
>> +			      int (*f2)(struct atl_err *, struct dram_addr *))
>>  {
>> -	amd_atl_umc_na_to_spa = f;
>> +	amd_atl_umc_na_to_spa = f1;
>> +	amd_atl_umc_na_to_dram_addr = f2;
>>  }
>>  EXPORT_SYMBOL_GPL(amd_atl_register_decoder);
>>  
>>  void amd_atl_unregister_decoder(void)
>>  {
>>  	amd_atl_umc_na_to_spa = NULL;
>> +	amd_atl_umc_na_to_dram_addr = NULL;
>>  }
>>  EXPORT_SYMBOL_GPL(amd_atl_unregister_decoder);
>>  
>> @@ -39,6 +44,15 @@ unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
>>  	return amd_atl_umc_na_to_spa(err);
>>  }
>>  EXPORT_SYMBOL_GPL(amd_convert_umc_mca_addr_to_sys_addr);
>> +
>> +int amd_convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr)
>> +{
>> +	if (!amd_atl_umc_na_to_dram_addr)
>> +		return -EINVAL;
>> +
>> +	return amd_atl_umc_na_to_dram_addr(err, dram_addr);
>> +}
>> +EXPORT_SYMBOL_GPL(amd_convert_umc_mca_addr_to_dram_addr);
>>  #endif /* CONFIG_AMD_ATL */
>>  
>>  #define CREATE_TRACE_POINTS
>> diff --git a/include/linux/ras.h b/include/linux/ras.h
>> index a64182bc72ad..feb53f8470b0 100644
>> --- a/include/linux/ras.h
>> +++ b/include/linux/ras.h
>> @@ -42,15 +42,32 @@ struct atl_err {
>>  	u32 cpu;
>>  };
>>  
>> +struct dram_addr {
> 
> There's another struct in the kernel called 'dram_addr'.
> 
> It may help to make this more unique with a prefix: atl_dram_addr.
> 
Will change this.
>> +	u8 chip_select;
>> +	u8 bank_group;
>> +	u8 bank_addr;
>> +	u32 row_addr;
>> +	u16 col_addr;
>> +	u8 rank_mul;
>> +	u8 sub_ch;
>> +} __packed;
>> +
>>  #if IS_ENABLED(CONFIG_AMD_ATL)
>> -void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
>> +void amd_atl_register_decoder(unsigned long (*f1)(struct atl_err *),
>> +			      int (*f2)(struct atl_err *, struct dram_addr *));
>>  void amd_atl_unregister_decoder(void);
>>  void amd_retire_dram_row(struct atl_err *err);
>>  unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
>> +int amd_convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr);
>>  #else
>>  static inline void amd_retire_dram_row(struct atl_err *err) { }
>>  static inline unsigned long
>>  amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
>> +static inline int amd_convert_umc_mca_addr_to_dram_addr(struct atl_err *err,
>> +							struct dram_addr *dram_addr)
>> +{
>> +	return -EINVAL;
>> +}
>>  #endif /* CONFIG_AMD_ATL */
>>  
>>  #endif /* __RAS_H__ */
>> -- 
> 
> Thanks,
> Yazen

-- 
Thanks,
Avadhut Naik


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ