[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <4406491a-d14c-4796-9a8a-7a4a3f99104d@amd.com>
Date: Wed, 6 Aug 2025 13:21:44 -0500
From: "Naik, Avadhut" <avadnaik@....com>
To: Yazen Ghannam <yazen.ghannam@....com>, Avadhut Naik <avadhut.naik@....com>
Cc: linux-edac@...r.kernel.org, bp@...en8.de, john.allen@....com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/2] RAS/AMD/ATL: Translate UMC normalized address to DRAM
address using PRM
Hi,
On 7/28/2025 09:40, Yazen Ghannam wrote:
> On Thu, Jul 17, 2025 at 04:48:42PM +0000, Avadhut Naik wrote:
>> Modern AMD SOCs like Zen5 provide UEFI PRM module that implements various
>
> Minor nit: Zen5 is a core revision and doesn't represent an SoC.
>
> You could say "Recent AMD systems...". A platform (FW/OS) interface like
> PRM doesn't depend on hardware revisions.
>
Will do!
>> address translation PRM handlers.[1] These handlers can be invoked by the
>> OS or hypervisor at runtime to perform address translations.
>>
>> On AMD's Zen-based SOCs, Unified Memory Controller (UMC) relative
>> "normalized" address is reported through MCA_ADDR of UMC SMCA bank type
>> on occurrence of a DRAM ECC error. This address must be converted into
>> system physical address and DRAM address to export additional information
>> about the error.
>>
>> Add support to convert normalized address into DRAM address through the
>> appropriate PRM handler. Support for obtaining the system physical address
>
> It's not necessary to mention that the SPA translation already exists.
>
Okay!
>> already exists. Instead of logging the translated DRAM address locally,
>> register the translating function when the Address Translation library is
>> initialized. Modules like amd64_edac can then invoke the PRM handler to
>> add the DRAM address to their error records. Additionally, it can also be
>> exported through the RAS tracepont.
>>
>> [1] AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh ACPI v6.5 Porting Guide, Chapter 22
>
> Could this be a link?
> https://www.kernel.org/doc/html/latest/process/maintainer-tip.html#links-to-documentation
>
Will take a look.
>>
>> Signed-off-by: Avadhut Naik <avadhut.naik@....com>
>> ---
>> drivers/ras/amd/atl/core.c | 3 ++-
>> drivers/ras/amd/atl/internal.h | 9 +++++++++
>> drivers/ras/amd/atl/prm.c | 36 ++++++++++++++++++++++++++++++----
>> drivers/ras/amd/atl/umc.c | 12 ++++++++++++
>> drivers/ras/ras.c | 18 +++++++++++++++--
>> include/linux/ras.h | 19 +++++++++++++++++-
>> 6 files changed, 89 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c
>> index 4197e10993ac..ca1646d030ca 100644
>> --- a/drivers/ras/amd/atl/core.c
>> +++ b/drivers/ras/amd/atl/core.c
>> @@ -207,7 +207,8 @@ static int __init amd_atl_init(void)
>>
>> /* Increment this module's recount so that it can't be easily unloaded. */
>> __module_get(THIS_MODULE);
>> - amd_atl_register_decoder(convert_umc_mca_addr_to_sys_addr);
>> + amd_atl_register_decoder(convert_umc_mca_addr_to_sys_addr,
>> + convert_umc_mca_addr_to_dram_addr);
>>
>> pr_info("AMD Address Translation Library initialized\n");
>> return 0;
>> diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
>> index 2b6279d32774..53095310438c 100644
>> --- a/drivers/ras/amd/atl/internal.h
>> +++ b/drivers/ras/amd/atl/internal.h
>> @@ -279,18 +279,27 @@ int dehash_address(struct addr_ctx *ctx);
>>
>> unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr);
>> unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
>> +int convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr);
>>
>> u64 add_base_and_hole(struct addr_ctx *ctx, u64 addr);
>> u64 remove_base_and_hole(struct addr_ctx *ctx, u64 addr);
>>
>> #ifdef CONFIG_AMD_ATL_PRM
>> unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr);
>> +int prm_umc_norm_to_dram_addr(u8 socket_id, u64 bank_id,
>> + unsigned long addr, struct dram_addr *dram_addr);
>> #else
>> static inline unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id,
>> unsigned long addr)
>> {
>> return -ENODEV;
>> }
>> +
>> +static inline int prm_umc_norm_to_dram_addr(u8 socket_id, u64 bank_id,
>> + unsigned long addr, struct dram_addr *dram_addr)
>> +{
>> + return -ENODEV;
>> +}
>> #endif
>>
>> /*
>> diff --git a/drivers/ras/amd/atl/prm.c b/drivers/ras/amd/atl/prm.c
>> index 0931a20d213b..9bbaf8c85da0 100644
>> --- a/drivers/ras/amd/atl/prm.c
>> +++ b/drivers/ras/amd/atl/prm.c
>> @@ -19,10 +19,11 @@
>> #include <linux/prmt.h>
>>
>> /*
>> - * PRM parameter buffer - normalized to system physical address, as described
>> - * in the "PRM Parameter Buffer" section of the AMD ACPI Porting Guide.
>> + * PRM parameter buffer - normalized to system physical address and normalized
>> + * to DRAM address, as described in the "PRM Parameter Buffer" section of the
>> + * AMD ACPI Porting Guide.
>> */
>> -struct norm_to_sys_param_buf {
>> +struct prm_parameter_buffer {
>> u64 norm_addr;
>> u8 socket;
>> u64 bank_id;
>> @@ -33,9 +34,13 @@ static const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D,
>> 0x92, 0xCD, 0x2B, 0x56, 0xF1,
>> 0x2B, 0xEB, 0xA6);
>>
>> +static const guid_t norm_to_dram_guid = GUID_INIT(0x7626C6AE, 0xF973, 0x429C,
>> + 0xA9, 0x1C, 0x10, 0x7D, 0x7B,
>> + 0xE2, 0x98, 0xB0);
>> +
>> unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 bank_id, unsigned long addr)
>> {
>> - struct norm_to_sys_param_buf p_buf;
>> + struct prm_parameter_buffer p_buf;
>> unsigned long ret_addr;
>> int ret;
>>
>> @@ -55,3 +60,26 @@ unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 bank_id, unsigned long
>>
>> return ret;
>> }
>> +
>> +int prm_umc_norm_to_dram_addr(u8 socket_id, u64 bank_id,
>> + unsigned long addr, struct dram_addr *dram_addr)
>> +{
>> + struct prm_parameter_buffer p_buf;
>> + int ret;
>> +
>> + p_buf.norm_addr = addr;
>> + p_buf.socket = socket_id;
>> + p_buf.bank_id = bank_id;
>> + p_buf.out_buf = dram_addr;
>> +
>> + ret = acpi_call_prm_handler(norm_to_dram_guid, &p_buf);
>> + if (!ret)
>> + return ret;
>> +
>> + if (ret == -ENODEV)
>> + pr_debug("PRM module/handler not available.\n");
>> + else
>> + pr_notice_once("PRM DRAM Address Translation failed.\n");
>> +
>> + return ret;
>> +}
>> diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
>> index 6e072b7667e9..df6accae8929 100644
>> --- a/drivers/ras/amd/atl/umc.c
>> +++ b/drivers/ras/amd/atl/umc.c
>> @@ -427,3 +427,15 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
>>
>> return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr);
>> }
>> +
>> +int convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr)
>> +{
>> + u8 socket_id = topology_physical_package_id(err->cpu);
>> + unsigned long addr = get_addr(err->addr);
>> + u64 bank_id = err->ipid;
>> + int ret;
>> +
>> + ret = prm_umc_norm_to_dram_addr(socket_id, bank_id, addr, dram_addr);
>> +
>> + return ret;
>
> The 'ret' variable is not necessary. Just return the function call.
>
> You can go even further and not use any new variables. Not sure how
> it'll be aesthetically, but that was my first thought.
>
It may not look good aesthetically if we remove all variables.
Will remove ret though.
>> +}
>> diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
>> index a6e4792a1b2e..cae6388d41be 100644
>> --- a/drivers/ras/ras.c
>> +++ b/drivers/ras/ras.c
>> @@ -19,15 +19,20 @@
>> */
>> static unsigned long (*amd_atl_umc_na_to_spa)(struct atl_err *err);
>>
>> -void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *))
>> +static int (*amd_atl_umc_na_to_dram_addr)(struct atl_err *err, struct dram_addr *dram_addr);
>> +
>> +void amd_atl_register_decoder(unsigned long (*f1)(struct atl_err *),
>> + int (*f2)(struct atl_err *, struct dram_addr *))
>> {
>> - amd_atl_umc_na_to_spa = f;
>> + amd_atl_umc_na_to_spa = f1;
>> + amd_atl_umc_na_to_dram_addr = f2;
>> }
>> EXPORT_SYMBOL_GPL(amd_atl_register_decoder);
>>
>> void amd_atl_unregister_decoder(void)
>> {
>> amd_atl_umc_na_to_spa = NULL;
>> + amd_atl_umc_na_to_dram_addr = NULL;
>> }
>> EXPORT_SYMBOL_GPL(amd_atl_unregister_decoder);
>>
>> @@ -39,6 +44,15 @@ unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
>> return amd_atl_umc_na_to_spa(err);
>> }
>> EXPORT_SYMBOL_GPL(amd_convert_umc_mca_addr_to_sys_addr);
>> +
>> +int amd_convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr)
>> +{
>> + if (!amd_atl_umc_na_to_dram_addr)
>> + return -EINVAL;
>> +
>> + return amd_atl_umc_na_to_dram_addr(err, dram_addr);
>> +}
>> +EXPORT_SYMBOL_GPL(amd_convert_umc_mca_addr_to_dram_addr);
>> #endif /* CONFIG_AMD_ATL */
>>
>> #define CREATE_TRACE_POINTS
>> diff --git a/include/linux/ras.h b/include/linux/ras.h
>> index a64182bc72ad..feb53f8470b0 100644
>> --- a/include/linux/ras.h
>> +++ b/include/linux/ras.h
>> @@ -42,15 +42,32 @@ struct atl_err {
>> u32 cpu;
>> };
>>
>> +struct dram_addr {
>
> There's another struct in the kernel called 'dram_addr'.
>
> It may help to make this more unique with a prefix: atl_dram_addr.
>
Will change this.
>> + u8 chip_select;
>> + u8 bank_group;
>> + u8 bank_addr;
>> + u32 row_addr;
>> + u16 col_addr;
>> + u8 rank_mul;
>> + u8 sub_ch;
>> +} __packed;
>> +
>> #if IS_ENABLED(CONFIG_AMD_ATL)
>> -void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
>> +void amd_atl_register_decoder(unsigned long (*f1)(struct atl_err *),
>> + int (*f2)(struct atl_err *, struct dram_addr *));
>> void amd_atl_unregister_decoder(void);
>> void amd_retire_dram_row(struct atl_err *err);
>> unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
>> +int amd_convert_umc_mca_addr_to_dram_addr(struct atl_err *err, struct dram_addr *dram_addr);
>> #else
>> static inline void amd_retire_dram_row(struct atl_err *err) { }
>> static inline unsigned long
>> amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
>> +static inline int amd_convert_umc_mca_addr_to_dram_addr(struct atl_err *err,
>> + struct dram_addr *dram_addr)
>> +{
>> + return -EINVAL;
>> +}
>> #endif /* CONFIG_AMD_ATL */
>>
>> #endif /* __RAS_H__ */
>> --
>
> Thanks,
> Yazen
--
Thanks,
Avadhut Naik
Powered by blists - more mailing lists