[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20240619143117.00000c9c@Huawei.com>
Date: Wed, 19 Jun 2024 14:31:17 +0100
From: Jonathan Cameron <Jonathan.Cameron@...wei.com>
To: Mauro Carvalho Chehab <mchehab+huawei@...nel.org>
CC: Borislav Petkov <bp@...en8.de>, James Morse <james.morse@....com>, "Rafael
J. Wysocki" <rafael@...nel.org>, Shiju Jose <shiju.jose@...wei.com>, Tony
Luck <tony.luck@...el.com>, Uwe Kleine-König
<u.kleine-koenig@...gutronix.de>, "Alison Schofield"
<alison.schofield@...el.com>, Ard Biesheuvel <ardb@...nel.org>, Dan Williams
<dan.j.williams@...el.com>, Dave Jiang <dave.jiang@...el.com>, Ira Weiny
<ira.weiny@...el.com>, Len Brown <lenb@...nel.org>, Shuai Xue
<xueshuai@...ux.alibaba.com>, <linux-acpi@...r.kernel.org>,
<linux-edac@...r.kernel.org>, <linux-efi@...r.kernel.org>,
<linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v2] efi/cper: align ARM CPER type with UEFI 2.9A/2.10
specs
On Wed, 19 Jun 2024 12:52:38 +0200
Mauro Carvalho Chehab <mchehab+huawei@...nel.org> wrote:
> Up to UEFI spec, the type byte of CPER struct was defined simply
> as:
>
> Type at byte offset 4:
>
> - Cache error
> - TLB Error
> - Bus Error
> - Micro-architectural Error
> All other values are reserved
>
> Yet, there was no information about how this would be encoded.
>
> Spec 2.9A corrected it by defining:
>
> - Bit 1 - Cache Error
> - Bit 2 - TLB Error
> - Bit 3 - Bus Error
> - Bit 4 - Micro-architectural Error
> All other values are reserved
>
> Spec 2.10 also preserve the same encoding as 2.9A
>
> See: https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-information
>
> Adjust CPER handling code for ARM to properly handle UEFI 2.9A and
> 2.10 encoding.
Hi Mauro,
I'd be tempted to use "ARM Processor" throughout this patch description
as could in theory be something else and currently the link
is the only way to tell!
A few comments inline.
Good catch on the spec change btw.
>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@...nel.org>
> ---
> drivers/acpi/apei/ghes.c | 19 +++++++++++---
> drivers/firmware/efi/cper-arm.c | 44 ++++++++++++++-------------------
> include/linux/cper.h | 9 +++----
> 3 files changed, 37 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index ed32bbecb4a3..365de4115508 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -546,9 +546,12 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
> p = (char *)(err + 1);
> for (i = 0; i < err->err_info_num; i++) {
> struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
> - bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
> + bool is_cache = (err_info->type & CPER_ARM_CACHE_ERROR);
Matches local style I guess but the () are unnecessary.
> bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> - const char *error_type = "unknown error";
> + char error_type[120] = "";
> + char *s = error_type;
> + int len = 0;
> + int i;
Shadowing i which is bad for readability.
>
> /*
> * The field (err_info->error_info & BIT(26)) is fixed to set to
> @@ -562,8 +565,16 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
> continue;
> }
>
> - if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
> - error_type = cper_proc_error_type_strs[err_info->type];
> + for (i = 0; i < ARRAY_SIZE(cper_proc_error_type_strs); i++) {
> + if (!(err_info->type & (1U << i)))
> + continue;
> +
> + len += snprintf(s, sizeof(err_info->type) - len, "%s ", cper_proc_error_type_strs[i]);
Size of the index into the type string array? I'm confused.
sizeof(error_type) maybe?
Also, maybe break that long line of code before cper_*
> + s += len;
> + }
> +
> + if (!*error_type)
> + strscpy(error_type, "unknown error", sizeof(error_type));
Perhaps should handle multiple bits where only one is unknown?
So maybe compare with a mask of known bits and print this on the end (perhaps
including which bit)?
>
> pr_warn_ratelimited(FW_WARN GHES_PFX
> "Unhandled processor error type: %s\n",
> diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c
> index fa9c1c3bf168..f57641eb548a 100644
> --- a/drivers/firmware/efi/cper-arm.c
> +++ b/drivers/firmware/efi/cper-arm.c
> @@ -93,15 +93,11 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
> bool proc_context_corrupt, corrected, precise_pc, restartable_pc;
> bool time_out, access_mode;
>
> - /* If the type is unknown, bail. */
> - if (type > CPER_ARM_MAX_TYPE)
> - return;
> -
> /*
> * Vendor type errors have error information values that are vendor
> * specific.
> */
> - if (type == CPER_ARM_VENDOR_ERROR)
> + if (type & CPER_ARM_VENDOR_ERROR)
> return;
>
> if (error_info & CPER_ARM_ERR_VALID_TRANSACTION_TYPE) {
> @@ -116,43 +112,38 @@ static void cper_print_arm_err_info(const char *pfx, u32 type,
> if (error_info & CPER_ARM_ERR_VALID_OPERATION_TYPE) {
> op_type = ((error_info >> CPER_ARM_ERR_OPERATION_SHIFT)
> & CPER_ARM_ERR_OPERATION_MASK);
> - switch (type) {
> - case CPER_ARM_CACHE_ERROR:
> + if (type & CPER_ARM_CACHE_ERROR) {
> if (op_type < ARRAY_SIZE(arm_cache_err_op_strs)) {
> - printk("%soperation type: %s\n", pfx,
> + printk("%scache error: %s\n", pfx,
> arm_cache_err_op_strs[op_type]);
Can we keep that this is an operation type in print?
"%scache error, operation type: %s\n" perhaps?
> }
> - break;
> - case CPER_ARM_TLB_ERROR:
> + }
> + if (type & CPER_ARM_TLB_ERROR) {
> if (op_type < ARRAY_SIZE(arm_tlb_err_op_strs)) {
> - printk("%soperation type: %s\n", pfx,
> + printk("%sTLB error: %s\n", pfx,
> arm_tlb_err_op_strs[op_type]);
> }
> - break;
> - case CPER_ARM_BUS_ERROR:
> + }
> + if (type & CPER_ARM_BUS_ERROR) {
> if (op_type < ARRAY_SIZE(arm_bus_err_op_strs)) {
> - printk("%soperation type: %s\n", pfx,
> + printk("%sbus error: %s\n", pfx,
> arm_bus_err_op_strs[op_type]);
> }
> - break;
> }
> }
>
> if (error_info & CPER_ARM_ERR_VALID_LEVEL) {
> level = ((error_info >> CPER_ARM_ERR_LEVEL_SHIFT)
> & CPER_ARM_ERR_LEVEL_MASK);
Not a today thing, but would be lovely to use FIELD_GET()
for all these with appropriately fixed up mask definitions.
Right now it is inconsistent as the valid entries are handled
as shifted values, and we have GENMASK(X,0) plus a shift for these.
> - switch (type) {
> - case CPER_ARM_CACHE_ERROR:
> + if (type & CPER_ARM_CACHE_ERROR)
> printk("%scache level: %d\n", pfx, level);
> - break;
> - case CPER_ARM_TLB_ERROR:
> +
> + if (type & CPER_ARM_TLB_ERROR)
> printk("%sTLB level: %d\n", pfx, level);
> - break;
> - case CPER_ARM_BUS_ERROR:
> +
> + if (type & CPER_ARM_BUS_ERROR)
> printk("%saffinity level at which the bus error occurred: %d\n",
> pfx, level);
> - break;
> - }
> }
Powered by blists - more mailing lists