[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAJZ5v0jBN7-fkP_WB4CDg5UALn+GeY5i8gxM3FwrpKdM-m93rg@mail.gmail.com>
Date: Wed, 29 Jun 2022 19:56:48 +0200
From: "Rafael J. Wysocki" <rafael@...nel.org>
To: Tony Luck <tony.luck@...el.com>
Cc: "Rafael J. Wysocki" <rafael@...nel.org>,
Darren Hart <darren@...amperecomputing.com>,
LKML <linux-kernel@...r.kernel.org>,
Linux ACPI <linux-acpi@...r.kernel.org>,
Len Brown <lenb@...nel.org>, James Morse <james.morse@....com>,
Borislav Petkov <bp@...en8.de>,
Doug Rady <dcrady@...amperecomputing.com>
Subject: Re: [PATCH] ACPI/APEI: Better fix to avoid spamming the console with
old error logs
On Wed, Jun 22, 2022 at 7:09 PM Tony Luck <tony.luck@...el.com> wrote:
>
> The fix in commit 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT
> table data") does not work as intended on systems where the BIOS has a
> fixed size block of memory for the BERT table, relying on s/w to quit
> when it finds a record with estatus->block_status == 0. On these systems
> all errors are suppressed because the check:
>
> if (region_len < ACPI_BERT_PRINT_MAX_LEN)
>
> always fails.
>
> New scheme skips individual CPER records that are too large, and also
> limits the total number of records that will be printed to 5.
>
> Fixes: 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT table data")
> Cc: stable@...r.kernel.org
> Signed-off-by: Tony Luck <tony.luck@...el.com>
> ---
>
> Now in PATCH format with a real commit comment. This version fixes
> the issues seen by Intel's validation team.
>
> drivers/acpi/apei/bert.c | 31 +++++++++++++++++++++++--------
> 1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/acpi/apei/bert.c b/drivers/acpi/apei/bert.c
> index 598fd19b65fa..45973aa6e06d 100644
> --- a/drivers/acpi/apei/bert.c
> +++ b/drivers/acpi/apei/bert.c
> @@ -29,16 +29,26 @@
>
> #undef pr_fmt
> #define pr_fmt(fmt) "BERT: " fmt
> +
> +#define ACPI_BERT_PRINT_MAX_RECORDS 5
> #define ACPI_BERT_PRINT_MAX_LEN 1024
>
> static int bert_disable;
>
> +/*
> + * Print "all" the error records in the BERT table, but avoid huge spam to
> + * the console if the BIOS included oversize records, or too many records.
> + * Skipping some records here does not lose anything because the full
> + * data is available to user tools in:
> + * /sys/firmware/acpi/tables/data/BERT
> + */
> static void __init bert_print_all(struct acpi_bert_region *region,
> unsigned int region_len)
> {
> struct acpi_hest_generic_status *estatus =
> (struct acpi_hest_generic_status *)region;
> int remain = region_len;
> + int printed = 0, skipped = 0;
> u32 estatus_len;
>
> while (remain >= sizeof(struct acpi_bert_region)) {
> @@ -46,24 +56,26 @@ static void __init bert_print_all(struct acpi_bert_region *region,
> if (remain < estatus_len) {
> pr_err(FW_BUG "Truncated status block (length: %u).\n",
> estatus_len);
> - return;
> + break;
> }
>
> /* No more error records. */
> if (!estatus->block_status)
> - return;
> + break;
>
> if (cper_estatus_check(estatus)) {
> pr_err(FW_BUG "Invalid error record.\n");
> - return;
> + break;
> }
>
> - pr_info_once("Error records from previous boot:\n");
> - if (region_len < ACPI_BERT_PRINT_MAX_LEN)
> + if (estatus_len < ACPI_BERT_PRINT_MAX_LEN &&
> + printed < ACPI_BERT_PRINT_MAX_RECORDS) {
> + pr_info_once("Error records from previous boot:\n");
> cper_estatus_print(KERN_INFO HW_ERR, estatus);
> - else
> - pr_info_once("Max print length exceeded, table data is available at:\n"
> - "/sys/firmware/acpi/tables/data/BERT");
> + printed++;
> + } else {
> + skipped++;
> + }
>
> /*
> * Because the boot error source is "one-time polled" type,
> @@ -75,6 +87,9 @@ static void __init bert_print_all(struct acpi_bert_region *region,
> estatus = (void *)estatus + estatus_len;
> remain -= estatus_len;
> }
> +
> + if (skipped)
> + pr_info(HW_ERR "Skipped %d error records\n", skipped);
> }
>
> static int __init setup_bert_disable(char *str)
> --
Applied as 5.20 material, thanks!
Powered by blists - more mailing lists