[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <3648791f-f9c5-858f-cf6e-e5954fad8019@huawei.com>
Date: Sat, 10 Apr 2021 15:09:47 +0800
From: Xiaofei Tan <tanxiaofei@...wei.com>
To: James Morse <james.morse@....com>, Borislav Petkov <bp@...en8.de>
CC: "Rafael J. Wysocki" <rafael@...nel.org>,
Len Brown <lenb@...nel.org>,
"Rafael J. Wysocki" <rjw@...ysocki.net>,
Tony Luck <tony.luck@...el.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Joerg Roedel <jroedel@...e.de>,
Peter Zijlstra <peterz@...radead.org>,
ACPI Devel Maling List <linux-acpi@...r.kernel.org>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
Linuxarm <linuxarm@...wei.com>
Subject: Re: [PATCH v5] ACPI / APEI: fix the regression of synchronous
external aborts occur in user-mode
Hi James, Boris
a friendly ping..
any comments for this ?
On 2021/2/5 20:55, Rafael J. Wysocki wrote:
> On Tue, Jan 26, 2021 at 2:32 PM tanxiaofei <tanxiaofei@...wei.com> wrote:
>>
>> @James
>> Hi James, please help to review this patch. Thank you very much. :)
>
> James, Boris, any comments?
>
>> On 2020/12/10 20:09, Xiaofei Tan wrote:
>>> After the commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
>>> synchronise with APEI's irq work") applied, do_sea() return directly
>>> for user-mode if apei_claim_sea() handled any error record. Therefore,
>>> each error record reported by the user-mode SEA must be effectively
>>> processed in APEI GHES driver.
>>>
>>> Currently, GHES driver only processes Memory Error Section.(Ignore PCIe
>>> Error Section, as it has nothing to do with SEA). It is not enough.
>>> Because ARM Processor Error could also be used for SEA in some hardware
>>> platforms, such as Kunpeng9xx series. We can't ask them to switch to
>>> use Memory Error Section for two reasons:
>>> 1)The server was delivered to customers, and it will introduce
>>> compatibility issue.
>>> 2)It make sense to use ARM Processor Error Section. Because either
>>> cache or memory errors could generate SEA when consumed by a processor.
>>>
>>> Do memory failure handling for ARM Processor Error Section just like
>>> for Memory Error Section.
>>>
>>> Signed-off-by: Xiaofei Tan <tanxiaofei@...wei.com>
>>> ---
>>> Changes since v4:
>>> - 1. Change the patch name from " ACPI / APEI: do memory failure on the
>>> physical address reported by ARM processor error section" to this
>>> more proper one.
>>> - 2. Add a comment in the code to tell why not filter out corrected
>>> error in an uncorrected section.
>>>
>>> Changes since v3:
>>> - Print unhandled error following James Morse's advice.
>>>
>>> Changes since v2:
>>> - Updated commit log
>>> ---
>>> drivers/acpi/apei/ghes.c | 76 +++++++++++++++++++++++++++++++++++++-----------
>>> 1 file changed, 59 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>>> index fce7ade..0893968 100644
>>> --- a/drivers/acpi/apei/ghes.c
>>> +++ b/drivers/acpi/apei/ghes.c
>>> @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>>> gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>>> }
>>>
>>> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>>> - int sev)
>>> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>>> {
>>> unsigned long pfn;
>>> - int flags = -1;
>>> - int sec_sev = ghes_severity(gdata->error_severity);
>>> - struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>>>
>>> if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>>> return false;
>>>
>>> - if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>>> - return false;
>>> -
>>> - pfn = mem_err->physical_addr >> PAGE_SHIFT;
>>> + pfn = PHYS_PFN(physical_addr);
>>> if (!pfn_valid(pfn)) {
>>> pr_warn_ratelimited(FW_WARN GHES_PFX
>>> "Invalid address in generic error data: %#llx\n",
>>> - mem_err->physical_addr);
>>> + physical_addr);
>>> return false;
>>> }
>>>
>>> + memory_failure_queue(pfn, flags);
>>> + return true;
>>> +}
>>> +
>>> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>>> + int sev)
>>> +{
>>> + int flags = -1;
>>> + int sec_sev = ghes_severity(gdata->error_severity);
>>> + struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>>> +
>>> + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>>> + return false;
>>> +
>>> /* iff following two events can be handled properly by now */
>>> if (sec_sev == GHES_SEV_CORRECTED &&
>>> (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
>>> @@ -470,14 +477,51 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>>> if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>>> flags = 0;
>>>
>>> - if (flags != -1) {
>>> - memory_failure_queue(pfn, flags);
>>> - return true;
>>> - }
>>> + if (flags != -1)
>>> + return ghes_do_memory_failure(mem_err->physical_addr, flags);
>>>
>>> return false;
>>> }
>>>
>>> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
>>> +{
>>> + struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>>> + struct cper_arm_err_info *err_info;
>>> + bool queued = false;
>>> + int sec_sev, i;
>>> +
>>> + log_arm_hw_error(err);
>>> +
>>> + sec_sev = ghes_severity(gdata->error_severity);
>>> + if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
>>> + return false;
>>> +
>>> + err_info = (struct cper_arm_err_info *) (err + 1);
>>> + for (i = 0; i < err->err_info_num; i++, err_info++) {
>>> + bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
>>> + bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
>>> +
>>> + /*
>>> + * The field (err_info->error_info & BIT(26)) is fixed to set to
>>> + * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
>>> + * firmware won't mix corrected errors in an uncorrected section,
>>> + * and don't filter out 'corrected' error here.
>>> + */
>>> + if (!is_cache || !has_pa) {
>>> + pr_warn_ratelimited(FW_WARN GHES_PFX
>>> + "Unhandled processor error type %s\n",
>>> + err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
>>> + cper_proc_error_type_strs[err_info->type] : "unknown error");
>>> + continue;
>>> + }
>>> +
>>> + if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
>>> + queued = true;
>>> + }
>>> +
>>> + return queued;
>>> +}
>>> +
>>> /*
>>> * PCIe AER errors need to be sent to the AER driver for reporting and
>>> * recovery. The GHES severities map to the following AER severities and
>>> @@ -605,9 +649,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>>> ghes_handle_aer(gdata);
>>> }
>>> else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>>> - struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>>> -
>>> - log_arm_hw_error(err);
>>> + queued = ghes_handle_arm_hw_error(gdata, sev);
>>> } else {
>>> void *err = acpi_hest_get_payload(gdata);
>>>
>>>
>>
>
> .
>
Powered by blists - more mailing lists