[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <73d0834a539a4a69bca670141dd06bc8@huawei.com>
Date: Fri, 26 Apr 2024 11:45:56 +0000
From: Shiju Jose <shiju.jose@...wei.com>
To: Daniel Ferguson <danielf@...amperecomputing.com>, "Rafael J. Wysocki"
<rafael@...nel.org>, Len Brown <lenb@...nel.org>, James Morse
<james.morse@....com>, Tony Luck <tony.luck@...el.com>, Borislav Petkov
<bp@...en8.de>
CC: "linux-acpi@...r.kernel.org" <linux-acpi@...r.kernel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"linux-edac@...r.kernel.org" <linux-edac@...r.kernel.org>, luoshengwei
<luoshengwei@...wei.com>, Jason Tian <jason@...amperecomputing.com>
Subject: RE: [PATCH v5 2/2] RAS: Report ARM processor information to userspace
Tested-by: Shiju Jose <shiju.jose@...wei.com>
The CPU core isolation feature in rasdaemon depends on this kernel patch.
Thanks,
Shiju
>-----Original Message-----
>From: Daniel Ferguson <danielf@...amperecomputing.com>
>Sent: 21 March 2024 22:56
>To: Rafael J. Wysocki <rafael@...nel.org>; Len Brown <lenb@...nel.org>;
>James Morse <james.morse@....com>; Tony Luck <tony.luck@...el.com>;
>Borislav Petkov <bp@...en8.de>
>Cc: linux-acpi@...r.kernel.org; linux-kernel@...r.kernel.org; linux-
>edac@...r.kernel.org; Daniel Ferguson <danielf@...amperecomputing.com>;
>luoshengwei <luoshengwei@...wei.com>; Jason Tian
><jason@...amperecomputing.com>
>Subject: [PATCH v5 2/2] RAS: Report ARM processor information to userspace
>
>From: Shengwei Luo <luoshengwei@...wei.com>
>
>The original arm_event trace code only traces the ARM processor error
>information data. This is not enough for the user to take appropriate action.
>
>According to the UEFI 2.9 specification, section N.2.4.4, the ARM processor
>error section includes several ARM processor error information structures,
>several ARM processor context information structures and several
>vendor-specific error information structures. In addition to this
>information, the event also carries the error severity and the CPU logical
>index. Report all of this information to userspace via the perf interface, so
>that the user can do CPU core isolation according to the error severity and
>other information.
>
>Signed-off-by: Shengwei Luo <luoshengwei@...wei.com>
>Signed-off-by: Jason Tian <jason@...amperecomputing.com>
>Signed-off-by: Daniel Ferguson <danielf@...amperecomputing.com>
>---
> drivers/acpi/apei/ghes.c | 3 +--
> drivers/ras/ras.c | 46
>++++++++++++++++++++++++++++++++++++++++++++--
> include/linux/ras.h | 15 ++++++++++++---
> include/ras/ras_event.h | 48
>+++++++++++++++++++++++++++++++++++++++++++-----
> 4 files changed, 100 insertions(+), 12 deletions(-)
>
>diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
>58014558b8e0..a93c80fe1bab 100644
>--- a/drivers/acpi/apei/ghes.c
>+++ b/drivers/acpi/apei/ghes.c
>@@ -535,9 +535,8 @@ static bool ghes_handle_arm_hw_error(struct
>acpi_hest_generic_data *gdata,
> int sec_sev, i;
> char *p;
>
>- log_arm_hw_error(err);
>-
> sec_sev = ghes_severity(gdata->error_severity);
>+ log_arm_hw_error(err, sec_sev);
> if (sev != GHES_SEV_RECOVERABLE || sec_sev !=
>GHES_SEV_RECOVERABLE)
> return false;
>
>diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index
>249dce21a738..3e2beed2db07 100644
>--- a/drivers/ras/ras.c
>+++ b/drivers/ras/ras.c
>@@ -53,9 +53,51 @@ void log_non_standard_event(const guid_t *sec_type,
>const guid_t *fru_id, }
>
> #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err)
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
> {
>- trace_arm_event(err);
>+ u32 pei_len;
>+ u32 ctx_len = 0;
>+ s32 vsei_len;
>+ u8 *pei_err;
>+ u8 *ctx_err;
>+ u8 *ven_err_data;
>+ struct cper_arm_err_info *err_info;
>+ struct cper_arm_ctx_info *ctx_info;
>+ int n, sz;
>+ int cpu;
>+
>+ pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
>+ pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm);
>+
>+ err_info = (struct cper_arm_err_info *)(err + 1);
>+ ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
>+ ctx_err = (u8 *)ctx_info;
>+ for (n = 0; n < err->context_info_num; n++) {
>+ sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
>+ ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
>+ ctx_len += sz;
>+ }
>+
>+ vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) +
>+ pei_len + ctx_len);
>+ if (vsei_len < 0) {
>+ pr_warn(FW_BUG
>+ "section length: %d\n", err->section_length);
>+ pr_warn(FW_BUG
>+ "section length is too small\n");
>+ pr_warn(FW_BUG
>+ "firmware-generated error record is incorrect\n");
>+ vsei_len = 0;
>+ }
>+ ven_err_data = (u8 *)ctx_info;
>+
>+ cpu = GET_LOGICAL_INDEX(err->mpidr);
>+ /* when return value is invalid, set cpu index to -1 */
>+ if (cpu < 0)
>+ cpu = -1;
>+
>+ trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
>+ ven_err_data, (u32)vsei_len, sev, cpu);
> }
> #endif
>
>diff --git a/include/linux/ras.h b/include/linux/ras.h index
>811feb9d8160..2070e4ae0626 100644
>--- a/include/linux/ras.h
>+++ b/include/linux/ras.h
>@@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type,
> const guid_t *fru_id, const char *fru_text,
> const u8 sev, const u8 *err, const u32 len); #if
>defined(CONFIG_ARM) || defined(CONFIG_ARM64) -void
>log_arm_hw_error(struct cper_sec_proc_arm *err);
>+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
> #endif
> #else
> static inline void
>@@ -35,7 +35,7 @@ log_non_standard_event(const guid_t *sec_type, { return;
>} #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) static inline void -
>log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
>+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return;
>+}
> #endif
> #endif
>
>@@ -55,5 +55,14 @@ static inline void amd_retire_dram_row(struct atl_err
>*err) { } static inline unsigned long
>amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL;
>} #endif /* CONFIG_AMD_ATL */
>-
>+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) #include
>+<asm/smp_plat.h>
>+/*
>+ * Include ARM specific SMP header which provides a function mapping
>+mpidr to
>+ * cpu logical index.
>+ */
>+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr &
>+MPIDR_HWID_BITMASK) #else #define GET_LOGICAL_INDEX(mpidr) -EINVAL
>+#endif /* CONFIG_ARM || CONFIG_ARM64 */
> #endif /* __RAS_H__ */
>diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index
>c011ea236e9b..a7d7b6e717b6 100644
>--- a/include/ras/ras_event.h
>+++ b/include/ras/ras_event.h
>@@ -168,11 +168,24 @@ TRACE_EVENT(mc_event,
> * This event is generated when hardware detects an ARM processor error
> * has occurred. UEFI 2.6 spec section N.2.4.4.
> */
>+#define APEIL "ARM Processor Err Info data len"
>+#define APEID "ARM Processor Err Info raw data"
>+#define APECIL "ARM Processor Err Context Info data len"
>+#define APECID "ARM Processor Err Context Info raw data"
>+#define VSEIL "Vendor Specific Err Info data len"
>+#define VSEID "Vendor Specific Err Info raw data"
> TRACE_EVENT(arm_event,
>
>- TP_PROTO(const struct cper_sec_proc_arm *proc),
>+ TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err,
>+ const u32 pei_len,
>+ const u8 *ctx_err,
>+ const u32 ctx_len,
>+ const u8 *oem,
>+ const u32 oem_len,
>+ u8 sev,
>+ int cpu),
>
>- TP_ARGS(proc),
>+ TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev,
>+cpu),
>
> TP_STRUCT__entry(
> __field(u64, mpidr)
>@@ -180,6 +193,14 @@ TRACE_EVENT(arm_event,
> __field(u32, running_state)
> __field(u32, psci_state)
> __field(u8, affinity)
>+ __field(u32, pei_len)
>+ __dynamic_array(u8, buf, pei_len)
>+ __field(u32, ctx_len)
>+ __dynamic_array(u8, buf1, ctx_len)
>+ __field(u32, oem_len)
>+ __dynamic_array(u8, buf2, oem_len)
>+ __field(u8, sev)
>+ __field(int, cpu)
> ),
>
> TP_fast_assign(
>@@ -199,12 +220,29 @@ TRACE_EVENT(arm_event,
> __entry->running_state = ~0;
> __entry->psci_state = ~0;
> }
>+ __entry->pei_len = pei_len;
>+ memcpy(__get_dynamic_array(buf), pei_err, pei_len);
>+ __entry->ctx_len = ctx_len;
>+ memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len);
>+ __entry->oem_len = oem_len;
>+ memcpy(__get_dynamic_array(buf2), oem, oem_len);
>+ __entry->sev = sev;
>+ __entry->cpu = cpu;
> ),
>
>- TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
>- "running state: %d; PSCI state: %d",
>+ TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR:
>%016llx; "
>+ "running state: %d; PSCI state: %d; "
>+ "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
>+ __entry->cpu,
>+ __entry->sev,
> __entry->affinity, __entry->mpidr, __entry->midr,
>- __entry->running_state, __entry->psci_state)
>+ __entry->running_state, __entry->psci_state,
>+ APEIL, __entry->pei_len, APEID,
>+ __print_hex(__get_dynamic_array(buf), __entry->pei_len),
>+ APECIL, __entry->ctx_len, APECID,
>+ __print_hex(__get_dynamic_array(buf1), __entry->ctx_len),
>+ VSEIL, __entry->oem_len, VSEID,
>+ __print_hex(__get_dynamic_array(buf2), __entry->oem_len))
> );
>
> /*
>
>--
>2.43.0
>
Powered by blists - more mailing lists