[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <81b90308-fdb1-3686-33a3-1e7ec42a7ef8@amd.com>
Date: Mon, 30 Oct 2023 14:03:19 -0700
From: Smita Koralahalli <Smita.KoralahalliChannabasappa@....com>
To: Ira Weiny <ira.weiny@...el.com>,
Dan Williams <dan.j.williams@...el.com>,
Jonathan Cameron <jonathan.cameron@...wei.com>
Cc: Yazen Ghannam <yazen.ghannam@....com>,
Davidlohr Bueso <dave@...olabs.net>,
Dave Jiang <dave.jiang@...el.com>,
Alison Schofield <alison.schofield@...el.com>,
Vishal Verma <vishal.l.verma@...el.com>,
Ard Biesheuvel <ardb@...nel.org>, linux-efi@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-cxl@...r.kernel.org
Subject: Re: [PATCH RFC v2 3/3] cxl/memdev: Register for and process CPER
events
Hi Ira,
On 10/26/2023 11:21 AM, Ira Weiny wrote:
> If the firmware has configured CXL event support to be firmware first
> the OS can process those events through CPER records. Matching memory
> devices to the CPER records can be done via the serial number which is
> part of the CPER record header.
>
> Detect firmware first, register a notifier callback for each memdev, and
> trace events when they match a device registered.
>
> Signed-off-by: Ira Weiny <ira.weiny@...el.com>
>
> ---
> Changes from RFC v1:
> [iweiny: adjust to cper_event enum instead of converting guids]
> ---
> drivers/cxl/core/mbox.c | 45 +++++++++++++++++++++++++-------
> drivers/cxl/cxlmem.h | 7 +++++
> drivers/cxl/pci.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 110 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
> index 4df4f614f490..3f760d1d21de 100644
> --- a/drivers/cxl/core/mbox.c
> +++ b/drivers/cxl/core/mbox.c
> @@ -860,26 +860,51 @@ static const uuid_t mem_mod_event_uuid =
> UUID_INIT(0xfe927475, 0xdd59, 0x4339,
> 0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74);
>
> -static void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> - enum cxl_event_log_type type,
> - struct cxl_event_record_raw *record)
> +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> + enum cxl_event_log_type type,
> + struct cxl_event_record_raw *record,
> + enum cxl_cper_event cper_event)
> {
> - uuid_t *id = &record->hdr.id;
> -
> - if (uuid_equal(id, &gen_media_event_uuid)) {
> + switch (cper_event) {
> + case CXL_CPER_EVENT_GEN_MEDIA: {
> struct cxl_event_gen_media *rec =
> (struct cxl_event_gen_media *)record;
>
> trace_cxl_general_media(cxlmd, type, rec);
> - } else if (uuid_equal(id, &dram_event_uuid)) {
> + break;
> + }
> + case CXL_CPER_EVENT_DRAM: {
> struct cxl_event_dram *rec = (struct cxl_event_dram *)record;
>
> trace_cxl_dram(cxlmd, type, rec);
> - } else if (uuid_equal(id, &mem_mod_event_uuid)) {
> + break;
> + }
> + case CXL_CPER_EVENT_MEM_MODULE: {
> struct cxl_event_mem_module *rec =
> (struct cxl_event_mem_module *)record;
>
> trace_cxl_memory_module(cxlmd, type, rec);
> + break;
> + }
> + }
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
> +
> +static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> + enum cxl_event_log_type type,
> + struct cxl_event_record_raw *record)
> +{
> + uuid_t *id = &record->hdr.id;
> +
> + if (uuid_equal(id, &gen_media_event_uuid)) {
> + cxl_event_trace_record(cxlmd, type, record,
> + CXL_CPER_EVENT_GEN_MEDIA);
> + } else if (uuid_equal(id, &dram_event_uuid)) {
> + cxl_event_trace_record(cxlmd, type, record,
> + CXL_CPER_EVENT_DRAM);
> + } else if (uuid_equal(id, &mem_mod_event_uuid)) {
> + cxl_event_trace_record(cxlmd, type, record,
> + CXL_CPER_EVENT_MEM_MODULE);
> } else {
> /* For unknown record types print just the header */
> trace_cxl_generic_event(cxlmd, type, record);
> @@ -991,8 +1016,8 @@ static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
> break;
>
> for (i = 0; i < nr_rec; i++)
> - cxl_event_trace_record(cxlmd, type,
> - &payload->records[i]);
> + __cxl_event_trace_record(cxlmd, type,
> + &payload->records[i]);
>
> if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
> trace_cxl_overflow(cxlmd, type, payload);
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 706f8a6d1ef4..89bd85e7f51c 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -6,6 +6,7 @@
> #include <linux/cdev.h>
> #include <linux/uuid.h>
> #include <linux/rcuwait.h>
> +#include <linux/efi.h>
> #include "cxl.h"
>
> /* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
> @@ -477,6 +478,8 @@ struct cxl_memdev_state {
> struct cxl_security_state security;
> struct cxl_fw_state fw;
>
> + struct notifier_block cxl_cper_nb;
> +
> struct rcuwait mbox_wait;
> int (*mbox_send)(struct cxl_memdev_state *mds,
> struct cxl_mbox_cmd *cmd);
> @@ -863,6 +866,10 @@ void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
> void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
> unsigned long *cmds);
> void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
> +void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
> + enum cxl_event_log_type type,
> + struct cxl_event_record_raw *record,
> + enum cxl_cper_event cper_event);
> int cxl_set_timestamp(struct cxl_memdev_state *mds);
> int cxl_poison_state_init(struct cxl_memdev_state *mds);
> int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 44a21ab7add5..36d6f03e55de 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -1,5 +1,6 @@
> // SPDX-License-Identifier: GPL-2.0-only
> /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
> +#include <asm-generic/unaligned.h>
> #include <linux/io-64-nonatomic-lo-hi.h>
> #include <linux/moduleparam.h>
> #include <linux/module.h>
> @@ -10,6 +11,7 @@
> #include <linux/pci.h>
> #include <linux/aer.h>
> #include <linux/io.h>
> +#include <linux/efi.h>
> #include "cxlmem.h"
> #include "cxlpci.h"
> #include "cxl.h"
> @@ -748,6 +750,69 @@ static bool cxl_event_int_is_fw(u8 setting)
> return mode == CXL_INT_FW;
> }
>
> +#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
> +int cxl_cper_event_call(struct notifier_block *nb, unsigned long action, void *data)
> +{
> + struct cxl_cper_notifier_data *nd = data;
> + struct cxl_event_record_raw record = (struct cxl_event_record_raw) {
> + .hdr.id = UUID_INIT(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
> + };
> + enum cxl_event_log_type log_type;
> + struct cxl_memdev_state *mds;
> + u32 hdr_flags;
> +
> + mds = container_of(nb, struct cxl_memdev_state, cxl_cper_nb);
> +
> + /* Need serial number for device identification */
> + if (!(nd->rec->hdr.validation_bits & CPER_CXL_DEVICE_SN_VALID))
> + return NOTIFY_DONE;
For all the event records that I have tested so far, CPER_CXL_DEVICE_SN_VALID
has never been set, which means this check returns early and the records
might not be logged at all. Should we be a bit more lenient here and
validate the device_id (BDF) instead, then check whether a cxlds exists
for that device, e.g. using pci_get_domain_bus_and_slot() and
pci_get_drvdata()?
> +
> + /* FIXME endianess and bytes of serial number need verification */
> + /* FIXME Should other values be checked? */
> + if (memcmp(&mds->cxlds.serial, &nd->rec->hdr.dev_serial_num,
> + sizeof(mds->cxlds.serial)))
> + return NOTIFY_DONE;
> +
> + /* ensure record can always handle the full CPER provided data */
> + BUILD_BUG_ON(sizeof(record) <
> + (CPER_CXL_COMP_EVENT_LOG_SIZE + sizeof(record.hdr.id)));
> +
> + /*
> + * UEFI v2.10 defines N.2.14 defines the CXL CPER record as not
> + * including the uuid field.
> + */
> + memcpy(&record.hdr.length, &nd->rec->comp_event_log,
> + CPER_CXL_REC_LEN(nd->rec));
I'm doubtful this will do the job. I think we should copy into each
field of struct cxl_event_record_hdr individually, starting from length,
using pointer arithmetic (which is definitely bad, but I cannot think of a
better way to do this), and then do a memcpy for the data field in struct
cxl_event_record_raw.
Any other suggestions would be helpful as well.
I can make these changes and validate them on my end, if that works?
Thanks,
Smita
> +
> + /* Fabricate a log type */
> + hdr_flags = get_unaligned_le24(record.hdr.flags);
> + log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
> +
> + cxl_event_trace_record(mds->cxlds.cxlmd, log_type, &record,
> + nd->cper_event);
> +
> + return NOTIFY_OK;
> +}
> +
> +static void cxl_unregister_cper_events(void *_mds)
> +{
> + struct cxl_memdev_state *mds = _mds;
> +
> + unregister_cxl_cper_notifier(&mds->cxl_cper_nb);
> +}
> +
> +static void register_cper_events(struct cxl_memdev_state *mds)
> +{
> + mds->cxl_cper_nb.notifier_call = cxl_cper_event_call;
> +
> + if (register_cxl_cper_notifier(&mds->cxl_cper_nb)) {
> + dev_err(mds->cxlds.dev, "CPER registration failed\n");
> + return;
> + }
> +
> + devm_add_action_or_reset(mds->cxlds.dev, cxl_unregister_cper_events, mds);
> +}
> +
> static int cxl_event_config(struct pci_host_bridge *host_bridge,
> struct cxl_memdev_state *mds)
> {
> @@ -758,8 +823,10 @@ static int cxl_event_config(struct pci_host_bridge *host_bridge,
> * When BIOS maintains CXL error reporting control, it will process
> * event records. Only one agent can do so.
> */
> - if (!host_bridge->native_cxl_error)
> + if (!host_bridge->native_cxl_error) {
> + register_cper_events(mds);
> return 0;
> + }
>
> rc = cxl_mem_alloc_event_buf(mds);
> if (rc)
>
Powered by blists - more mailing lists