lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240215121757.00005e72@Huawei.com>
Date: Thu, 15 Feb 2024 12:17:57 +0000
From: Jonathan Cameron <Jonathan.Cameron@...wei.com>
To: Smita Koralahalli <Smita.KoralahalliChannabasappa@....com>
CC: <linux-efi@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
	<linux-cxl@...r.kernel.org>, Ard Biesheuvel <ardb@...nel.org>, "Alison
 Schofield" <alison.schofield@...el.com>, Vishal Verma
	<vishal.l.verma@...el.com>, Ira Weiny <ira.weiny@...el.com>, Dan Williams
	<dan.j.williams@...el.com>, Yazen Ghannam <yazen.ghannam@....com>
Subject: Re: [PATCH v2 3/4] acpi/ghes, efi/cper: Recognize and process CXL
 Protocol Errors.

On Tue, 9 Jan 2024 03:47:54 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@....com> wrote:

> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> 
> Add GHES support to detect CXL CPER Protocol record and cache error
> severity, device_id, serial number and CXL RAS capability struct in
> struct cxl_cper_event_info.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@....com>
> ---
> v2:
> 	Change to sub-struct for protocol error specific elements.
> 	Set serial number unconditionally.
> 	Copy entire cxl_ras_capability_regs struct rather than pointer.
> 	Calculate error severity in efi/cper and change to enum.
> ---
>  drivers/acpi/apei/ghes.c        | 11 ++++++
>  drivers/firmware/efi/cper_cxl.c | 68 +++++++++++++++++++++++++++++++++
>  include/linux/cxl-event.h       | 13 +++++++
>  3 files changed, 92 insertions(+)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 60b615d361d3..1d4f3d68a0bc 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -714,6 +714,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>  		cper_callback(event_type, &info);
>  }
>  
> +void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> +	struct cxl_cper_event_info info;
> +
> +	if (cxl_cper_handle_prot_err_info(gdata, &info))
> +		return;
> +}
> +
>  int cxl_cper_register_callback(cxl_cper_callback callback)
>  {
>  	guard(rwsem_write)(&cxl_cper_rw_sem);
> @@ -768,6 +776,9 @@ static bool ghes_do_proc(struct ghes *ghes,
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>  			queued = ghes_handle_arm_hw_error(gdata, sev);
>  		}
> +		else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> +			cxl_cper_handle_prot_err(gdata);
> +		}
>  		else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
>  			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>  
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..9b9b8c8f1157 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #include <linux/cper.h>
> +#include <acpi/ghes.h>
>  #include "cper_cxl.h"
>  
>  #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
> @@ -44,6 +45,17 @@ enum {
>  	USP,	/* CXL Upstream Switch Port */
>  };
>  
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> +	switch (cper_severity) {
> +	case CPER_SEV_RECOVERABLE:
> +	case CPER_SEV_FATAL:
> +		return CXL_AER_UNCORRECTABLE;
> +	default:
> +		return CXL_AER_CORRECTABLE;
> +	}
> +}
> +
>  void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>  {
>  	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +188,59 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>  			       sizeof(cxl_ras->header_log), 0);
>  	}
>  }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_event_info *info)
> +{
> +	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> +	struct cper_cxl_event_devid *device_id = &info->rec.hdr.device_id;
> +	struct cper_cxl_event_sn *dev_serial_num =  &info->rec.hdr.dev_serial_num;
> +	size_t size = sizeof(*prot_err) + prot_err->dvsec_len;

It's not obvious what this is the size of.  I'd rename it to reflect that it's
only the distance to the end of the dvsec copy.
Or just compute the pointer below directly by putting this maths inline.

> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> +		pr_err(FW_WARN "Not a valid protocol error log\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> +		pr_err(FW_WARN "Not a valid Device ID\n");
"No device ID\n"
is a more accurate description.

I'd move this down to next to where we check the data is valid,
so that each validity check sits next to where it matters rather than
in a bunch of checks up here.  (Mostly because I started writing "you
didn't check it was valid down there" before remembering this
earlier code :)

> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Set device serial number unconditionally.
> +	 *
> +	 * Print a warning message if it is not valid. The device serial
> +	 * number is considered valid for CXL 1.1 device, CXL 2.0 device,
"is required for"
perhaps?  These all got renamed in the CXL spec.  We should use the spec's naming
because it deliberately avoids limiting to particular spec versions:
	CXL RCD, CXL SLD, CXL LD, ...
> +	 * CXL 2.0 Logical device, or CXL 2.0 Fabric Manager Managed
> +	 * Logical Device.

Not sure what this is now called.. :(


> +	 */
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> +	      prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
> +		pr_warn(FW_WARN "Not a valid serial number\n");
> +
> +	dev_serial_num->lower_dw = prot_err->dev_serial_num.lower_dw;
> +	dev_serial_num->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> +	/*
> +	 * The device ID or agent address is only valid for CXL 1.1 device,
> +	 * CXL 2.0 device, CXL 2.0 Logical device, CXL 2.0 Fabric Manager
> +	 * Managed Logical Device, CXL Root Port, CXL Downstream Switch
> +	 * Port, or CXL Upstream Switch Port.
> +	 */
> +	if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {

> +		device_id->segment_num = prot_err->agent_addr.segment;
> +		device_id->bus_num = prot_err->agent_addr.bus;
> +		device_id->device_num = prot_err->agent_addr.device;
> +		device_id->func_num = prot_err->agent_addr.function;
> +	} else {
> +		pr_err(FW_WARN "Not a valid agent type\n");
> +		return -EINVAL;
> +	}
> +
> +	info->p_err.cxl_ras = *(struct cxl_ras_capability_regs *)((long)prot_err + size);

Casting to a long isn't nice. Keep it as a pointer for this maths;
a u8 * or void * would work.  Particularly if you did it as something
a bit more self-documenting like

u8 *dvsec_start = (u8 *)(prot_err + 1);
u8 *cap_start = dvsec_start + prot_err->dvsec_len;

info->p_err.cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;

> +
> +	info->p_err.severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> +	return 0;
> +}


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ