linux-kernel - Re: [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error handlers

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250612175556.000036b5@huawei.com>
Date: Thu, 12 Jun 2025 17:55:56 +0100
From: Jonathan Cameron <Jonathan.Cameron@...wei.com>
To: Terry Bowman <terry.bowman@....com>
CC: <PradeepVineshReddy.Kodamati@....com>, <dave@...olabs.net>,
	<dave.jiang@...el.com>, <alison.schofield@...el.com>,
	<vishal.l.verma@...el.com>, <ira.weiny@...el.com>,
	<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <bp@...en8.de>,
	<ming.li@...omail.com>, <shiju.jose@...wei.com>, <dan.carpenter@...aro.org>,
	<Smita.KoralahalliChannabasappa@....com>, <kobayashi.da-06@...itsu.com>,
	<yanfei.xu@...el.com>, <rrichter@....com>, <peterz@...radead.org>,
	<colyli@...e.de>, <uaisheng.ye@...el.com>,
	<fabio.m.de.francesco@...ux.intel.com>, <ilpo.jarvinen@...ux.intel.com>,
	<yazen.ghannam@....com>, <linux-cxl@...r.kernel.org>,
	<linux-kernel@...r.kernel.org>, <linux-pci@...r.kernel.org>
Subject: Re: [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error
 handlers

On Tue, 3 Jun 2025 12:22:35 -0500
Terry Bowman <terry.bowman@....com> wrote:

> CXL Endpoint protocol errors are currently handled using PCI error
> handlers. The CXL Endpoint requires CXL specific handling in the case of
> uncorrectable error handling not provided by the PCI handlers.
> 
> Add CXL specific handlers for CXL Endpoints. Rename the existing
> cxl_error_handlers to be pci_error_handlers to more correctly indicate
> the error type and follow naming consistency.

I wonder if a rename precursor patch would be better here than doing it
all in one go?

Having not read the patch description thoroughly, this had me confused ;)

> 
> Keep the existing PCI Endpoint handlers. PCI handlers can be called if the
> CXL device is not trained for alternate protocol (CXL). Update the CXL
> Endpoint PCI handlers to call the CXL handler. If the CXL uncorrectable
> handler returns PCI_ERS_RESULT_PANIC then the PCI handler invokes panic().
> 
> Signed-off-by: Terry Bowman <terry.bowman@....com>


>  
>  static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
>  {
> -

So this blank line snuck in somewhere between upstream and here.  If it was
added earlier in this set, let's push the removal back to the patch that
introduced it.

>  	return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
>  }
>  
> @@ -844,14 +843,15 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
>  static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
>  #endif
>
> +pci_ers_result_t cxl_error_detected(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct device *cxlmd_dev = &cxlds->cxlmd->dev;
> +	pci_ers_result_t ue;
> +
> +	scoped_guard(device, cxlmd_dev) {
>  		if (!dev->driver) {
>  			dev_warn(&pdev->dev,
>  				 "%s: memdev disabled, abort error handling\n",
>  				 dev_name(dev));
> -			return PCI_ERS_RESULT_DISCONNECT;
> +			return PCI_ERS_RESULT_PANIC;
>  		}
>  
>  		if (cxlds->rcd)
> @@ -892,29 +900,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
>  		ue = cxl_handle_endpoint_ras(cxlds);
>  	}
>  
> -

Maybe add something more to the patch description on why this chunk is no longer relevant?
I guess we don't need the more complex handling as these are all panic :)

> -	switch (state) {
> -	case pci_channel_io_normal:
> -		if (ue) {
> -			device_release_driver(dev);
> -			return PCI_ERS_RESULT_NEED_RESET;
> -		}
> -		return PCI_ERS_RESULT_CAN_RECOVER;
> -	case pci_channel_io_frozen:
> -		dev_warn(&pdev->dev,
> -			 "%s: frozen state error detected, disable CXL.mem\n",
> -			 dev_name(dev));
> -		device_release_driver(dev);
> -		return PCI_ERS_RESULT_NEED_RESET;
> -	case pci_channel_io_perm_failure:
> -		dev_warn(&pdev->dev,
> -			 "failure state error detected, request disconnect\n");
> -		return PCI_ERS_RESULT_DISCONNECT;
> -	}
> -	return PCI_ERS_RESULT_NEED_RESET;
> +	return ue;
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");

>  static int cxl_flit_size(struct pci_dev *pdev)
>  {
>  	if (cxl_pci_flit_256(pdev))
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 0ef8c2068c0c..664f532cc838 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c

> @@ -244,6 +244,8 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
>  static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>  {
>  	struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
> +	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +	struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>  
>  	if (!pdev) {
>  		pr_err("Failed to find the CXL device\n");
> @@ -260,15 +262,13 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>  
>  	if (err_info->severity == AER_CORRECTABLE) {
>  		int aer = pdev->aer_cap;
> -		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> -		struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);

This code move seems somewhat unrelated?  Maybe it's in the wrong patch and
becomes necessary later?

>  
>  		if (aer)
>  			pci_clear_and_set_config_dword(pdev,
>  						       aer + PCI_ERR_COR_STATUS,
>  						       0, PCI_ERR_COR_INTERNAL);
>  
> -		cxl_cor_error_detected(pdev);
> +		cxl_cor_error_detected(&pdev->dev);
>  
>  		pcie_clear_device_status(pdev);
>  	} else {