[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250612175556.000036b5@huawei.com>
Date: Thu, 12 Jun 2025 17:55:56 +0100
From: Jonathan Cameron <Jonathan.Cameron@...wei.com>
To: Terry Bowman <terry.bowman@....com>
CC: <PradeepVineshReddy.Kodamati@....com>, <dave@...olabs.net>,
<dave.jiang@...el.com>, <alison.schofield@...el.com>,
<vishal.l.verma@...el.com>, <ira.weiny@...el.com>,
<dan.j.williams@...el.com>, <bhelgaas@...gle.com>, <bp@...en8.de>,
<ming.li@...omail.com>, <shiju.jose@...wei.com>, <dan.carpenter@...aro.org>,
<Smita.KoralahalliChannabasappa@....com>, <kobayashi.da-06@...itsu.com>,
<yanfei.xu@...el.com>, <rrichter@....com>, <peterz@...radead.org>,
<colyli@...e.de>, <uaisheng.ye@...el.com>,
<fabio.m.de.francesco@...ux.intel.com>, <ilpo.jarvinen@...ux.intel.com>,
<yazen.ghannam@....com>, <linux-cxl@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, <linux-pci@...r.kernel.org>
Subject: Re: [PATCH v9 12/16] cxl/pci: Introduce CXL Endpoint protocol error
handlers
On Tue, 3 Jun 2025 12:22:35 -0500
Terry Bowman <terry.bowman@....com> wrote:
> CXL Endpoint protocol errors are currently handled using PCI error
> handlers. The CXL Endpoint requires CXL specific handling in the case of
> uncorrectable error handling not provided by the PCI handlers.
>
> Add CXL specific handlers for CXL Endpoints. Rename the existing
> cxl_error_handlers to be pci_error_handlers to more correctly indicate
> the error type and follow naming consistency.
I wonder if a rename precursor patch would be better here than doing it
all in one go?
Having not read the patch description thoroughly this had me confused ;)
>
> Keep the existing PCI Endpoint handlers. PCI handlers can be called if the
> CXL device is not trained for alternate protocol (CXL). Update the CXL
> Endpoint PCI handlers to call the CXL handler. If the CXL uncorrectable
> handler returns PCI_ERS_RESULT_PANIC then the PCI handler invokes panic().
>
> Signed-off-by: Terry Bowman <terry.bowman@....com>
>
> static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
> {
> -
So this snuck in somewhere between upstream and here. If it is in this
set let's push the removal back to where it came from.
> return __cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->serial, cxlds->regs.ras);
> }
>
> @@ -844,14 +843,15 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
> static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
> #endif
>
> +pci_ers_result_t cxl_error_detected(struct device *dev)
> +{
> + struct pci_dev *pdev = to_pci_dev(dev);
> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> + struct device *cxlmd_dev = &cxlds->cxlmd->dev;
> + pci_ers_result_t ue;
> +
> + scoped_guard(device, cxlmd_dev) {
> if (!dev->driver) {
> dev_warn(&pdev->dev,
> "%s: memdev disabled, abort error handling\n",
> dev_name(dev));
> - return PCI_ERS_RESULT_DISCONNECT;
> + return PCI_ERS_RESULT_PANIC;
> }
>
> if (cxlds->rcd)
> @@ -892,29 +900,25 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
> ue = cxl_handle_endpoint_ras(cxlds);
> }
>
> -
Maybe something more in the patch description on why this chunk isn't relevant?
I guess we don't need the more complex handling as these are all panic :)
> - switch (state) {
> - case pci_channel_io_normal:
> - if (ue) {
> - device_release_driver(dev);
> - return PCI_ERS_RESULT_NEED_RESET;
> - }
> - return PCI_ERS_RESULT_CAN_RECOVER;
> - case pci_channel_io_frozen:
> - dev_warn(&pdev->dev,
> - "%s: frozen state error detected, disable CXL.mem\n",
> - dev_name(dev));
> - device_release_driver(dev);
> - return PCI_ERS_RESULT_NEED_RESET;
> - case pci_channel_io_perm_failure:
> - dev_warn(&pdev->dev,
> - "failure state error detected, request disconnect\n");
> - return PCI_ERS_RESULT_DISCONNECT;
> - }
> - return PCI_ERS_RESULT_NEED_RESET;
> + return ue;
> }
> EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
> static int cxl_flit_size(struct pci_dev *pdev)
> {
> if (cxl_pci_flit_256(pdev))
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 0ef8c2068c0c..664f532cc838 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -244,6 +244,8 @@ static struct pci_dev *sbdf_to_pci(struct cxl_prot_error_info *err_info)
> static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
> {
> struct pci_dev *pdev __free(pci_dev_put) = pci_dev_get(sbdf_to_pci(err_info));
> + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> + struct device *cxlmd_dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
>
> if (!pdev) {
> pr_err("Failed to find the CXL device\n");
> @@ -260,15 +262,13 @@ static void cxl_handle_prot_error(struct cxl_prot_error_info *err_info)
>
> if (err_info->severity == AER_CORRECTABLE) {
> int aer = pdev->aer_cap;
> - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> - struct device *dev __free(put_device) = get_device(&cxlds->cxlmd->dev);
This code move seems somewhat unrelated? Maybe it's in the wrong patch and
becomes necessary later?
>
> if (aer)
> pci_clear_and_set_config_dword(pdev,
> aer + PCI_ERR_COR_STATUS,
> 0, PCI_ERR_COR_INTERNAL);
>
> - cxl_cor_error_detected(pdev);
> + cxl_cor_error_detected(&pdev->dev);
>
> pcie_clear_device_status(pdev);
> } else {
Powered by blists - more mailing lists