linux-kernel - Re: [RESEND v13 23/25] CXL/PCI: Introduce CXL uncorrectable protocol error recovery

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251208184037.GA3421722@bhelgaas>
Date: Mon, 8 Dec 2025 12:40:37 -0600
From: Bjorn Helgaas <helgaas@...nel.org>
To: Terry Bowman <terry.bowman@....com>
Cc: dave@...olabs.net, jonathan.cameron@...wei.com, dave.jiang@...el.com,
	alison.schofield@...el.com, dan.j.williams@...el.com,
	bhelgaas@...gle.com, shiju.jose@...wei.com, ming.li@...omail.com,
	Smita.KoralahalliChannabasappa@....com, rrichter@....com,
	dan.carpenter@...aro.org, PradeepVineshReddy.Kodamati@....com,
	lukas@...ner.de, Benjamin.Cheatham@....com,
	sathyanarayanan.kuppuswamy@...ux.intel.com,
	linux-cxl@...r.kernel.org, alucerop@....com, ira.weiny@...el.com,
	linux-kernel@...r.kernel.org, linux-pci@...r.kernel.org
Subject: Re: [RESEND v13 23/25] CXL/PCI: Introduce CXL uncorrectable protocol
 error recovery

On Tue, Nov 04, 2025 at 11:03:03AM -0600, Terry Bowman wrote:
> Implement cxl_do_recovery() to handle uncorrectable protocol
> errors (UCE), following the design of pcie_do_recovery(). Unlike PCIe,
> all CXL UCEs are treated as fatal and trigger a kernel panic to avoid
> potential CXL memory corruption.
> 
> Add cxl_walk_port(), analogous to pci_walk_bridge(), to traverse the
> CXL topology from the error source through downstream CXL ports and
> endpoints.
> 
> Introduce cxl_report_error_detected(), mirroring PCI's
> report_error_detected(), and implement device locking for the affected
> subtree. Endpoints require locking the PCI device (pdev->dev) and the
> CXL memdev (cxlmd->dev). CXL ports require locking the PCI
> device (pdev->dev) and the parent CXL port.
> 
> The device locks should be taken early where possible. The initially
> reporting device will be locked after kfifo dequeue. Iterated devices
> will be locked in cxl_report_error_detected() and must lock the
> iterated devices except for the first device as it has already been
> locked.
> 
> Export pci_aer_clear_fatal_status() for use when a UCE is not present.
> 
> Signed-off-by: Terry Bowman <terry.bowman@....com>

Acked-by: Bjorn Helgaas <bhelgaas@...gle.com> # drivers/pci/

> ---
> 
> Changes in v12->v13:
> - Add guard() before calling cxl_pci_drv_bound() (Dave Jiang)
> - Add guard() calls for EP (cxlds->cxlmd->dev & pdev->dev) and ports
>   (pdev->dev & parent cxl_port) in cxl_report_error_detected() and
>   cxl_handle_proto_error() (Terry)
> - Remove unnecessary check for endpoint port. (Dave Jiang)
> - Remove check for RCIEP EP in cxl_report_error_detected(). (Terry)
> 
> Changes in v11->v12:
> - Clean up port discovery in cxl_do_recovery() (Dave)
> - Add PCI_EXP_TYPE_RC_END to type check in cxl_report_error_detected()
> 
> Changes in v10->v11:
> - pci_ers_merge_results() - Move to earlier patch
> ---
>  drivers/cxl/core/ras.c | 135 ++++++++++++++++++++++++++++++++++++++++-
>  drivers/pci/pci.h      |   1 -
>  drivers/pci/pcie/aer.c |   1 +
>  include/linux/aer.h    |   2 +
>  4 files changed, 135 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
> index 5bc144cde0ee..52c6f19564b6 100644
> --- a/drivers/cxl/core/ras.c
> +++ b/drivers/cxl/core/ras.c
> @@ -259,8 +259,138 @@ static void device_unlock_if(struct device *dev, bool take)
>  		device_unlock(dev);
>  }
>  
> +/**
> + * cxl_report_error_detected
> + * @dev: Device being reported
> + * @data: Result
> + * @err_pdev: Device with initial detected error. Is locked immediately
> + *            after KFIFO dequeue.
> + */
> +static int cxl_report_error_detected(struct device *dev, void *data, struct pci_dev *err_pdev)
> +{
> +	bool need_lock = (dev != &err_pdev->dev);
> +	pci_ers_result_t vote, *result = data;
> +	struct pci_dev *pdev;
> +
> +	if (!dev || !dev_is_pci(dev))
> +		return 0;
> +	pdev = to_pci_dev(dev);
> +
> +	device_lock_if(&pdev->dev, need_lock);
> +	if (is_pcie_endpoint(pdev) && !cxl_pci_drv_bound(pdev)) {
> +		device_unlock_if(&pdev->dev, need_lock);
> +		return PCI_ERS_RESULT_NONE;
> +	}
> +
> +	if (pdev->aer_cap)
> +		pci_clear_and_set_config_dword(pdev,
> +					       pdev->aer_cap + PCI_ERR_COR_STATUS,
> +					       0, PCI_ERR_COR_INTERNAL);
> +
> +	if (is_pcie_endpoint(pdev)) {
> +		struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
> +
> +		device_lock_if(&cxlds->cxlmd->dev, need_lock);
> +		vote = cxl_error_detected(&cxlds->cxlmd->dev);
> +		device_unlock_if(&cxlds->cxlmd->dev, need_lock);
> +	} else {
> +		vote = cxl_port_error_detected(dev);
> +	}
> +
> +	pcie_clear_device_status(pdev);
> +	*result = pcie_ers_merge_result(*result, vote);
> +	device_unlock_if(&pdev->dev, need_lock);
> +
> +	return 0;
> +}
> +
> +static int match_port_by_parent_dport(struct device *dev, const void *dport_dev)
> +{
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_port(dev))
> +		return 0;
> +
> +	port = to_cxl_port(dev);
> +
> +	return port->parent_dport->dport_dev == dport_dev;
> +}
> +
> +/**
> + * cxl_walk_port
> + *
> + * @port: Port be traversed into
> + * @cb: Callback for handling the CXL Ports
> + * @userdata: Result
> + * @err_pdev: Device with initial detected error. Is locked immediately
> + *            after KFIFO dequeue.
> + */
> +static void cxl_walk_port(struct cxl_port *port,
> +			  int (*cb)(struct device *, void *, struct pci_dev *),
> +			  void *userdata,
> +			  struct pci_dev *err_pdev)
> +{
> +	struct cxl_port *err_port __free(put_cxl_port) = get_cxl_port(err_pdev);
> +	bool need_lock = (port != err_port);
> +	struct cxl_dport *dport = NULL;
> +	unsigned long index;
> +
> +	device_lock_if(&port->dev, need_lock);
> +	if (is_cxl_endpoint(port)) {
> +		cb(port->uport_dev->parent, userdata, err_pdev);
> +		device_unlock_if(&port->dev, need_lock);
> +		return;
> +	}
> +
> +	if (port->uport_dev && dev_is_pci(port->uport_dev))
> +		cb(port->uport_dev, userdata, err_pdev);
> +
> +	/*
> +	 * Iterate over the set of Downstream Ports recorded in port->dports (XArray):
> +	 *  - For each dport, attempt to find a child CXL Port whose parent dport
> +	 *    match.
> +	 *  - Invoke the provided callback on the dport's device.
> +	 *  - If a matching child CXL Port device is found, recurse into that port to
> +	 *    continue the walk.
> +	 */
> +	xa_for_each(&port->dports, index, dport)
> +	{
> +		struct device *child_port_dev __free(put_device) =
> +			bus_find_device(&cxl_bus_type, &port->dev, dport->dport_dev,
> +					match_port_by_parent_dport);
> +
> +		cb(dport->dport_dev, userdata, err_pdev);
> +		if (child_port_dev)
> +			cxl_walk_port(to_cxl_port(child_port_dev), cb, userdata, err_pdev);
> +	}
> +	device_unlock_if(&port->dev, need_lock);
> +}
> +
>  static void cxl_do_recovery(struct pci_dev *pdev)
>  {
> +	pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
> +	struct cxl_port *port __free(put_cxl_port) = get_cxl_port(pdev);
> +
> +	if (!port) {
> +		pci_err(pdev, "Failed to find the CXL device\n");
> +		return;
> +	}
> +
> +	cxl_walk_port(port, cxl_report_error_detected, &status, pdev);
> +	if (status == PCI_ERS_RESULT_PANIC)
> +		panic("CXL cachemem error.");
> +
> +	/*
> +	 * If we have native control of AER, clear error status in the device
> +	 * that detected the error.  If the platform retained control of AER,
> +	 * it is responsible for clearing this status.  In that case, the
> +	 * signaling device may not even be visible to the OS.
> +	 */
> +	if (cxl_error_is_native(pdev)) {
> +		pcie_clear_device_status(pdev);
> +		pci_aer_clear_nonfatal_status(pdev);
> +		pci_aer_clear_fatal_status(pdev);
> +	}
>  }
>  
>  void cxl_handle_cor_ras(struct device *dev, u64 serial, void __iomem *ras_base)
> @@ -483,16 +613,15 @@ static void cxl_proto_err_work_fn(struct work_struct *work)
>  			if (!cxl_pci_drv_bound(pdev))
>  				return;
>  			cxlmd_dev = &cxlds->cxlmd->dev;
> -			device_lock_if(cxlmd_dev, cxlmd_dev);
>  		} else {
>  			cxlmd_dev = NULL;
>  		}
>  
> +		/* Lock the CXL parent Port */
>  		struct cxl_port *port __free(put_cxl_port) = get_cxl_port(pdev);
> -		if (!port)
> -			return;
>  		guard(device)(&port->dev);
>  
> +		device_lock_if(cxlmd_dev, cxlmd_dev);
>  		cxl_handle_proto_error(&wd);
>  		device_unlock_if(cxlmd_dev, cxlmd_dev);
>  	}
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 2af6ea82526d..3637996d37ab 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -1174,7 +1174,6 @@ void pci_restore_aer_state(struct pci_dev *dev);
>  static inline void pci_no_aer(void) { }
>  static inline void pci_aer_init(struct pci_dev *d) { }
>  static inline void pci_aer_exit(struct pci_dev *d) { }
> -static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
>  static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; }
>  static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
>  static inline void pci_save_aer_state(struct pci_dev *dev) { }
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index e806fa05280b..4cf44297bb24 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -297,6 +297,7 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev)
>  	if (status)
>  		pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status);
>  }
> +EXPORT_SYMBOL_GPL(pci_aer_clear_fatal_status);
>  
>  /**
>   * pci_aer_raw_clear_status - Clear AER error registers.
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index 6b2c87d1b5b6..64aef69fb546 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -66,6 +66,7 @@ struct cxl_proto_err_work_data {
>  
>  #if defined(CONFIG_PCIEAER)
>  int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
> +void pci_aer_clear_fatal_status(struct pci_dev *dev);
>  int pcie_aer_is_native(struct pci_dev *dev);
>  void pci_aer_unmask_internal_errors(struct pci_dev *dev);
>  #else
> @@ -73,6 +74,7 @@ static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>  {
>  	return -EINVAL;
>  }
> +static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { }
>  static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
>  static inline void pci_aer_unmask_internal_errors(struct pci_dev *dev) { }
>  #endif
> -- 
> 2.34.1
>