[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <550635db-00ce-410e-add0-77c1a75adb11@gmail.com>
Date: Tue, 19 Aug 2025 22:12:41 +0800
From: Ethan Zhao <etzhao1900@...il.com>
To: Nicolin Chen <nicolinc@...dia.com>, robin.murphy@....com,
joro@...tes.org, bhelgaas@...gle.com, jgg@...dia.com
Cc: will@...nel.org, robin.clark@....qualcomm.com, yong.wu@...iatek.com,
matthias.bgg@...il.com, angelogioacchino.delregno@...labora.com,
thierry.reding@...il.com, vdumpa@...dia.com, jonathanh@...dia.com,
rafael@...nel.org, lenb@...nel.org, kevin.tian@...el.com,
yi.l.liu@...el.com, baolu.lu@...ux.intel.com,
linux-arm-kernel@...ts.infradead.org, iommu@...ts.linux.dev,
linux-kernel@...r.kernel.org, linux-arm-msm@...r.kernel.org,
linux-mediatek@...ts.infradead.org, linux-tegra@...r.kernel.org,
linux-acpi@...r.kernel.org, linux-pci@...r.kernel.org,
patches@...ts.linux.dev, pjaroszynski@...dia.com, vsethi@...dia.com,
helgaas@...nel.org
Subject: Re: [PATCH v3 5/5] pci: Suspend iommu function prior to resetting a
device
On 8/12/2025 6:59 AM, Nicolin Chen wrote:
> PCIe permits a device to ignore ATS invalidation TLPs, while processing a
> reset. This creates a problem visible to the OS where an ATS invalidation
> command will time out: e.g. an SVA domain will have no coordination with a
> reset event and can racily issue ATS invalidations to a resetting device.
>
> The PCIe spec in sec 10.3.1 IMPLEMENTATION NOTE recommends to disable and
> block ATS before initiating a Function Level Reset. It also mentions that
> other reset methods could have the same vulnerability as well.
>
> Now iommu_dev_reset_prepare/done() helpers are introduced for this matter.
> Use them in all the existing reset functions, which will attach the device
> to an IOMMU_DOMAIN_BLOCKED during a reset, so as to allow IOMMU driver to:
> - invoke pci_disable_ats() and pci_enable_ats(), if necessary
> - wait for all ATS invalidations to complete
> - stop issuing new ATS invalidations
> - fence any incoming ATS queries
>
> Signed-off-by: Nicolin Chen <nicolinc@...dia.com>
> ---
> drivers/pci/pci-acpi.c | 17 +++++++++--
> drivers/pci/pci.c | 68 ++++++++++++++++++++++++++++++++++++++----
> drivers/pci/quirks.c | 23 +++++++++++++-
> 3 files changed, 100 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
> index ddb25960ea47d..adaf46422c05d 100644
> --- a/drivers/pci/pci-acpi.c
> +++ b/drivers/pci/pci-acpi.c
> @@ -9,6 +9,7 @@
>
> #include <linux/delay.h>
> #include <linux/init.h>
> +#include <linux/iommu.h>
> #include <linux/irqdomain.h>
> #include <linux/pci.h>
> #include <linux/msi.h>
> @@ -969,6 +970,7 @@ void pci_set_acpi_fwnode(struct pci_dev *dev)
> int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
> {
> acpi_handle handle = ACPI_HANDLE(&dev->dev);
> + int ret = 0;
>
> if (!handle || !acpi_has_method(handle, "_RST"))
> return -ENOTTY;
> @@ -976,12 +978,23 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
> if (probe)
> return 0;
>
> + /*
> + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
> + * before initiating a reset. Notify the iommu driver that enabled ATS.
> + */
> + ret = iommu_dev_reset_prepare(&dev->dev);
> + if (ret) {
> + pci_err(dev, "failed to stop IOMMU\n");
> + return ret;
> + }
> +
> if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) {
> pci_warn(dev, "ACPI _RST failed\n");
> - return -ENOTTY;
> + ret = -ENOTTY;
> }
>
> - return 0;
> + iommu_dev_reset_done(&dev->dev);
> + return ret;
> }
>
> bool acpi_pci_power_manageable(struct pci_dev *dev)
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index b0f4d98036cdd..d6d87e22d81b3 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -13,6 +13,7 @@
> #include <linux/delay.h>
> #include <linux/dmi.h>
> #include <linux/init.h>
> +#include <linux/iommu.h>
> #include <linux/msi.h>
> #include <linux/of.h>
> #include <linux/pci.h>
> @@ -4529,13 +4530,26 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction);
> */
> int pcie_flr(struct pci_dev *dev)
> {
> + int ret = 0;
> +
> if (!pci_wait_for_pending_transaction(dev))
> pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n");
>
> + /*
> + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
> + * before initiating a reset. Notify the iommu driver that enabled ATS.
> + * Have to call it after waiting for pending DMA transaction.
> + */
> + ret = iommu_dev_reset_prepare(&dev->dev);
If we dont' consider the association between IOMMU and devices in FLR(),
it can be understood that more complex processing logic resides outside
this function. However, if the FLR() function already synchironizes and
handles the association with IOMMU like this (disabling ATS by attaching
device to blocking domain), then how would the following scenarios
behave ?
1. Reset one of PCIe alias devices.
2. Reset PF when its VFs are actvie.
....
Thanks,
Ethan> + if (ret) {
> + pci_err(dev, "failed to stop IOMMU\n");
> + return ret;
> + }
> +
> pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR);
>
> if (dev->imm_ready)
> - return 0;
> + goto done;
>
> /*
> * Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within
> @@ -4544,7 +4558,11 @@ int pcie_flr(struct pci_dev *dev)
> */
> msleep(100);
>
> - return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
> + ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
> +
> +done:
> + iommu_dev_reset_done(&dev->dev);
> + return ret;
> }
> EXPORT_SYMBOL_GPL(pcie_flr);
>
> @@ -4572,6 +4590,7 @@ EXPORT_SYMBOL_GPL(pcie_reset_flr);
>
> static int pci_af_flr(struct pci_dev *dev, bool probe)
> {
> + int ret = 0;
> int pos;
> u8 cap;
>
> @@ -4598,10 +4617,21 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
> PCI_AF_STATUS_TP << 8))
> pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n");
>
> + /*
> + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
> + * before initiating a reset. Notify the iommu driver that enabled ATS.
> + * Have to call it after waiting for pending DMA transaction.
> + */
> + ret = iommu_dev_reset_prepare(&dev->dev);
> + if (ret) {
> + pci_err(dev, "failed to stop IOMMU\n");
> + return ret;
> + }
> +
> pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR);
>
> if (dev->imm_ready)
> - return 0;
> + goto done;
>
> /*
> * Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006,
> @@ -4611,7 +4641,11 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
> */
> msleep(100);
>
> - return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
> + ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
> +
> +done:
> + iommu_dev_reset_done(&dev->dev);
> + return ret;
> }
>
> /**
> @@ -4632,6 +4666,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
> static int pci_pm_reset(struct pci_dev *dev, bool probe)
> {
> u16 csr;
> + int ret;
>
> if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET)
> return -ENOTTY;
> @@ -4646,6 +4681,16 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
> if (dev->current_state != PCI_D0)
> return -EINVAL;
>
> + /*
> + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
> + * before initiating a reset. Notify the iommu driver that enabled ATS.
> + */
> + ret = iommu_dev_reset_prepare(&dev->dev);
> + if (ret) {
> + pci_err(dev, "failed to stop IOMMU\n");
> + return ret;
> + }
> +
> csr &= ~PCI_PM_CTRL_STATE_MASK;
> csr |= PCI_D3hot;
> pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
> @@ -4656,7 +4701,9 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
> pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
> pci_dev_d3_sleep(dev);
>
> - return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
> + ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
> + iommu_dev_reset_done(&dev->dev);
> + return ret;
> }
>
> /**
> @@ -5111,6 +5158,16 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
> if (rc)
> return -ENOTTY;
>
> + /*
> + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
> + * before initiating a reset. Notify the iommu driver that enabled ATS.
> + */
> + rc = iommu_dev_reset_prepare(&dev->dev);
> + if (rc) {
> + pci_err(dev, "failed to stop IOMMU\n");
> + return rc;
> + }
> +
> if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) {
> val = reg;
> } else {
> @@ -5125,6 +5182,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
> pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
> reg);
>
> + iommu_dev_reset_done(&dev->dev);
> return rc;
> }
>
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index d97335a401930..6157c6c02bdb0 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -21,6 +21,7 @@
> #include <linux/pci.h>
> #include <linux/isa-dma.h> /* isa_dma_bridge_buggy */
> #include <linux/init.h>
> +#include <linux/iommu.h>
> #include <linux/delay.h>
> #include <linux/acpi.h>
> #include <linux/dmi.h>
> @@ -4225,6 +4226,26 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
> { 0 }
> };
>
> +static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe,
> + const struct pci_dev_reset_methods *i)
> +{
> + int ret;
> +
> + /*
> + * Per PCIe r6.3, sec 10.3.1 IMPLEMENTATION NOTE, software disables ATS
> + * before initiating a reset. Notify the iommu driver that enabled ATS.
> + */
> + ret = iommu_dev_reset_prepare(&dev->dev);
> + if (ret) {
> + pci_err(dev, "failed to stop IOMMU\n");
> + return ret;
> + }
> +
> + ret = i->reset(dev, probe);
> + iommu_dev_reset_done(&dev->dev);
> + return ret;
> +}
> +
> /*
> * These device-specific reset methods are here rather than in a driver
> * because when a host assigns a device to a guest VM, the host may need
> @@ -4239,7 +4260,7 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
> i->vendor == (u16)PCI_ANY_ID) &&
> (i->device == dev->device ||
> i->device == (u16)PCI_ANY_ID))
> - return i->reset(dev, probe);
> + return __pci_dev_specific_reset(dev, probe, i);
> }
>
> return -ENOTTY;
Powered by blists - more mailing lists