[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <8ad1dec8-79eb-4189-aab0-cd62a1ed0c37@linux.alibaba.com>
Date: Mon, 1 Dec 2025 20:21:51 +0800
From: "guanghuifeng@...ux.alibaba.com" <guanghuifeng@...ux.alibaba.com>
To: Ilpo Järvinen <ilpo.jarvinen@...ux.intel.com>
Cc: bhelgaas@...gle.com, linux-pci@...r.kernel.org,
LKML <linux-kernel@...r.kernel.org>, kanie@...ux.alibaba.com,
alikernel-developer@...ux.alibaba.com
Subject: Re: [PATCH v2] PCI: Fix PCIe SBR dev/link wait error
在 2025/12/1 17:21, Ilpo Järvinen 写道:
> On Sun, 30 Nov 2025, Guanghui Feng wrote:
>
>> When executing a PCIe secondary bus reset, all downstream switches and
>> endpoints will generate reset events. Simultaneously, all PCIe links
>> will undergo retraining, and each link will independently re-execute the
>> LTSSM state machine training. Therefore, after executing the SBR, it is
>> necessary to wait for all downstream links and devices to complete
>> recovery. Otherwise, after the SBR returns, accessing devices with some
>> links or endpoints not yet fully recovered may result in driver errors,
>> or even trigger device offline issues.
>>
>> Signed-off-by: Guanghui Feng <guanghuifeng@...ux.alibaba.com>
>> Reviewed-by: Guixin Liu <kanie@...ux.alibaba.com>
>> ---
> Hi,
>
> In future, when posting an update, please always explain here
> below the --- line what was changed between the versions.
OK
>> drivers/pci/pci.c | 138 ++++++++++++++++++++++++++++++++--------------
>> 1 file changed, 97 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index b14dd064006c..76afecb11164 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -4788,6 +4788,63 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>> return max(min_delay, max_delay);
>> }
>>
>> +static int pci_readiness_check(struct pci_dev *pdev, struct pci_dev *child,
>> + unsigned long start_t, char *reset_type)
>> +{
>> + int elapsed = jiffies_to_msecs(jiffies - start_t);
>> +
>> + if (pci_dev_is_disconnected(pdev) || pci_dev_is_disconnected(child))
>> + return 0;
>> +
>> + if (pcie_get_speed_cap(pdev) <= PCIE_SPEED_5_0GT) {
>> + u16 status;
>> +
>> + pci_dbg(pdev, "waiting %d ms for downstream link\n", elapsed);
>> +
>> + if (!pci_dev_wait(child, reset_type, 0))
>> + return 0;
>> +
>> + if (PCI_RESET_WAIT > elapsed)
>> + return PCI_RESET_WAIT - elapsed;
>> +
>> + /*
>> + * If the port supports active link reporting we now check
>> + * whether the link is active and if not bail out early with
>> + * the assumption that the device is not present anymore.
>> + */
>> + if (!pdev->link_active_reporting)
>> + return -ENOTTY;
>> +
>> + pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &status);
>> + if (!(status & PCI_EXP_LNKSTA_DLLLA))
>> + return -ENOTTY;
>> +
>> + if (!pci_dev_wait(child, reset_type, 0))
>> + return 0;
>> +
>> + if (PCIE_RESET_READY_POLL_MS > elapsed)
>> + return PCIE_RESET_READY_POLL_MS - elapsed;
>> +
>> + return -ENOTTY;
>> + }
>> +
>> + pci_dbg(pdev, "waiting %d ms for downstream link, after activation\n",
>> + elapsed);
>> + if (!pcie_wait_for_link_delay(pdev, true, 0)) {
>> + /* Did not train, no need to wait any further */
>> + pci_info(pdev, "Data Link Layer Link Active not set in %d msec\n", elapsed);
>> + return -ENOTTY;
>> + }
>> +
>> + if (!pci_dev_wait(child, reset_type, 0))
>> + return 0;
>> +
>> + if (PCIE_RESET_READY_POLL_MS > elapsed)
>> + return PCIE_RESET_READY_POLL_MS - elapsed;
>> +
>> + return -ENOTTY;
>> +}
>> +
>> /**
>> * pci_bridge_wait_for_secondary_bus - Wait for secondary bus to be accessible
>> * @dev: PCI bridge
>> @@ -4802,12 +4859,14 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>> * 4.3.2.
>> *
>> * Return 0 on success or -ENOTTY if the first device on the secondary bus
>> - * failed to become accessible.
>> + * failed to become accessible or a value greater than 0 indicates the
>> + * left required waiting time..
>> */
>> -int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> +static int __pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, unsigned long start_t,
>> + char *reset_type)
>> {
>> - struct pci_dev *child __free(pci_dev_put) = NULL;
>> - int delay;
>> + struct pci_dev *child;
>> + int delay, ret, elapsed = jiffies_to_msecs(jiffies - start_t);
>>
>> if (pci_dev_is_disconnected(dev))
>> return 0;
>> @@ -4835,8 +4894,6 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> return 0;
>> }
>>
>> - child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
>> - struct pci_dev, bus_list));
>> up_read(&pci_bus_sem);
>>
>> /*
>> @@ -4844,8 +4901,10 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> * accessing the device after reset (that is 1000 ms + 100 ms).
>> */
>> if (!pci_is_pcie(dev)) {
>> - pci_dbg(dev, "waiting %d ms for secondary bus\n", 1000 + delay);
>> - msleep(1000 + delay);
>> + if (1000 + delay > elapsed)
>> + return 1000 + delay - elapsed;
>> +
>> + pci_dbg(dev, "waiting %d ms for secondary bus\n", elapsed);
>> return 0;
>> }
>>
>> @@ -4867,41 +4926,40 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> if (!pcie_downstream_port(dev))
>> return 0;
>>
>> - if (pcie_get_speed_cap(dev) <= PCIE_SPEED_5_0GT) {
>> - u16 status;
>> -
>> - pci_dbg(dev, "waiting %d ms for downstream link\n", delay);
>> - msleep(delay);
>> -
>> - if (!pci_dev_wait(child, reset_type, PCI_RESET_WAIT - delay))
>> - return 0;
>> + if (delay > elapsed)
>> + return delay - elapsed;
>>
>> + down_read(&pci_bus_sem);
>> + list_for_each_entry(child, &dev->subordinate->devices, bus_list) {
>> /*
>> - * If the port supports active link reporting we now check
>> - * whether the link is active and if not bail out early with
>> - * the assumption that the device is not present anymore.
>> + * Check if all devices under the same bus have completed
>> + * the reset process, including multifunction devices in
>> + * the same bus.
>> */
>> - if (!dev->link_active_reporting)
>> - return -ENOTTY;
>> + ret = pci_readiness_check(dev, child, start_t, reset_type);
>>
>> - pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &status);
>> - if (!(status & PCI_EXP_LNKSTA_DLLLA))
>> - return -ENOTTY;
>> + if (ret == 0 && child->subordinate)
>> + ret = __pci_bridge_wait_for_secondary_bus(child, start_t, reset_type);
>>
>> - return pci_dev_wait(child, reset_type,
>> - PCIE_RESET_READY_POLL_MS - PCI_RESET_WAIT);
>> + if(ret)
>> + break;
>> }
>> + up_read(&pci_bus_sem);
>>
>> - pci_dbg(dev, "waiting %d ms for downstream link, after activation\n",
>> - delay);
>> - if (!pcie_wait_for_link_delay(dev, true, delay)) {
>> - /* Did not train, no need to wait any further */
>> - pci_info(dev, "Data Link Layer Link Active not set in %d msec\n", delay);
>> - return -ENOTTY;
>> - }
>> + return ret;
>> +}
>>
>> - return pci_dev_wait(child, reset_type,
>> - PCIE_RESET_READY_POLL_MS - delay);
>> +int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>> +{
>> + int res = 0;
>> + unsigned long start_t = jiffies;
>> +
>> + do {
>> + msleep(res);
>> + res = __pci_bridge_wait_for_secondary_bus(dev, start_t, reset_type);
>> + } while (res > 0);
>> +
>> + return res;
>> }
>>
>> void pci_reset_secondary_bus(struct pci_dev *dev)
>> @@ -5542,10 +5600,8 @@ static void pci_bus_restore_locked(struct pci_bus *bus)
>>
>> list_for_each_entry(dev, &bus->devices, bus_list) {
>> pci_dev_restore(dev);
>> - if (dev->subordinate) {
>> - pci_bridge_wait_for_secondary_bus(dev, "bus reset");
>> + if (dev->subordinate)
>> pci_bus_restore_locked(dev->subordinate);
>> - }
>> }
>> }
> ???
>
> Unfortunately, this takes a wrong turn and is very much against the
> feedback I gave to you.
1. The implementation of
pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus
has been modified to wait for all downstream switches/link/devices
to complete their recovery
before returning.
2. Therefore, when pci_bus_restore_locked is called via __pci_reset_bus,
the waiting process has
already been completed via
pci_bridge_secondary_bus_reset/pci_bridge_wait_for_secondary_bus,
so pci_bus_restore_locked does not require additional waiting.
>>
>> @@ -5575,14 +5631,14 @@ static void pci_slot_restore_locked(struct pci_slot *slot)
>> {
>> struct pci_dev *dev;
>>
>> + pci_bridge_wait_for_secondary_bus(slot->bus->self, "slot reset");
>> +
>> list_for_each_entry(dev, &slot->bus->devices, bus_list) {
>> if (!dev->slot || dev->slot != slot)
>> continue;
>> pci_dev_restore(dev);
>> - if (dev->subordinate) {
>> - pci_bridge_wait_for_secondary_bus(dev, "slot reset");
>> + if (dev->subordinate)
>> pci_bus_restore_locked(dev->subordinate);
>> - }
1. When pci_slot_restore_locked is called for a PCIe slot reset,
pci_bridge_wait_for_secondary_bus
has already been executed to wait for all switches, links, and
devices to complete their recovery firstly
>
Powered by blists - more mailing lists