[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAMz9Wg-_0g2DrkjiUfyGwfYg_WgGTp9wiKoA_nR-AWtm1ixX4A@mail.gmail.com>
Date: Wed, 21 Jan 2026 13:35:27 +0800
From: AceLan Kao <acelan.kao@...onical.com>
To: Andreas Noever <andreas.noever@...il.com>, Mika Westerberg <westeri@...nel.org>,
Yehezkel Bernat <YehezkelShB@...il.com>, linux-usb@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH] thunderbolt: Fix PCIe device enumeration with delayed rescan
Chia-Lin Kao (AceLan) <acelan.kao@...onical.com> 於 2026年1月21日週三 下午1:27寫道:
>
> PCIe devices behind Thunderbolt tunnels may fail to enumerate when
> spurious hotplug events prevent pciehp from detecting link-up.
>
> Root cause:
>
> Spurious unplug events occur immediately after tunnel activation:
>
> [ 932.438] thunderbolt: acking hot unplug event on 702:2
> [ 932.852] thunderbolt: PCIe Up path activation complete
> [ 932.855] thunderbolt: hotplug event for upstream port 702:2 (unplug: 0)
> [ 932.855] thunderbolt: hotplug event for upstream port 702:2 (unplug: 1)
>
> These events disrupt pciehp timing, causing device enumeration to fail
> ~70% of the time on affected hardware. Manual PCI rescan succeeds,
> proving devices are present and functional on the bus.
>
> Solution:
>
> Schedule delayed work (300ms) after tunnel activation to:
> 1. Check if pciehp successfully enumerated devices (device count increased)
> 2. If not, trigger pci_rescan_bus() to discover devices manually
> 3. Log results for observability
>
> The delayed work approach is non-blocking and only rescans when actually
> needed, avoiding overhead on systems where pciehp works correctly.
>
> Signed-off-by: Chia-Lin Kao (AceLan) <acelan.kao@...onical.com>
> ---
> Logs: https://people.canonical.com/~acelan/bugs/tbt_storage/
> merged.out.bad: Plugged-in TBT storage, but eventually fails to enumerate
> merged.out.good: Plugged-in TBT storage, and successfully enumerates
> merged.out.patched: Plugged-in TBT storage, it should fail without this
> patch, but it works now
> ---
> drivers/thunderbolt/tb.c | 95 ++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 95 insertions(+)
>
> diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
> index 293fc9f258a5c..1cfc9a265c453 100644
> --- a/drivers/thunderbolt/tb.c
> +++ b/drivers/thunderbolt/tb.c
> @@ -11,6 +11,7 @@
> #include <linux/delay.h>
> #include <linux/pm_runtime.h>
> #include <linux/platform_data/x86/apple.h>
> +#include <linux/pci.h>
>
> #include "tb.h"
> #include "tb_regs.h"
> @@ -18,6 +19,7 @@
>
> #define TB_TIMEOUT 100 /* ms */
> #define TB_RELEASE_BW_TIMEOUT 10000 /* ms */
> +#define TB_PCIEHP_ENUMERATION_DELAY 300 /* ms */
>
> /*
> * How many time bandwidth allocation request from graphics driver is
> @@ -83,6 +85,16 @@ struct tb_hotplug_event {
> int retry;
> };
>
> +/* Delayed work to verify PCIe enumeration after tunnel activation */
> +struct tb_pci_rescan_work {
> + struct delayed_work work;
> + struct tb *tb;
> + struct pci_bus *bus;
> + int devices_before;
> + u64 route;
> + u8 port;
> +};
> +
> static void tb_scan_port(struct tb_port *port);
> static void tb_handle_hotplug(struct work_struct *work);
> static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
> @@ -90,6 +102,60 @@ static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
> static void tb_queue_dp_bandwidth_request(struct tb *tb, u64 route, u8 port,
> int retry, unsigned long delay);
>
> +static void tb_pci_rescan_work_fn(struct work_struct *work)
> +{
> + struct tb_pci_rescan_work *rescan_work =
> + container_of(work, typeof(*rescan_work), work.work);
> + struct tb *tb = rescan_work->tb;
> + struct pci_bus *bus = rescan_work->bus;
> + int devices_after = 0;
> + struct pci_dev *dev;
> + struct tb_switch *sw;
> + struct tb_port *port;
> +
> + mutex_lock(&tb->lock);
> +
> + sw = tb_switch_find_by_route(tb, rescan_work->route);
> + if (!sw) {
> + tb_dbg(tb, "Switch at route %llx disappeared, skipping rescan\n",
> + rescan_work->route);
> + goto out_unlock;
> + }
> +
> + port = &sw->ports[rescan_work->port];
> +
> + pci_lock_rescan_remove();
> + for_each_pci_dev(dev)
> + devices_after++;
> + pci_unlock_rescan_remove();
> +
> + if (devices_after > rescan_work->devices_before) {
> + tb_port_dbg(port, "pciehp enumerated %d new device(s)\n",
> + devices_after - rescan_work->devices_before);
> + } else {
> + tb_port_info(port, "pciehp failed to enumerate devices, triggering rescan\n");
> +
> + pci_lock_rescan_remove();
> + pci_rescan_bus(bus);
> +
> + devices_after = 0;
> + for_each_pci_dev(dev)
> + devices_after++;
> + pci_unlock_rescan_remove();
> +
> + if (devices_after > rescan_work->devices_before)
> + tb_port_info(port, "rescan found %d new device(s)\n",
> + devices_after - rescan_work->devices_before);
> + else
> + tb_port_warn(port, "no devices found even after rescan\n");
> + }
> +
> + tb_switch_put(sw);
> +out_unlock:
> + mutex_unlock(&tb->lock);
> + kfree(rescan_work);
> +}
> +
> static void tb_queue_hotplug(struct tb *tb, u64 route, u8 port, bool unplug)
> {
> struct tb_hotplug_event *ev;
> @@ -2400,6 +2466,35 @@ static int tb_tunnel_pci(struct tb *tb, struct tb_switch *sw)
> tb_sw_warn(sw, "failed to connect xHCI\n");
>
> list_add_tail(&tunnel->list, &tcm->tunnel_list);
> +
> + /* Verify pciehp enumeration; trigger rescan if needed */
> + if (tb->nhi && tb->nhi->pdev && tb->nhi->pdev->bus) {
> + struct pci_bus *bus = tb->nhi->pdev->bus;
> + struct pci_bus *scan_bus = bus->parent ? bus->parent : bus;
> + struct tb_pci_rescan_work *rescan_work;
> + struct pci_dev *dev;
> + int devices_before = 0;
> +
> + pci_lock_rescan_remove();
> + for_each_pci_dev(dev)
> + devices_before++;
> + pci_unlock_rescan_remove();
> +
> + rescan_work = kmalloc_obj(rescan_work, GFP_KERNEL);
Sorry, I didn't re-check this after checkpatch modified it.
kmalloc_obj() is undefined here.
I'll submit a v2 later.
> + if (!rescan_work)
> + return 0;
> +
> + rescan_work->tb = tb;
> + rescan_work->bus = scan_bus;
> + rescan_work->devices_before = devices_before;
> + rescan_work->route = tb_route(sw);
> + rescan_work->port = up->port;
> +
> + INIT_DELAYED_WORK(&rescan_work->work, tb_pci_rescan_work_fn);
> + queue_delayed_work(tb->wq, &rescan_work->work,
> + msecs_to_jiffies(TB_PCIEHP_ENUMERATION_DELAY));
> + }
> +
> return 0;
> }
>
> --
> 2.51.0
>
--
Chia-Lin Kao(AceLan)
http://blog.acelan.idv.tw/
E-Mail: acelan.kaoATcanonical.com (s/AT/@/)
Powered by blists - more mailing lists