lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAMz9Wg-_0g2DrkjiUfyGwfYg_WgGTp9wiKoA_nR-AWtm1ixX4A@mail.gmail.com>
Date: Wed, 21 Jan 2026 13:35:27 +0800
From: AceLan Kao <acelan.kao@...onical.com>
To: Andreas Noever <andreas.noever@...il.com>, Mika Westerberg <westeri@...nel.org>, 
	Yehezkel Bernat <YehezkelShB@...il.com>, linux-usb@...r.kernel.org, 
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH] thunderbolt: Fix PCIe device enumeration with delayed rescan

Chia-Lin Kao (AceLan) <acelan.kao@...onical.com> 於 2026年1月21日週三 下午1:27寫道:
>
> PCIe devices behind Thunderbolt tunnels may fail to enumerate when
> spurious hotplug events prevent pciehp from detecting link-up.
>
> Root cause:
>
> Spurious unplug events occur immediately after tunnel activation:
>
>   [  932.438] thunderbolt: acking hot unplug event on 702:2
>   [  932.852] thunderbolt: PCIe Up path activation complete
>   [  932.855] thunderbolt: hotplug event for upstream port 702:2
>             (unplug: 0)
>   [  932.855] thunderbolt: hotplug event for upstream port 702:2
>             (unplug: 1)
>
> These events disrupt pciehp timing, causing device enumeration to fail
> ~70% of the time on affected hardware. Manual PCI rescan succeeds,
> proving devices are present and functional on the bus.
>
> Solution:
>
> Schedule delayed work (300ms) after tunnel activation to:
> 1. Check if pciehp successfully enumerated devices (device count increased)
> 2. If not, trigger pci_rescan_bus() to discover devices manually
> 3. Log results for observability
>
> The delayed work approach is non-blocking and only rescans when actually
> needed, avoiding overhead on systems where pciehp works correctly.
>
> Signed-off-by: Chia-Lin Kao (AceLan) <acelan.kao@...onical.com>
> ---
> Logs: https://people.canonical.com/~acelan/bugs/tbt_storage/
> merged.out.bad: TBT storage plugged in, but it eventually fails to enumerate
> merged.out.good: TBT storage plugged in, and it enumerates successfully
> merged.out.patched: TBT storage plugged in; enumeration would have failed
>                     without this patch, but succeeds with it
> ---
>  drivers/thunderbolt/tb.c | 95 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 95 insertions(+)
>
> diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
> index 293fc9f258a5c..1cfc9a265c453 100644
> --- a/drivers/thunderbolt/tb.c
> +++ b/drivers/thunderbolt/tb.c
> @@ -11,6 +11,7 @@
>  #include <linux/delay.h>
>  #include <linux/pm_runtime.h>
>  #include <linux/platform_data/x86/apple.h>
> +#include <linux/pci.h>
>
>  #include "tb.h"
>  #include "tb_regs.h"
> @@ -18,6 +19,7 @@
>
>  #define TB_TIMEOUT             100     /* ms */
>  #define TB_RELEASE_BW_TIMEOUT  10000   /* ms */
> +#define TB_PCIEHP_ENUMERATION_DELAY 300        /* ms */
>
>  /*
>   * How many time bandwidth allocation request from graphics driver is
> @@ -83,6 +85,16 @@ struct tb_hotplug_event {
>         int retry;
>  };
>
> +/* Delayed work to verify PCIe enumeration after tunnel activation */
> +struct tb_pci_rescan_work {
> +       struct delayed_work work;
> +       struct tb *tb;
> +       struct pci_bus *bus;
> +       int devices_before;
> +       u64 route;
> +       u8 port;
> +};
> +
>  static void tb_scan_port(struct tb_port *port);
>  static void tb_handle_hotplug(struct work_struct *work);
>  static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
> @@ -90,6 +102,60 @@ static void tb_dp_resource_unavailable(struct tb *tb, struct tb_port *port,
>  static void tb_queue_dp_bandwidth_request(struct tb *tb, u64 route, u8 port,
>                                           int retry, unsigned long delay);
>
> +static void tb_pci_rescan_work_fn(struct work_struct *work)
> +{
> +       struct tb_pci_rescan_work *rescan_work =
> +               container_of(work, typeof(*rescan_work), work.work);
> +       struct tb *tb = rescan_work->tb;
> +       struct pci_bus *bus = rescan_work->bus;
> +       int devices_after = 0;
> +       struct pci_dev *dev;
> +       struct tb_switch *sw;
> +       struct tb_port *port;
> +
> +       mutex_lock(&tb->lock);
> +
> +       sw = tb_switch_find_by_route(tb, rescan_work->route);
> +       if (!sw) {
> +               tb_dbg(tb, "Switch at route %llx disappeared, skipping rescan\n",
> +                      rescan_work->route);
> +               goto out_unlock;
> +       }
> +
> +       port = &sw->ports[rescan_work->port];
> +
> +       pci_lock_rescan_remove();
> +       for_each_pci_dev(dev)
> +               devices_after++;
> +       pci_unlock_rescan_remove();
> +
> +       if (devices_after > rescan_work->devices_before) {
> +               tb_port_dbg(port, "pciehp enumerated %d new device(s)\n",
> +                           devices_after - rescan_work->devices_before);
> +       } else {
> +               tb_port_info(port, "pciehp failed to enumerate devices, triggering rescan\n");
> +
> +               pci_lock_rescan_remove();
> +               pci_rescan_bus(bus);
> +
> +               devices_after = 0;
> +               for_each_pci_dev(dev)
> +                       devices_after++;
> +               pci_unlock_rescan_remove();
> +
> +               if (devices_after > rescan_work->devices_before)
> +                       tb_port_info(port, "rescan found %d new device(s)\n",
> +                                    devices_after - rescan_work->devices_before);
> +               else
> +                       tb_port_warn(port, "no devices found even after rescan\n");
> +       }
> +
> +       tb_switch_put(sw);
> +out_unlock:
> +       mutex_unlock(&tb->lock);
> +       kfree(rescan_work);
> +}
> +
>  static void tb_queue_hotplug(struct tb *tb, u64 route, u8 port, bool unplug)
>  {
>         struct tb_hotplug_event *ev;
> @@ -2400,6 +2466,35 @@ static int tb_tunnel_pci(struct tb *tb, struct tb_switch *sw)
>                 tb_sw_warn(sw, "failed to connect xHCI\n");
>
>         list_add_tail(&tunnel->list, &tcm->tunnel_list);
> +
> +       /* Verify pciehp enumeration; trigger rescan if needed */
> +       if (tb->nhi && tb->nhi->pdev && tb->nhi->pdev->bus) {
> +               struct pci_bus *bus = tb->nhi->pdev->bus;
> +               struct pci_bus *scan_bus = bus->parent ? bus->parent : bus;
> +               struct tb_pci_rescan_work *rescan_work;
> +               struct pci_dev *dev;
> +               int devices_before = 0;
> +
> +               pci_lock_rescan_remove();
> +               for_each_pci_dev(dev)
> +                       devices_before++;
> +               pci_unlock_rescan_remove();
> +
> +               rescan_work = kmalloc_obj(rescan_work, GFP_KERNEL);
Sorry, I didn't re-check the patch after checkpatch modified it.
kmalloc_obj() is undefined here.
I'll submit a v2 later.

> +               if (!rescan_work)
> +                       return 0;
> +
> +               rescan_work->tb = tb;
> +               rescan_work->bus = scan_bus;
> +               rescan_work->devices_before = devices_before;
> +               rescan_work->route = tb_route(sw);
> +               rescan_work->port = up->port;
> +
> +               INIT_DELAYED_WORK(&rescan_work->work, tb_pci_rescan_work_fn);
> +               queue_delayed_work(tb->wq, &rescan_work->work,
> +                                  msecs_to_jiffies(TB_PCIEHP_ENUMERATION_DELAY));
> +       }
> +
>         return 0;
>  }
>
> --
> 2.51.0
>


-- 
Chia-Lin Kao(AceLan)
http://blog.acelan.idv.tw/
E-Mail: acelan.kaoATcanonical.com (s/AT/@/)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ