linux-kernel - [PATCH v2 3/3] PCI: Clean up NUMA-node awareness in pci_bus

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20260122145208.1013-4-guojinhui.liam@bytedance.com>
Date: Thu, 22 Jan 2026 22:52:08 +0800
From: "Jinhui Guo" <guojinhui.liam@...edance.com>
To: <dakr@...nel.org>, <alexanderduyck@...com>, <bhelgaas@...gle.com>, 
	<bvanassche@....org>, <dan.j.williams@...el.com>, 
	<gregkh@...uxfoundation.org>, <helgaas@...nel.org>, <rafael@...nel.org>, 
	<tj@...nel.org>, <frederic@...nel.org>
Cc: <guojinhui.liam@...edance.com>, <linux-kernel@...r.kernel.org>, 
	<linux-pci@...r.kernel.org>
Subject: [PATCH v2 3/3] PCI: Clean up NUMA-node awareness in pci_bus_type probe

With NUMA-node-aware probing now handled by the driver core,
the equivalent code in the PCI bus driver's pci_call_probe()
is redundant and can be removed.

Dropping it speeds up asynchronous probe by 35%; the gain
comes from eliminating the queue_work_on()/flush_work() sequence
(the work_on_cpu() equivalent) in pci_call_probe(), which
previously funneled every probe for a given NUMA node onto the
same CPU, forcing serial probe of devices on that node.

Testing three NVMe devices on the same NUMA node of an AMD
EPYC 9A64 2.4 GHz processor shows a 35% probe-time improvement
with the patch:

Before (all on CPU 0):
  nvme 0000:01:00.0: CPU: 0, COMM: kworker/0:1, cost: 52266334ns
  nvme 0000:02:00.0: CPU: 0, COMM: kworker/0:0, cost: 50787194ns
  nvme 0000:03:00.0: CPU: 0, COMM: kworker/0:2, cost: 50541584ns

After (spread across CPUs 1, 2, 4):
  nvme 0000:01:00.0: CPU: 1, COMM: kworker/u1025:2, cost: 35399608ns
  nvme 0000:02:00.0: CPU: 2, COMM: kworker/u1025:3, cost: 35156157ns
  nvme 0000:03:00.0: CPU: 4, COMM: kworker/u1025:0, cost: 35322116ns

The improvement grows with the number of PCI devices on a node,
because previously every additional probe had to contend for
the same CPU.

Signed-off-by: Jinhui Guo <guojinhui.liam@...edance.com>
---
 drivers/pci/pci-driver.c | 116 +++------------------------------------
 include/linux/pci.h      |   4 --
 kernel/sched/isolation.c |   2 -
 3 files changed, 8 insertions(+), 114 deletions(-)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 6b80400ee9b9..258f16da6550 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -296,17 +296,9 @@ static struct attribute *pci_drv_attrs[] = {
 };
 ATTRIBUTE_GROUPS(pci_drv);
 
-struct drv_dev_and_id {
-	struct pci_driver *drv;
-	struct pci_dev *dev;
-	const struct pci_device_id *id;
-};
-
-static int local_pci_probe(struct drv_dev_and_id *ddi)
+static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
+			  const struct pci_device_id *id)
 {
-	struct pci_dev *pci_dev = ddi->dev;
-	struct pci_driver *pci_drv = ddi->drv;
-	struct device *dev = &pci_dev->dev;
 	int rc;
 
 	/*
@@ -318,113 +310,25 @@ static int local_pci_probe(struct drv_dev_and_id *ddi)
 	 * count, in its probe routine and pm_runtime_get_noresume() in
 	 * its remove routine.
 	 */
-	pm_runtime_get_sync(dev);
-	pci_dev->driver = pci_drv;
-	rc = pci_drv->probe(pci_dev, ddi->id);
+	pm_runtime_get_sync(&dev->dev);
+	dev->driver = drv;
+	rc = drv->probe(dev, id);
 	if (!rc)
 		return rc;
 	if (rc < 0) {
-		pci_dev->driver = NULL;
-		pm_runtime_put_sync(dev);
+		dev->driver = NULL;
+		pm_runtime_put_sync(&dev->dev);
 		return rc;
 	}
 	/*
 	 * Probe function should return < 0 for failure, 0 for success
 	 * Treat values > 0 as success, but warn.
 	 */
-	pci_warn(pci_dev, "Driver probe function unexpectedly returned %d\n",
+	pci_warn(dev, "Driver probe function unexpectedly returned %d\n",
 		 rc);
 	return 0;
 }
 
-static struct workqueue_struct *pci_probe_wq;
-
-struct pci_probe_arg {
-	struct drv_dev_and_id *ddi;
-	struct work_struct work;
-	int ret;
-};
-
-static void local_pci_probe_callback(struct work_struct *work)
-{
-	struct pci_probe_arg *arg = container_of(work, struct pci_probe_arg, work);
-
-	arg->ret = local_pci_probe(arg->ddi);
-}
-
-static bool pci_physfn_is_probed(struct pci_dev *dev)
-{
-#ifdef CONFIG_PCI_IOV
-	return dev->is_virtfn && dev->physfn->is_probed;
-#else
-	return false;
-#endif
-}
-
-static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
-			  const struct pci_device_id *id)
-{
-	int error, node, cpu;
-	struct drv_dev_and_id ddi = { drv, dev, id };
-
-	/*
-	 * Execute driver initialization on node where the device is
-	 * attached.  This way the driver likely allocates its local memory
-	 * on the right node.
-	 */
-	node = dev_to_node(&dev->dev);
-	dev->is_probed = 1;
-
-	cpu_hotplug_disable();
-	/*
-	 * Prevent nesting work_on_cpu() for the case where a Virtual Function
-	 * device is probed from work_on_cpu() of the Physical device.
-	 */
-	if (node < 0 || node >= MAX_NUMNODES || !node_online(node) ||
-	    pci_physfn_is_probed(dev)) {
-		error = local_pci_probe(&ddi);
-	} else {
-		struct pci_probe_arg arg = { .ddi = &ddi };
-
-		INIT_WORK_ONSTACK(&arg.work, local_pci_probe_callback);
-		/*
-		 * The target election and the enqueue of the work must be within
-		 * the same RCU read side section so that when the workqueue pool
-		 * is flushed after a housekeeping cpumask update, further readers
-		 * are guaranteed to queue the probing work to the appropriate
-		 * targets.
-		 */
-		rcu_read_lock();
-		cpu = cpumask_any_and(cpumask_of_node(node),
-				      housekeeping_cpumask(HK_TYPE_DOMAIN));
-
-		if (cpu < nr_cpu_ids) {
-			struct workqueue_struct *wq = pci_probe_wq;
-
-			if (WARN_ON_ONCE(!wq))
-				wq = system_percpu_wq;
-			queue_work_on(cpu, wq, &arg.work);
-			rcu_read_unlock();
-			flush_work(&arg.work);
-			error = arg.ret;
-		} else {
-			rcu_read_unlock();
-			error = local_pci_probe(&ddi);
-		}
-
-		destroy_work_on_stack(&arg.work);
-	}
-
-	dev->is_probed = 0;
-	cpu_hotplug_enable();
-	return error;
-}
-
-void pci_probe_flush_workqueue(void)
-{
-	flush_workqueue(pci_probe_wq);
-}
-
 /**
  * __pci_device_probe - check if a driver wants to claim a specific PCI device
  * @drv: driver to call to check if it wants the PCI device
@@ -1734,10 +1638,6 @@ static int __init pci_driver_init(void)
 {
 	int ret;
 
-	pci_probe_wq = alloc_workqueue("sync_wq", WQ_PERCPU, 0);
-	if (!pci_probe_wq)
-		return -ENOMEM;
-
 	ret = bus_register(&pci_bus_type);
 	if (ret)
 		return ret;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7e36936bb37a..ae05faa105e2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -486,7 +486,6 @@ struct pci_dev {
 	unsigned int	io_window_1k:1;		/* Intel bridge 1K I/O windows */
 	unsigned int	irq_managed:1;
 	unsigned int	non_compliant_bars:1;	/* Broken BARs; ignore them */
-	unsigned int	is_probed:1;		/* Device probing in progress */
 	unsigned int	link_active_reporting:1;/* Device capable of reporting link active */
 	unsigned int	no_vf_scan:1;		/* Don't scan for VFs after IOV enablement */
 	unsigned int	no_command_memory:1;	/* No PCI_COMMAND_MEMORY */
@@ -1211,7 +1210,6 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
 				    struct pci_ops *ops, void *sysdata,
 				    struct list_head *resources);
 int pci_host_probe(struct pci_host_bridge *bridge);
-void pci_probe_flush_workqueue(void);
 int pci_bus_insert_busn_res(struct pci_bus *b, int bus, int busmax);
 int pci_bus_update_busn_res_end(struct pci_bus *b, int busmax);
 void pci_bus_release_busn_res(struct pci_bus *b);
@@ -2085,8 +2083,6 @@ static inline int pci_has_flag(int flag) { return 0; }
 _PCI_NOP_ALL(read, *)
 _PCI_NOP_ALL(write,)
 
-static inline void pci_probe_flush_workqueue(void) { }
-
 static inline struct pci_dev *pci_get_device(unsigned int vendor,
 					     unsigned int device,
 					     struct pci_dev *from)
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index ef152d401fe2..3d28d8163ee4 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,7 +8,6 @@
  *
  */
 #include <linux/sched/isolation.h>
-#include <linux/pci.h>
 #include "sched.h"
 
 enum hk_flags {
@@ -144,7 +143,6 @@ int housekeeping_update(struct cpumask *isol_mask)
 
 	synchronize_rcu();
 
-	pci_probe_flush_workqueue();
 	mem_cgroup_flush_workqueue();
 	vmstat_flush_workqueue();
 
-- 
2.20.1