Message-ID: <9b4ce0df-1fbf-4052-9eb9-1f3d6ad6a685@I-love.SAKURA.ne.jp>
Date: Mon, 15 Dec 2025 23:09:54 +0900
From: Tetsuo Handa <penguin-kernel@...ove.SAKURA.ne.jp>
To: Jason Gunthorpe <jgg@...pe.ca>, Leon Romanovsky <leon@...nel.org>,
Majd Dibbiny <majd@...lanox.com>, Doug Ledford <dledford@...hat.com>,
Yuval Shaia <yuval.shaia@...cle.com>
Cc: Bernard Metzler <bernard.metzler@...ux.dev>,
OFED mailing list <linux-rdma@...r.kernel.org>,
Network Development <netdev@...r.kernel.org>
Subject: Re: [not-yet-signed PATCH] RDMA/core: flush gid_cache_wq WQ from
disable_device()
On 2025/12/11 22:24, Tetsuo Handa wrote:
> Since a reproducer for this bug is not available, I haven't verified
> whether this is the bug syzbot is currently reporting at
> https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 .
> But I'd like to add Reported-by: syzbot if netdevice_event_work_handler()
> is supposed to be called to release the GID entry upon a NETDEV_UNREGISTER
> event. Thus, please review this change.
Using simple atomic_t counters, I can observe that netdevice_event() work
items are sometimes still pending immediately before the DEVICE_REGISTERED
flag is cleared. That is, clearing the DEVICE_REGISTERED flag without
flushing the pending netdevice_event() works means that some netdev events
are never processed. (A rough sketch of this instrumentation follows.)
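
The instrumentation was essentially the following. This is only a sketch;
the counter name and the exact probe points are illustrative and are not
part of the diff below.

/* Illustrative counter for observing pending netdevice_event() works. */
static atomic_t pending_netdev_works = ATOMIC_INIT(0);

/* In netdevice_queue_work(), just before queueing the work: */
	atomic_inc(&pending_netdev_works);
	queue_work(gid_cache_wq, &ndev_work->work);

/* At the end of netdevice_event_work_handler(): */
	atomic_dec(&pending_netdev_works);

/* In disable_device(), right before xa_clear_mark(..., DEVICE_REGISTERED): */
	pr_info("pending netdevice_event() works = %d\n",
		atomic_read(&pending_netdev_works));
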
I considered resolving the DEVICE_REGISTERED devices inside netdevice_event()
itself (i.e., enumerating the registered devices in the notifier rather than
in the deferred work) and then flushing the pending netdevice_event() works
after clearing the DEVICE_REGISTERED flag (diff shown below). But I
immediately hit a circular locking dependency just by running the
"rdma link add siw0 type siw netdev lo" command. Therefore, I guess that the
reason the RDMA code defers netdevice_event() handling to WQ context is to
avoid such a circular locking dependency. But I also guess that, due to the
lack of a reliable flushing mechanism when the DEVICE_REGISTERED flag is
cleared, the operations that delete a GID entry are sometimes never invoked,
which would explain the refcount leak syzbot is reporting...
drivers/infiniband/core/core_priv.h | 5 +++++
drivers/infiniband/core/device.c | 12 ++++++++++++
drivers/infiniband/core/roce_gid_mgmt.c | 45 ++++++++++++++++++++++++++-------------------
3 files changed, 43 insertions(+), 19 deletions(-)
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 05102769a918..96ccfeb85547 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -99,6 +99,11 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
void *filter_cookie,
roce_netdev_callback cb,
void *cookie);
+extern struct workqueue_struct *gid_cache_wq;
+struct netdev_event_work_cmd;
+void roce_reserve_netdev_callback(struct ib_device *ib_dev, struct netdev_event_work_cmd *cmds,
+ struct net_device *ndev);
+void ib_reserve_enum_all_roce_netdevs(struct netdev_event_work_cmd *cmds, struct net_device *ndev);
typedef int (*nldev_callback)(struct ib_device *device,
struct sk_buff *skb,
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 13e8a1714bbd..1817a6d207d1 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1303,6 +1303,7 @@ static void disable_device(struct ib_device *device)
down_write(&devices_rwsem);
xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
up_write(&devices_rwsem);
+ flush_workqueue(gid_cache_wq);
/*
* Remove clients in LIFO order, see assign_client_id. This could be
@@ -2446,6 +2447,17 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
up_read(&devices_rwsem);
}
+void ib_reserve_enum_all_roce_netdevs(struct netdev_event_work_cmd *cmds, struct net_device *ndev)
+{
+ struct ib_device *dev;
+ unsigned long index;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked(&devices, index, dev, DEVICE_REGISTERED)
+ roce_reserve_netdev_callback(dev, cmds, ndev);
+ up_read(&devices_rwsem);
+}
+
/*
* ib_enum_all_devs - enumerate all ib_devices
* @cb: Callback to call for each found ib_device
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index a9f2c6b1b29e..371f3bc564eb 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -42,7 +42,7 @@
#include <rdma/ib_cache.h>
#include <rdma/ib_addr.h>
-static struct workqueue_struct *gid_cache_wq;
+struct workqueue_struct *gid_cache_wq;
enum gid_op_type {
GID_DEL = 0,
@@ -69,6 +69,12 @@ struct netdev_event_work {
struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
};
+struct netdev_event_work2 {
+ struct work_struct work;
+ struct ib_device *ib_dev;
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
static const struct {
bool (*is_supported)(const struct ib_device *device, u32 port_num);
enum ib_gid_type gid_type;
@@ -633,39 +639,41 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port,
}
}
-/* The following functions operate on all IB devices. netdevice_event and
- * addr_event execute ib_enum_all_roce_netdevs through a work.
+/* The following functions operate on all IB devices.
+ * netdevice_event executes ib_enum_roce_netdev through netdev_event_work2.
+ * addr_event executes ib_enum_all_roce_netdevs through update_gid_event_work.
* ib_enum_all_roce_netdevs iterates through all IB devices.
*/
static void netdevice_event_work_handler(struct work_struct *_work)
{
- struct netdev_event_work *work =
- container_of(_work, struct netdev_event_work, work);
+ struct netdev_event_work2 *work =
+ container_of(_work, struct netdev_event_work2, work);
unsigned int i;
for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
- ib_enum_all_roce_netdevs(work->cmds[i].filter,
- work->cmds[i].filter_ndev,
- work->cmds[i].cb,
- work->cmds[i].ndev);
+ ib_enum_roce_netdev(work->ib_dev,
+ work->cmds[i].filter,
+ work->cmds[i].filter_ndev,
+ work->cmds[i].cb,
+ work->cmds[i].ndev);
dev_put(work->cmds[i].ndev);
dev_put(work->cmds[i].filter_ndev);
}
+ ib_device_put(work->ib_dev); /* Acquired by roce_reserve_netdev_callback(). */
kfree(work);
}
-static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
- struct net_device *ndev)
+void roce_reserve_netdev_callback(struct ib_device *ib_dev, struct netdev_event_work_cmd *cmds,
+ struct net_device *ndev)
{
unsigned int i;
- struct netdev_event_work *ndev_work =
- kmalloc(sizeof(*ndev_work), GFP_KERNEL);
-
- if (!ndev_work)
- return NOTIFY_DONE;
+ struct netdev_event_work2 *ndev_work =
+ kmalloc(sizeof(*ndev_work), GFP_KERNEL | __GFP_NOFAIL);
+ refcount_inc(&ib_dev->refcount); /* Dropped by netdevice_event_work_handler(). */
+ ndev_work->ib_dev = ib_dev;
memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
if (!ndev_work->cmds[i].ndev)
@@ -678,8 +686,6 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
queue_work(gid_cache_wq, &ndev_work->work);
-
- return NOTIFY_DONE;
}
static const struct netdev_event_work_cmd add_cmd = {
@@ -820,7 +826,8 @@ static int netdevice_event(struct notifier_block *this, unsigned long event,
return NOTIFY_DONE;
}
- return netdevice_queue_work(cmds, ndev);
+ ib_reserve_enum_all_roce_netdevs(cmds, ndev);
+ return NOTIFY_DONE;
}
static void update_gid_event_work_handler(struct work_struct *_work)
[ T1228] SoftiWARP attached
[ T1222] lo speed is unknown, defaulting to 1000
[ T1222] lo speed is unknown, defaulting to 1000
[ T1222] lo speed is unknown, defaulting to 1000
[ T1222] ======================================================
[ T1222] WARNING: possible circular locking dependency detected
[ T1222] 6.19.0-rc1-dirty #232 Not tainted
[ T1222] ------------------------------------------------------
[ T1222] rdma/1222 is trying to acquire lock:
[ T1222] ffffffffba281a28 (rtnl_mutex){+.+.}-{4:4}, at: ib_get_eth_speed+0x7a/0x360 [ib_core]
[ T1222]
but task is already holding lock:
[ T1222] ffff88d54bd34fa8 (&device->compat_devs_mutex){+.+.}-{4:4}, at: add_one_compat_dev+0x72/0x380 [ib_core]
[ T1222]
which lock already depends on the new lock.
[ T1222]
the existing dependency chain (in reverse order) is:
[ T1222]
-> #3 (&device->compat_devs_mutex){+.+.}-{4:4}:
[ T1222] __lock_acquire+0x56d/0xbe0
[ T1222] lock_acquire.part.0+0x78/0x1c0
[ T1222] __mutex_lock+0xc7/0x10b0
[ T1222] add_one_compat_dev+0x72/0x380 [ib_core]
[ T1222] enable_device_and_get+0x1a4/0x200 [ib_core]
[ T1222] ib_register_device+0xf3/0x260 [ib_core]
[ T1222] siw_newlink+0xa4/0x140 [siw]
[ T1222] nldev_newlink+0x1d9/0x300 [ib_core]
[ T1222] rdma_nl_rcv_msg+0x12f/0x2f0 [ib_core]
[ T1222] rdma_nl_rcv_skb.constprop.0.isra.0+0xb2/0x100 [ib_core]
[ T1222] netlink_unicast+0x203/0x2e0
[ T1222] netlink_sendmsg+0x1f8/0x420
[ T1222] sock_sendmsg_nosec+0x81/0x90
[ T1222] __sys_sendto+0x125/0x180
[ T1222] __x64_sys_sendto+0x24/0x30
[ T1222] do_syscall_64+0x98/0x3c0
[ T1222] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ T1222]
-> #2 (rdma_nets_rwsem){.+.+}-{4:4}:
[ T1222] __lock_acquire+0x56d/0xbe0
[ T1222] lock_acquire.part.0+0x78/0x1c0
[ T1222] down_read+0x31/0x150
[ T1222] enable_device_and_get+0x147/0x200 [ib_core]
[ T1222] ib_register_device+0xf3/0x260 [ib_core]
[ T1222] siw_newlink+0xa4/0x140 [siw]
[ T1222] nldev_newlink+0x1d9/0x300 [ib_core]
[ T1222] rdma_nl_rcv_msg+0x12f/0x2f0 [ib_core]
[ T1222] rdma_nl_rcv_skb.constprop.0.isra.0+0xb2/0x100 [ib_core]
[ T1222] netlink_unicast+0x203/0x2e0
[ T1222] netlink_sendmsg+0x1f8/0x420
[ T1222] sock_sendmsg_nosec+0x81/0x90
[ T1222] __sys_sendto+0x125/0x180
[ T1222] __x64_sys_sendto+0x24/0x30
[ T1222] do_syscall_64+0x98/0x3c0
[ T1222] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ T1222]
-> #1 (devices_rwsem){++++}-{4:4}:
[ T1222] __lock_acquire+0x56d/0xbe0
[ T1222] lock_acquire.part.0+0x78/0x1c0
[ T1222] down_read+0x31/0x150
[ T1222] ib_reserve_enum_all_roce_netdevs+0x36/0xc0 [ib_core]
[ T1222] netdevice_event+0x114/0x240 [ib_core]
[ T1222] call_netdevice_register_net_notifiers+0x79/0x1b0
[ T1222] register_netdevice_notifier+0x8e/0x130
[ T1222] 0xffffffffc08992a4
[ T1222] 0xffffffffc089918f
[ T1222] do_one_initcall+0x70/0x380
[ T1222] do_init_module+0x84/0x260
[ T1222] init_module_from_file+0xd3/0xf0
[ T1222] idempotent_init_module+0x11a/0x310
[ T1222] __x64_sys_finit_module+0x71/0xe0
[ T1222] do_syscall_64+0x98/0x3c0
[ T1222] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ T1222]
-> #0 (rtnl_mutex){+.+.}-{4:4}:
[ T1222] check_prev_add+0xe1/0xca0
[ T1222] validate_chain+0x52c/0x7e0
[ T1222] __lock_acquire+0x56d/0xbe0
[ T1222] lock_acquire.part.0+0x78/0x1c0
[ T1222] __mutex_lock+0xc7/0x10b0
[ T1222] ib_get_eth_speed+0x7a/0x360 [ib_core]
[ T1222] siw_query_port+0x4b/0x190 [siw]
[ T1222] ib_setup_port_attrs+0x99/0x250 [ib_core]
[ T1222] add_one_compat_dev+0x286/0x380 [ib_core]
[ T1222] enable_device_and_get+0x1a4/0x200 [ib_core]
[ T1222] ib_register_device+0xf3/0x260 [ib_core]
[ T1222] siw_newlink+0xa4/0x140 [siw]
[ T1222] nldev_newlink+0x1d9/0x300 [ib_core]
[ T1222] rdma_nl_rcv_msg+0x12f/0x2f0 [ib_core]
[ T1222] rdma_nl_rcv_skb.constprop.0.isra.0+0xb2/0x100 [ib_core]
[ T1222] netlink_unicast+0x203/0x2e0
[ T1222] netlink_sendmsg+0x1f8/0x420
[ T1222] sock_sendmsg_nosec+0x81/0x90
[ T1222] __sys_sendto+0x125/0x180
[ T1222] __x64_sys_sendto+0x24/0x30
[ T1222] do_syscall_64+0x98/0x3c0
[ T1222] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ T1222]
other info that might help us debug this:
[ T1222] Chain exists of:
rtnl_mutex --> rdma_nets_rwsem --> &device->compat_devs_mutex
[ T1222] Possible unsafe locking scenario:
[ T1222]        CPU0                    CPU1
[ T1222]        ----                    ----
[ T1222]   lock(&device->compat_devs_mutex);
[ T1222]                                lock(rdma_nets_rwsem);
[ T1222]                                lock(&device->compat_devs_mutex);
[ T1222]   lock(rtnl_mutex);
[ T1222]
*** DEADLOCK ***
[ T1222] 5 locks held by rdma/1222:
[ T1222] #0: ffffffffc0ae1b18 (&rdma_nl_types[idx].sem){.+.+}-{4:4}, at: rdma_nl_rcv_msg+0x9e/0x2f0 [ib_core]
[ T1222] #1: ffffffffc0ae8a30 (link_ops_rwsem){++++}-{4:4}, at: nldev_newlink+0x278/0x300 [ib_core]
[ T1222] #2: ffffffffc0ae3e50 (devices_rwsem){++++}-{4:4}, at: enable_device_and_get+0x5c/0x200 [ib_core]
[ T1222] #3: ffffffffc0ae3c50 (rdma_nets_rwsem){.+.+}-{4:4}, at: enable_device_and_get+0x147/0x200 [ib_core]
[ T1222] #4: ffff88d54bd34fa8 (&device->compat_devs_mutex){+.+.}-{4:4}, at: add_one_compat_dev+0x72/0x380 [ib_core]
[ T1222]
stack backtrace:
[ T1222] CPU: 5 UID: 0 PID: 1222 Comm: rdma Not tainted 6.19.0-rc1-dirty #232 PREEMPT(voluntary)
[ T1222] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020
[ T1222] Call Trace:
[ T1222] <TASK>
[ T1222] dump_stack_lvl+0x6e/0xa0
[ T1222] print_circular_bug.cold+0x38/0x46
[ T1222] check_noncircular+0x148/0x170
[ T1222] check_prev_add+0xe1/0xca0
[ T1222] ? is_bpf_text_address+0x6e/0x100
[ T1222] ? kernel_text_address+0x120/0x130
[ T1222] validate_chain+0x52c/0x7e0
[ T1222] __lock_acquire+0x56d/0xbe0
[ T1222] lock_acquire.part.0+0x78/0x1c0
[ T1222] ? ib_get_eth_speed+0x7a/0x360 [ib_core]
[ T1222] __mutex_lock+0xc7/0x10b0
[ T1222] ? ib_get_eth_speed+0x7a/0x360 [ib_core]
[ T1222] ? find_held_lock+0x2b/0x80
[ T1222] ? ib_get_eth_speed+0x7a/0x360 [ib_core]
[ T1222] ? ib_get_eth_speed+0x7a/0x360 [ib_core]
[ T1222] ib_get_eth_speed+0x7a/0x360 [ib_core]
[ T1222] ? netlink_sendmsg+0x1f8/0x420
[ T1222] siw_query_port+0x4b/0x190 [siw]
[ T1222] ib_setup_port_attrs+0x99/0x250 [ib_core]
[ T1222] add_one_compat_dev+0x286/0x380 [ib_core]
[ T1222] enable_device_and_get+0x1a4/0x200 [ib_core]
[ T1222] ib_register_device+0xf3/0x260 [ib_core]
[ T1222] siw_newlink+0xa4/0x140 [siw]
[ T1222] nldev_newlink+0x1d9/0x300 [ib_core]
[ T1222] rdma_nl_rcv_msg+0x12f/0x2f0 [ib_core]
[ T1222] ? __lock_acquire+0x56d/0xbe0
[ T1222] rdma_nl_rcv_skb.constprop.0.isra.0+0xb2/0x100 [ib_core]
[ T1222] netlink_unicast+0x203/0x2e0
[ T1222] netlink_sendmsg+0x1f8/0x420
[ T1222] sock_sendmsg_nosec+0x81/0x90
[ T1222] __sys_sendto+0x125/0x180
[ T1222] __x64_sys_sendto+0x24/0x30
[ T1222] do_syscall_64+0x98/0x3c0
[ T1222] ? switch_fpu_return+0xd6/0x100
[ T1222] ? do_syscall_64+0x16d/0x3c0
[ T1222] ? lockdep_hardirqs_on_prepare.part.0+0x9b/0x140
[ T1222] ? irqentry_exit+0x8c/0x5b0
[ T1222] ? trace_hardirqs_off+0x44/0xa0
[ T1222] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ T1222] RIP: 0033:0x7f38bc63d77e
[ T1222] Code: 4d 89 d8 e8 d4 bc 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 <c9> c3 83 e2 39 83 fa 08 75 e7 e8 13 ff ff ff 0f 1f 00 f3 0f 1e fa
[ T1222] RSP: 002b:00007ffd972ef0b0 EFLAGS: 00000202 ORIG_RAX: 000000000000002c
[ T1222] RAX: ffffffffffffffda RBX: 00005612aac892d0 RCX: 00007f38bc63d77e
[ T1222] RDX: 000000000000002c RSI: 00005612aac882a0 RDI: 0000000000000004
[ T1222] RBP: 00007ffd972ef0c0 R08: 00007f38bc7d19a0 R09: 000000000000000c
[ T1222] R10: 0000000000000000 R11: 0000000000000202 R12: 00007ffd972ef380
[ T1222] R13: 00007ffd972ef3d8 R14: 00007ffd972f15ba R15: 0000000069401105
[ T1222] </TASK>
[ T1222] lo speed is unknown, defaulting to 1000
[ T1222] lo speed is unknown, defaulting to 1000
[ T1222] lo speed is unknown, defaulting to 1000