[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250326223225.75860-1-kuniyu@amazon.com>
Date: Wed, 26 Mar 2025 15:32:22 -0700
From: Kuniyuki Iwashima <kuniyu@...zon.com>
To: <edumazet@...gle.com>
CC: <davem@...emloft.net>, <horms@...nel.org>, <kuba@...nel.org>,
<kuni1840@...il.com>, <kuniyu@...zon.com>, <netdev@...r.kernel.org>,
<pabeni@...hat.com>, <ychemla@...dia.com>
Subject: Re: [PATCH v5 net 2/3] net: Fix dev_net(dev) race in unregister_netdevice_notifier_dev_net().
From: Eric Dumazet <edumazet@...gle.com>
Date: Wed, 26 Mar 2025 23:10:04 +0100
> On Wed, Mar 26, 2025 at 10:55 PM Kuniyuki Iwashima <kuniyu@...zon.com> wrote:
> >
> > From: Yael Chemla <ychemla@...dia.com>
> > Date: Wed, 26 Mar 2025 15:46:40 +0200
> > > On 17/02/2025 21:11, Kuniyuki Iwashima wrote:
> > > > After the cited commit, dev_net(dev) is fetched before holding RTNL
> > > > and passed to __unregister_netdevice_notifier_net().
> > > >
> > > > However, dev_net(dev) might be different after holding RTNL.
> > > >
> > > > In the reported case [0], while removing a VF device, its netns was
> > > > being dismantled and the VF was moved to init_net.
> > > >
> > > > So the following sequence is basically illegal when dev was fetched
> > > > without lookup:
> > > >
> > > > net = dev_net(dev);
> > > > rtnl_net_lock(net);
> > > >
> > > > Let's use a new helper rtnl_net_dev_lock() to fix the race.
> > > >
> > > > It fetches dev_net_rcu(dev), bumps its net->passive, and checks if
> > > > dev_net_rcu(dev) is changed after rtnl_net_lock().
> > > >
> > > > [0]:
> > > > BUG: KASAN: slab-use-after-free in notifier_call_chain (kernel/notifier.c:75 (discriminator 2))
> > > > Read of size 8 at addr ffff88810cefb4c8 by task test-bridge-lag/21127
> > > > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
> > > > Call Trace:
> > > > <TASK>
> > > > dump_stack_lvl (lib/dump_stack.c:123)
> > > > print_report (mm/kasan/report.c:379 mm/kasan/report.c:489)
> > > > kasan_report (mm/kasan/report.c:604)
> > > > notifier_call_chain (kernel/notifier.c:75 (discriminator 2))
> > > > call_netdevice_notifiers_info (net/core/dev.c:2011)
> > > > unregister_netdevice_many_notify (net/core/dev.c:11551)
> > > > unregister_netdevice_queue (net/core/dev.c:11487)
> > > > unregister_netdev (net/core/dev.c:11635)
> > > > mlx5e_remove (drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6552 drivers/net/ethernet/mellanox/mlx5/core/en_main.c:6579) mlx5_core
> > > > auxiliary_bus_remove (drivers/base/auxiliary.c:230)
> > > > device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296)
> > > > bus_remove_device (./include/linux/kobject.h:193 drivers/base/base.h:73 drivers/base/bus.c:583)
> > > > device_del (drivers/base/power/power.h:142 drivers/base/core.c:3855)
> > > > mlx5_rescan_drivers_locked (./include/linux/auxiliary_bus.h:241 drivers/net/ethernet/mellanox/mlx5/core/dev.c:333 drivers/net/ethernet/mellanox/mlx5/core/dev.c:535 drivers/net/ethernet/mellanox/mlx5/core/dev.c:549) mlx5_core
> > > > mlx5_unregister_device (drivers/net/ethernet/mellanox/mlx5/core/dev.c:468) mlx5_core
> > > > mlx5_uninit_one (./include/linux/instrumented.h:68 ./include/asm-generic/bitops/instrumented-non-atomic.h:141 drivers/net/ethernet/mellanox/mlx5/core/main.c:1563) mlx5_core
> > > > remove_one (drivers/net/ethernet/mellanox/mlx5/core/main.c:965 drivers/net/ethernet/mellanox/mlx5/core/main.c:2019) mlx5_core
> > > > pci_device_remove (./include/linux/pm_runtime.h:129 drivers/pci/pci-driver.c:475)
> > > > device_release_driver_internal (drivers/base/dd.c:1275 drivers/base/dd.c:1296)
> > > > unbind_store (drivers/base/bus.c:245)
> > > > kernfs_fop_write_iter (fs/kernfs/file.c:338)
> > > > vfs_write (fs/read_write.c:587 (discriminator 1) fs/read_write.c:679 (discriminator 1))
> > > > ksys_write (fs/read_write.c:732)
> > > > do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1))
> > > > entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
> > > > RIP: 0033:0x7f6a4d5018b7
> > > >
> > > > Fixes: 7fb1073300a2 ("net: Hold rtnl_net_lock() in (un)?register_netdevice_notifier_dev_net().")
> > > > Reported-by: Yael Chemla <ychemla@...dia.com>
> > > > Closes: https://lore.kernel.org/netdev/146eabfe-123c-4970-901e-e961b4c09bc3@nvidia.com/
> > > > Signed-off-by: Kuniyuki Iwashima <kuniyu@...zon.com>
> > > > Reviewed-by: Eric Dumazet <edumazet@...gle.com>
> > > > ---
> > > > v5:
> > > > * Use do-while loop
> > > >
> > > > v4:
> > > > * Fix build failure when !CONFIG_NET_NS
> > > > * Use net_passive_dec()
> > > >
> > > > v3:
> > > > * Bump net->passive instead of maybe_get_net()
> > > > * Remove msleep(1) loop
> > > > * Use rcu_access_pointer() instead of rcu_read_lock().
> > > >
> > > > v2:
> > > > * Use dev_net_rcu().
> > > > * Use msleep(1) instead of cond_resched() after maybe_get_net()
> > > > * Remove cond_resched() after net_eq() check
> > > >
> > > > v1: https://lore.kernel.org/netdev/20250130232435.43622-2-kuniyu@amazon.com/
> > > > ---
> > > > net/core/dev.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
> > > > 1 file changed, 44 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > > index b91658e8aedb..19e268568282 100644
> > > > --- a/net/core/dev.c
> > > > +++ b/net/core/dev.c
> > > > @@ -2070,6 +2070,42 @@ static void __move_netdevice_notifier_net(struct net *src_net,
> > > > __register_netdevice_notifier_net(dst_net, nb, true);
> > > > }
> > > >
> > > > +static void rtnl_net_dev_lock(struct net_device *dev)
> > > > +{
> > > > + bool again;
> > > > +
> > > > + do {
> > > > + struct net *net;
> > > > +
> > > > + again = false;
> > > > +
> > > > + /* netns might be being dismantled. */
> > > > + rcu_read_lock();
> > > > + net = dev_net_rcu(dev);
> > > > + net_passive_inc(net);
> > >
> > > Hi Kuniyuki,
> > > It seems we are still encountering the previously reorted issue,
> > > even when running with your latest fix. However, the problem has become
> > > less frequent, now requiring multiple test iterations to reproduce.
> >
> > Thanks for reporting!
> >
> >
> > >
> > > 1) we identified the following warnings (each accompanied by a call
> > > trace; only one is detailed below, though others are similar):
> > >
> > > refcount_t: addition on 0; use-after-free.
> > > WARNING: CPU: 6 PID: 1105 at lib/refcount.c:25 refcount_warn_saturate
> > > (/usr/work/linux/lib/refcount.c:25 (discriminator 1))
> > >
> > > and also
> > >
> > > refcount_t: underflow; use-after-free.
> > > WARNING: CPU: 6 PID: 1105 at lib/refcount.c:28 refcount_warn_saturate
> > > (/usr/work/linux/lib/refcount.c:28 (discriminator 1))
> > >
> > >
> > > 2) test scenario:
> > > sets up a network topology of two VFs on different eSwitch, performs
> > > ping tests between them, verifies traffic rules offloading, and cleans
> > > up the environment afterward.
> > >
> > > 3) the warning is triggered upon reaching the
> > > unregister_netdevice_notifier_dev_net when both net->ns.count and
> > > net->passive reference counts are zero.
> >
> > It looks unlikely but I missed there is still a race window.
> >
> > If dev_net_rcu() is called between synchronize_net() and dev_net_set()
> > in netif_change_net_namespace(), there might be no synchronize_rcu()
> > called after that and net_passive_dec() could be called in cleanup_net()
> > earlier than net_passive_inc() ... ?
> >
> > Could you test this ?
> >
> > ---8<---
> > diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> > index bd57d8fb54f1..c275e95c83ab 100644
> > --- a/include/net/net_namespace.h
> > +++ b/include/net/net_namespace.h
> > @@ -337,9 +337,9 @@ static inline void net_passive_dec(struct net *net)
> > }
> > #endif
> >
> > -static inline void net_passive_inc(struct net *net)
> > +static inline bool net_passive_inc(struct net *net)
> > {
> > - refcount_inc(&net->passive);
> > + return refcount_inc_not_zero(&net->passive);
>
> Hmm
>
> We want the two oher net_passive_inc() callers to have refcount safety.
>
> Please add a new net_passive_inc_not_zero()
Sure, will do.
Also, I'll remove the weird do-while {} and do everything under
ifdef CONFIG_NET_NS guard. If it's disabled, just calling
rtnl_net_lock() for &init_net is fine.
---8<---
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bd57d8fb54f1..c91b13e6fb12 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -342,6 +342,11 @@ static inline void net_passive_inc(struct net *net)
refcount_inc(&net->passive);
}
+static inline bool net_passive_inc_not_zero(struct net *net)
+{
+ return refcount_inc_not_zero(&net->passive);
+}
+
/* Returns true if the netns initialization is completed successfully */
static inline bool net_initialized(const struct net *net)
{
diff --git a/net/core/dev.c b/net/core/dev.c
index b597cc27a115..408659d238b8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2077,38 +2077,43 @@ static void __move_netdevice_notifier_net(struct net *src_net,
static void rtnl_net_dev_lock(struct net_device *dev)
{
- bool again;
-
- do {
- struct net *net;
-
- again = false;
+#ifdef CONFIG_NET_NS
+ struct net *net;
- /* netns might be being dismantled. */
- rcu_read_lock();
- net = dev_net_rcu(dev);
- net_passive_inc(net);
+again:
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ if (!net_passive_inc_not_zero(net)) {
rcu_read_unlock();
+ msleep(1);
+ goto again;
+ }
+ rcu_read_unlock();
- rtnl_net_lock(net);
+ rtnl_net_lock(net);
-#ifdef CONFIG_NET_NS
- /* dev might have been moved to another netns. */
- if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
- rtnl_net_unlock(net);
- net_passive_dec(net);
- again = true;
- }
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ goto again;
+ }
+#else
+ rtnl_net_lock(&init_net);
#endif
- } while (again);
}
static void rtnl_net_dev_unlock(struct net_device *dev)
{
+#ifdef CONFIG_NET_NS
struct net *net = dev_net(dev);
rtnl_net_unlock(net);
net_passive_dec(net);
+#else
+ rtnl_net_unlock(&init_net);
+#endif
}
int register_netdevice_notifier_dev_net(struct net_device *dev,
---8<---
Powered by blists - more mailing lists