[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20250212060905.14400-1-kuniyu@amazon.com>
Date: Wed, 12 Feb 2025 15:09:05 +0900
From: Kuniyuki Iwashima <kuniyu@...zon.com>
To: <edumazet@...gle.com>
CC: <davem@...emloft.net>, <horms@...nel.org>, <kuba@...nel.org>,
<kuni1840@...il.com>, <kuniyu@...zon.com>, <netdev@...r.kernel.org>,
<pabeni@...hat.com>, <ychemla@...dia.com>
Subject: Re: [PATCH v3 net 1/2] net: Fix dev_net(dev) race in unregister_netdevice_notifier_dev_net().
From: Eric Dumazet <edumazet@...gle.com>
Date: Tue, 11 Feb 2025 10:43:30 +0100
> On Tue, Feb 11, 2025 at 6:13 AM Kuniyuki Iwashima <kuniyu@...zon.com> wrote:
> >
> > After the cited commit, dev_net(dev) is fetched before holding RTNL
> > and passed to __unregister_netdevice_notifier_net().
> >
> > However, dev_net(dev) might be different after holding RTNL.
> >
> > In the reported case [0], while removing a VF device, its netns was
> > being dismantled and the VF was moved to init_net.
> >
> > So the following sequence is basically illegal when dev was fetched
> > without lookup:
> >
> > net = dev_net(dev);
> > rtnl_net_lock(net);
> >
> > Let's use a new helper rtnl_net_dev_lock() to fix the race.
> >
> > It fetches dev_net_rcu(dev), bumps its net->passive, and checks whether
> > the device's netns has changed after rtnl_net_lock().
> >
> >
>
> > Fixes: 7fb1073300a2 ("net: Hold rtnl_net_lock() in (un)?register_netdevice_notifier_dev_net().")
> > Reported-by: Yael Chemla <ychemla@...dia.com>
> > Closes: https://lore.kernel.org/netdev/146eabfe-123c-4970-901e-e961b4c09bc3@nvidia.com/
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@...zon.com>
> > ---
> > v3:
> > * Bump net->passive instead of maybe_get_net()
> > * Remove msleep(1) loop
> > * Use rcu_access_pointer() instead of rcu_read_lock().
> >
> > v2:
> > * Use dev_net_rcu().
> > * Use msleep(1) instead of cond_resched() after maybe_get_net()
> > * Remove cond_resched() after net_eq() check
> >
> > v1: https://lore.kernel.org/netdev/20250130232435.43622-2-kuniyu@amazon.com/
> > ---
> > net/core/dev.c | 41 +++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 37 insertions(+), 4 deletions(-)
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 55e356a68db6..1248fb368e78 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -2070,6 +2070,35 @@ static void __move_netdevice_notifier_net(struct net *src_net,
> > __register_netdevice_notifier_net(dst_net, nb, true);
> > }
> >
> > +static void rtnl_net_dev_lock(struct net_device *dev)
> > +{
> > + struct net *net;
> > +
>
> #ifdef CONFIG_NET_NS
> > +again:
> #endif
>
> > + /* netns might be being dismantled. */
> > + rcu_read_lock();
> > + net = dev_net_rcu(dev);
> > + refcount_inc(&net->passive);
> > + rcu_read_unlock();
> > +
> > + rtnl_net_lock(net);
> > +
>
> #ifdef CONFIG_NET_NS
>
> > + /* dev might have been moved to another netns. */
> > + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
> > + rtnl_net_unlock(net);
> > + net_drop_ns(net);
> > + goto again;
> > + }
>
> #endif
>
> Or perhaps not use net_drop_ns() and rename/export net_free() to
> net_passive_dec() ?
Ah, we need both the #ifdef CONFIG_NET_NS guard (for dev->nd_net.net) and
net_passive_dec().
Or, we could simply call rtnl_net_lock(&init_net) for !CONFIG_NET_NS and
keep net_drop_ns().
The former looks cleaner, so I'll go with that in v4.
Thanks!
>
>
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 7ba1402ca7796663bed3373b1a0c6a0249cd1599..62d1a1c39547bd5cca71082b8172d453b56a96db 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -297,7 +297,7 @@ static inline int check_net(const struct net *net)
> }
>
> void net_drop_ns(void *);
> -
> +void net_passive_dec(struct net *net);
> #else
>
> static inline struct net *get_net(struct net *net)
> @@ -326,6 +326,11 @@ static inline int check_net(const struct net *net)
> }
>
> #define net_drop_ns NULL
> +static inline void net_passive_dec(struct net *net)
> +{
> + refcount_dec(&net->passive);
> +}
> +
> #endif
>
> /* Returns true if the netns initialization is completed successfully */
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index cb39a12b2f8295c605f08b5589932932150a1644..4303f2a4926243e2c0ff0c0387383cd8e0658019 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -464,7 +464,7 @@ static void net_complete_free(void)
>
> }
>
> -static void net_free(struct net *net)
> +void net_passive_dec(struct net *net)
> {
> if (refcount_dec_and_test(&net->passive)) {
> kfree(rcu_access_pointer(net->gen));
> @@ -482,7 +482,7 @@ void net_drop_ns(void *p)
> struct net *net = (struct net *)p;
>
> if (net)
> - net_free(net);
> + net_passive_dec(net);
> }
>
> struct net *copy_net_ns(unsigned long flags,
> @@ -523,7 +523,7 @@ struct net *copy_net_ns(unsigned long flags,
> key_remove_domain(net->key_domain);
> #endif
> put_user_ns(user_ns);
> - net_free(net);
> + net_passive_dec(net);
> dec_ucounts:
> dec_net_namespaces(ucounts);
> return ERR_PTR(rv);
> @@ -672,7 +672,7 @@ static void cleanup_net(struct work_struct *work)
> key_remove_domain(net->key_domain);
> #endif
> put_user_ns(net->user_ns);
> - net_free(net);
> + net_passive_dec(net);
> }
> cleanup_net_task = NULL;
> }
Powered by blists - more mailing lists