[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANn89iLst-RHUmidAqHxxQAPrH8bJYT+WiFxPU0THJyWWH0ngQ@mail.gmail.com>
Date: Tue, 18 Mar 2025 19:55:33 +0100
From: Eric Dumazet <edumazet@...gle.com>
To: Paolo Abeni <pabeni@...hat.com>
Cc: netdev@...r.kernel.org, Andrew Lunn <andrew+netdev@...n.ch>,
"David S. Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, Simon Horman <horms@...nel.org>,
Sabrina Dubroca <sd@...asysnail.net>
Subject: Re: [PATCH v2] net: introduce per netns packet chains
On Tue, Mar 18, 2025 at 7:03 PM Paolo Abeni <pabeni@...hat.com> wrote:
>
> Currently network taps unbound to any interface are linked in the
> global ptype_all list, affecting the performance in all the network
> namespaces.
>
> Add per netns ptypes chains, so that in the mentioned case only
> the netns owning the packet socket(s) is affected.
>
> While at that drop the global ptype_all list: no in kernel user
> registers a tap on "any" type without specifying either the target
> device or the target namespace (and IMHO doing that would not make
> any sense).
>
> Note that this adds a conditional in the fast path (to check for
> per netns ptype_specific list) and increases the dataset size by
> a cacheline (owing the per netns lists).
>
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>
> ---
> v1 -> v2:
> - fix comment typo
> - drop the doubtful RCU optimization
>
> rfc -> v1
> - fix procfs dump
> - fix dev->ptype_specific -> dev->ptype_all type in ptype_head()
> - dev_net() -> dev_net_rcu
> - add dev_nit_active_rcu variant
> - ptype specific netns deliver uses dev_net_rcu(skb->dev)) instead
> of dev_net(orig_dev)
> ---
> include/linux/netdevice.h | 12 ++++++++-
> include/net/hotdata.h | 1 -
> include/net/net_namespace.h | 3 +++
> net/core/dev.c | 52 +++++++++++++++++++++++++++----------
> net/core/hotdata.c | 1 -
> net/core/net-procfs.c | 28 +++++++++++++++-----
> net/core/net_namespace.c | 2 ++
> 7 files changed, 76 insertions(+), 23 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 67527243459b3..c51a99f24800d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -4276,7 +4276,17 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev,
> return 0;
> }
>
> -bool dev_nit_active(struct net_device *dev);
> +bool dev_nit_active_rcu(struct net_device *dev);
> +static inline bool dev_nit_active(struct net_device *dev)
> +{
> + bool ret;
> +
> + rcu_read_lock();
> + ret = dev_nit_active_rcu(dev);
> + rcu_read_unlock();
> + return ret;
> +}
> +
> void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
>
> static inline void __dev_put(struct net_device *dev)
> diff --git a/include/net/hotdata.h b/include/net/hotdata.h
> index 30e9570beb2af..fda94b2647ffa 100644
> --- a/include/net/hotdata.h
> +++ b/include/net/hotdata.h
> @@ -23,7 +23,6 @@ struct net_hotdata {
> struct net_offload udpv6_offload;
> #endif
> struct list_head offload_base;
> - struct list_head ptype_all;
> struct kmem_cache *skbuff_cache;
> struct kmem_cache *skbuff_fclone_cache;
> struct kmem_cache *skb_small_head_cache;
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index f467a66abc6b1..bd57d8fb54f14 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -83,6 +83,9 @@ struct net {
> struct llist_node defer_free_list;
> struct llist_node cleanup_list; /* namespaces on death row */
>
> + struct list_head ptype_all;
> + struct list_head ptype_specific;
> +
> #ifdef CONFIG_KEYS
> struct key_tag *key_domain; /* Key domain of operation tag */
> #endif
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 6fa6ed5b57987..99ce4a3526e54 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -572,11 +572,19 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
>
> static inline struct list_head *ptype_head(const struct packet_type *pt)
> {
> - if (pt->type == htons(ETH_P_ALL))
> - return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
> - else
> - return pt->dev ? &pt->dev->ptype_specific :
> - &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
> + if (pt->type == htons(ETH_P_ALL)) {
> + if (!pt->af_packet_net && !pt->dev)
> + return NULL;
> +
> + return pt->dev ? &pt->dev->ptype_all :
> + &pt->af_packet_net->ptype_all;
> + }
> +
> + if (pt->dev)
> + return &pt->dev->ptype_specific;
> +
> + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
> + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
> }
>
> /**
> @@ -596,6 +604,9 @@ void dev_add_pack(struct packet_type *pt)
> {
> struct list_head *head = ptype_head(pt);
>
> + if (WARN_ON_ONCE(!head))
> + return;
> +
> spin_lock(&ptype_lock);
> list_add_rcu(&pt->list, head);
> spin_unlock(&ptype_lock);
> @@ -620,6 +631,9 @@ void __dev_remove_pack(struct packet_type *pt)
> struct list_head *head = ptype_head(pt);
> struct packet_type *pt1;
>
> + if (!head)
> + return;
> +
> spin_lock(&ptype_lock);
>
> list_for_each_entry(pt1, head, list) {
> @@ -2463,16 +2477,18 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
> }
>
> /**
> - * dev_nit_active - return true if any network interface taps are in use
> + * dev_nit_active_rcu - return true if any network interface taps are in use
> + *
> + * The caller must hold the RCU lock
> *
> * @dev: network device to check for the presence of taps
> */
> -bool dev_nit_active(struct net_device *dev)
> +bool dev_nit_active_rcu(struct net_device *dev)
> {
> - return !list_empty(&net_hotdata.ptype_all) ||
> + return !list_empty(&dev_net_rcu(dev)->ptype_all) ||
> !list_empty(&dev->ptype_all);
> }
> -EXPORT_SYMBOL_GPL(dev_nit_active);
> +EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
>
> /*
> * Support routine. Sends outgoing frames to any network
> @@ -2481,11 +2497,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
>
> void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
> {
> - struct list_head *ptype_list = &net_hotdata.ptype_all;
> struct packet_type *ptype, *pt_prev = NULL;
> + struct list_head *ptype_list;
> struct sk_buff *skb2 = NULL;
>
> rcu_read_lock();
> + ptype_list = &dev_net_rcu(dev)->ptype_all;
> again:
> list_for_each_entry_rcu(ptype, ptype_list, list) {
> if (READ_ONCE(ptype->ignore_outgoing))
> @@ -2529,7 +2546,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
> pt_prev = ptype;
> }
>
> - if (ptype_list == &net_hotdata.ptype_all) {
> + if (ptype_list == &dev_net_rcu(dev)->ptype_all) {
Using the following should be faster, generated code will be shorter.
if (ptype_list != &dev->ptype_all)
> ptype_list = &dev->ptype_all;
> goto again;
> }
> @@ -3774,7 +3791,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
> unsigned int len;
> int rc;
>
> - if (dev_nit_active(dev))
> + if (dev_nit_active_rcu(dev))
> dev_queue_xmit_nit(skb, dev);
>
> len = skb->len;
> @@ -5718,7 +5735,8 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
> if (pfmemalloc)
> goto skip_taps;
>
> - list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
> + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
> + list) {
> if (pt_prev)
> ret = deliver_skb(skb, pt_prev, orig_dev);
> pt_prev = ptype;
> @@ -5830,6 +5848,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
> deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
> &ptype_base[ntohs(type) &
> PTYPE_HASH_MASK]);
> +
> + /* The only per net ptype user - packet socket - matches
> + * the target netns vs dev_net(skb->dev); we need to
> + * process only such netns even when orig_dev lays in a
> + * different one.
> + */
> + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
> + &dev_net_rcu(skb->dev)->ptype_specific);
I am banging my head here. I probably need some sleep :)
Powered by blists - more mailing lists