Message-ID: <1c9188a9-2b22-4350-ac99-3a5048ccd440@redhat.com>
Date: Mon, 17 Mar 2025 10:18:45 +0100
From: Paolo Abeni <pabeni@...hat.com>
To: netdev@...r.kernel.org
Cc: Andrew Lunn <andrew+netdev@...n.ch>, "David S. Miller"
<davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, Simon Horman <horms@...nel.org>,
Jonathan Corbet <corbet@....net>, Sabrina Dubroca <sd@...asysnail.net>
Subject: Re: [RFC PATCH 2/2] net: hotdata optimization for netns ptypes
On 3/14/25 2:05 PM, Paolo Abeni wrote:
> Per netns ptype usage is/should be an exception, but the current code
> unconditionally touches the related lists for each RX and TX packet.
>
> Add a per device flag in the hot data net_device section to cache the
> 'netns ptype required' information, and update it according to the
> relevant netns status. The new fields are placed in existing holes,
> moved slightly to fit the relevant cacheline groups.
>
> Be sure to keep the flag up to date when new devices are created and/or
> devices are moved across namespaces, initializing it in list_netdevice().
>
> In the fast path we can skip per-netns list processing when the flag
> is clear.
>
> This avoids touching in the fast path the additional cacheline needed
> by the previous patch.
>
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>
> ---
> This is a little cumbersome for possibly little gain. An alternative
> could be caching even the per-device list status in similar
> flags. Both RX and TX could use a single conditional to completely
> skip all the per dev/netns lists, potentially even moving the per
> device lists out of hotdata.
>
> Side note: despite being unconditionally touched in the fast path on
> both RX and TX, dev->ptype_all is currently not placed in any hotdata
> cacheline group.
I think we could consider non-empty ptype_all/specific lists as slightly
less fast path - packet socket processing will add considerably more
overhead.
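
For context (purely illustrative, not part of this patch): the usual way
these lists become non-empty is an ETH_P_ALL packet socket, e.g. a
minimal userspace sketch along these lines:

/* Creating an AF_PACKET socket with a non-zero protocol ends up in
 * packet_create() -> register_prot_hook() -> dev_add_pack(); with
 * ETH_P_ALL and no bound device the hook lands on the (per netns)
 * ptype_all list, so every RX/TX packet in that namespace goes through
 * the tap delivery path anyway. Needs CAP_NET_RAW.
 */
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	return fd < 0;
}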
If so, we could consolidate the packet type checks into a couple of
read-mostly counters in hotdata, decreasing both the number of
conditionals and the data set size in the fast path. I'll test something
like the following:
---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0dbfe069a6e38..755dc9d9f9f4a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2134,7 +2134,6 @@ struct net_device {
/* RX read-mostly hotpath */
__cacheline_group_begin(net_device_read_rx);
struct bpf_prog __rcu *xdp_prog;
- struct list_head ptype_specific;
int ifindex;
unsigned int real_num_rx_queues;
struct netdev_rx_queue *_rx;
@@ -2174,6 +2173,7 @@ struct net_device {
struct list_head unreg_list;
struct list_head close_list;
struct list_head ptype_all;
+ struct list_head ptype_specific;
struct {
struct list_head upper;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bd57d8fb54f14..41ea0febf2260 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -575,6 +575,26 @@ static inline void fnhe_genid_bump(struct net *net)
atomic_inc(&net->fnhe_genid);
}
+static inline int net_ptype_all_cnt(const struct net *net)
+{
+ return READ_ONCE(net->ipv4.ptype_all_count);
+}
+
+static inline void net_ptype_all_set_cnt(struct net *net, int cnt)
+{
+ WRITE_ONCE(net->ipv4.ptype_all_count, cnt);
+}
+
+static inline int net_ptype_specific_cnt(const struct net *net)
+{
+ return READ_ONCE(net->ipv4.ptype_specific_count);
+}
+
+static inline void net_ptype_specific_set_cnt(struct net *net, int cnt)
+{
+ WRITE_ONCE(net->ipv4.ptype_specific_count, cnt);
+}
+
#ifdef CONFIG_NET
void net_ns_init(void);
#else
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 6373e3f17da84..8df28c3c88929 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -75,6 +75,8 @@ struct netns_ipv4 {
/* TXRX readonly hotpath cache lines */
__cacheline_group_begin(netns_ipv4_read_txrx);
u8 sysctl_tcp_moderate_rcvbuf;
+ /* 2 bytes hole */
+ int ptype_all_count;
__cacheline_group_end(netns_ipv4_read_txrx);
/* RX readonly hotpath cache line */
@@ -82,9 +84,10 @@ struct netns_ipv4 {
u8 sysctl_ip_early_demux;
u8 sysctl_tcp_early_demux;
u8 sysctl_tcp_l3mdev_accept;
- /* 3 bytes hole, try to pack */
+ /* 1 byte hole, try to pack */
int sysctl_tcp_reordering;
int sysctl_tcp_rmem[3];
+ int ptype_specific_count;
__cacheline_group_end(netns_ipv4_read_rx);
struct inet_timewait_death_row tcp_death_row;
diff --git a/net/core/dev.c b/net/core/dev.c
index ad1853da0a4b5..51cce0a164937 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -603,11 +603,18 @@ static inline struct list_head *ptype_head(const struct packet_type *pt)
void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
+ struct net *net;
if (WARN_ON_ONCE(!head))
return;
spin_lock(&ptype_lock);
+ net = pt->dev ? dev_net(pt->dev) : pt->af_packet_net;
+ if (pt->type == htons(ETH_P_ALL))
+ net_ptype_all_set_cnt(net, net_ptype_all_cnt(net) + 1);
+ else
+ net_ptype_specific_set_cnt(net,
+ net_ptype_specific_cnt(net) + 1);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
}
@@ -630,6 +637,7 @@ void __dev_remove_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
+ struct net *net;
if (!head)
return;
@@ -637,10 +645,16 @@ void __dev_remove_pack(struct packet_type *pt)
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
- if (pt == pt1) {
- list_del_rcu(&pt->list);
- goto out;
- }
+ if (pt != pt1)
+ continue;
+ list_del_rcu(&pt->list);
+ net = pt->dev ? dev_net(pt->dev) : pt->af_packet_net;
+ if (pt->type == htons(ETH_P_ALL))
+ net_ptype_all_set_cnt(net, net_ptype_all_cnt(net) - 1);
+ else
+ net_ptype_specific_set_cnt(net,
+ net_ptype_specific_cnt(net) - 1);
+ goto out;
}
pr_warn("dev_remove_pack: %p not found\n", pt);
@@ -2483,8 +2497,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
*/
bool dev_nit_active(struct net_device *dev)
{
- return !list_empty(&dev_net(dev)->ptype_all) ||
- !list_empty(&dev->ptype_all);
+ return net_ptype_all_cnt(dev_net(dev)) > 0;
}
EXPORT_SYMBOL_GPL(dev_nit_active);
@@ -5671,11 +5684,49 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
return 0;
}
+static int deliver_ptype_all_skb(struct sk_buff *skb, int ret,
+ struct packet_type **ppt_prev,
+ struct net_device *orig_dev)
+{
+ struct packet_type *ptype, *pt_prev = *ppt_prev;
+
+ list_for_each_entry_rcu(ptype, &dev_net(skb->dev)->ptype_all, list) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+
+ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+ *ppt_prev = pt_prev;
+ return ret;
+}
+
+static void deliver_ptype_specific_skb(struct sk_buff *skb, bool deliver_exact,
+ struct packet_type **ppt_prev,
+ struct net_device *orig_dev,
+ __be16 type)
+{
+ if (deliver_exact)
+ deliver_ptype_list_skb(skb, ppt_prev, orig_dev, type,
+ &dev_net(skb->dev)->ptype_specific);
+
+ deliver_ptype_list_skb(skb, ppt_prev, orig_dev, type,
+ &orig_dev->ptype_specific);
+
+ if (unlikely(skb->dev != orig_dev))
+ deliver_ptype_list_skb(skb, ppt_prev, orig_dev, type,
+ &skb->dev->ptype_specific);
+}
+
static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
- struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
+ struct packet_type *pt_prev;
struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
@@ -5732,17 +5783,8 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &dev_net(skb->dev)->ptype_all, list) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
- }
-
- list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
- }
+ if (net_ptype_all_cnt(dev_net(skb->dev)) > 0)
+ ret = deliver_ptype_all_skb(skb, ret, &pt_prev, orig_dev);
skip_taps:
#ifdef CONFIG_NET_INGRESS
@@ -5840,21 +5882,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
type = skb->protocol;
/* deliver only exact match when indicated */
- if (likely(!deliver_exact)) {
+ if (likely(!deliver_exact))
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
- deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
- &dev_net(orig_dev)->ptype_specific);
- }
-
- deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
- &orig_dev->ptype_specific);
- if (unlikely(skb->dev != orig_dev)) {
- deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
- &skb->dev->ptype_specific);
- }
+ if (net_ptype_specific_cnt(dev_net(skb->dev)) > 0)
+ deliver_ptype_specific_skb(skb, deliver_exact, &pt_prev,
+ orig_dev, type);
if (pt_prev) {
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
@@ -12566,7 +12601,6 @@ static void __init net_dev_struct_check(void)
CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
/* RX read-mostly hotpath */
- CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
@@ -12581,7 +12615,7 @@ static void __init net_dev_struct_check(void)
#ifdef CONFIG_NET_XGRESS
CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
- CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 76);
}
/*
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0dfdf791ece5..551355665aaee 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1174,7 +1174,7 @@ static void __init netns_ipv4_struct_check(void)
/* TXRX readonly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
sysctl_tcp_moderate_rcvbuf);
- CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
+ CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 7);
/* RX readonly hotpath cache line */
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
@@ -1187,7 +1187,7 @@ static void __init netns_ipv4_struct_check(void)
sysctl_tcp_reordering);
CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
sysctl_tcp_rmem);
- CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
+ CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 24);
}
#endif
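
For completeness, and if I recall the TX side correctly: the consumer of
dev_nit_active() is xmit_one() in net/core/dev.c, which is left untouched
by the diff above and looks roughly like this:

	/* With the change above, dev_nit_active() is a single READ_ONCE()
	 * of the per netns counter instead of two list_empty() checks,
	 * so this is the whole TX-side cost when no taps are registered.
	 */
	if (dev_nit_active(dev))
		dev_queue_xmit_nit(skb, dev);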