Message-ID: <1c9188a9-2b22-4350-ac99-3a5048ccd440@redhat.com>
Date: Mon, 17 Mar 2025 10:18:45 +0100
From: Paolo Abeni <pabeni@...hat.com>
To: netdev@...r.kernel.org
Cc: Andrew Lunn <andrew+netdev@...n.ch>, "David S. Miller"
 <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
 Jakub Kicinski <kuba@...nel.org>, Simon Horman <horms@...nel.org>,
 Jonathan Corbet <corbet@....net>, Sabrina Dubroca <sd@...asysnail.net>
Subject: Re: [RFC PATCH 2/2] net: hotdata optimization for netns ptypes



On 3/14/25 2:05 PM, Paolo Abeni wrote:
> Per netns ptype usage is/should be an exception, but the current code
> unconditionally touches the related lists for each RX and TX packet.
> 
> Add a per device flag in the hot data net_device section to cache the
> 'netns ptype required' information, and update it according to the
> relevant netns status. The new fields are placed in existing holes,
> moved slightly to fit the relevant cacheline groups.
> 
> Keep such flag up to date when new devices are created and/or devices
> are moved across namespaces, initializing it in list_netdevice().
> 
> In the fast path we can skip per-netns list processing when such flag
> is clear.
> 
> This avoids touching, in the fast path, the additional cacheline needed
> by the previous patch.
> 
> Signed-off-by: Paolo Abeni <pabeni@...hat.com>
> ---
> This is a little cumbersome for possibly little gain. An alternative
> could be caching even the per-device list status in similar
> flags. Both RX and TX could then use a single conditional to completely
> skip all the per dev/netns lists, potentially even moving the per
> device lists out of hotdata.
> 
> Side note: despite being unconditionally touched in the fast path on
> both RX and TX, dev->ptype_all is currently not placed in any hot data
> cacheline group.
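
For reference, the approach described in the quoted patch boils down to
roughly the sketch below; the netns_has_ptype field and both helper names
are made up here for illustration, they are not the actual patch 2/2 code:

static void netdev_refresh_netns_ptype(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	/* called from list_netdevice(), i.e. at register time and on
	 * netns moves
	 */
	WRITE_ONCE(dev->netns_has_ptype,
		   !list_empty(&net->ptype_all) ||
		   !list_empty(&net->ptype_specific));
}

static bool netdev_netns_ptype_active(const struct net_device *dev)
{
	/* fast path: one hot data flag instead of two list_empty() checks */
	return READ_ONCE(dev->netns_has_ptype);
}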

I think we could consider non-empty ptype_all/specific lists as a slightly
less fast path - packet socket processing will add considerably more
overhead.
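
In practice these lists are populated by AF_PACKET sockets; roughly, the
userspace snippet below (e.g. what tcpdump -i any opens) ends up in
dev_add_pack() with type ETH_P_ALL and no bound device, i.e. on the
per-netns ptype_all list:

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <sys/socket.h>

int main(void)
{
	/* unbound packet socket capturing every protocol; needs CAP_NET_RAW */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	return fd < 0;
}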

If so, we could consolidate the packet type checks into a couple of
read-mostly counters in hotdata, decreasing both the number of conditionals
and the data set size in the fast path. I'll test something like the following:

---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0dbfe069a6e38..755dc9d9f9f4a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2134,7 +2134,6 @@ struct net_device {
 	/* RX read-mostly hotpath */
 	__cacheline_group_begin(net_device_read_rx);
 	struct bpf_prog __rcu	*xdp_prog;
-	struct list_head	ptype_specific;
 	int			ifindex;
 	unsigned int		real_num_rx_queues;
 	struct netdev_rx_queue	*_rx;
@@ -2174,6 +2173,7 @@ struct net_device {
 	struct list_head	unreg_list;
 	struct list_head	close_list;
 	struct list_head	ptype_all;
+	struct list_head	ptype_specific;

 	struct {
 		struct list_head upper;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bd57d8fb54f14..41ea0febf2260 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -575,6 +575,26 @@ static inline void fnhe_genid_bump(struct net *net)
 	atomic_inc(&net->fnhe_genid);
 }

+static inline int net_ptype_all_cnt(const struct net *net)
+{
+	return READ_ONCE(net->ipv4.ptype_all_count);
+}
+
+static inline void net_ptype_all_set_cnt(struct net *net, int cnt)
+{
+	WRITE_ONCE(net->ipv4.ptype_all_count, cnt);
+}
+
+static inline int net_ptype_specific_cnt(const struct net *net)
+{
+	return READ_ONCE(net->ipv4.ptype_specific_count);
+}
+
+static inline void net_ptype_specific_set_cnt(struct net *net, int cnt)
+{
+	WRITE_ONCE(net->ipv4.ptype_specific_count, cnt);
+}
+
 #ifdef CONFIG_NET
 void net_ns_init(void);
 #else
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 6373e3f17da84..8df28c3c88929 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -75,6 +75,8 @@ struct netns_ipv4 {
 	/* TXRX readonly hotpath cache lines */
 	__cacheline_group_begin(netns_ipv4_read_txrx);
 	u8 sysctl_tcp_moderate_rcvbuf;
+	/* 2 bytes hole */
+	int ptype_all_count;
 	__cacheline_group_end(netns_ipv4_read_txrx);

 	/* RX readonly hotpath cache line */
@@ -82,9 +84,10 @@ struct netns_ipv4 {
 	u8 sysctl_ip_early_demux;
 	u8 sysctl_tcp_early_demux;
 	u8 sysctl_tcp_l3mdev_accept;
-	/* 3 bytes hole, try to pack */
+	/* 1 byte hole, try to pack */
 	int sysctl_tcp_reordering;
 	int sysctl_tcp_rmem[3];
+	int ptype_specific_count;
 	__cacheline_group_end(netns_ipv4_read_rx);

 	struct inet_timewait_death_row tcp_death_row;
diff --git a/net/core/dev.c b/net/core/dev.c
index ad1853da0a4b5..51cce0a164937 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -603,11 +603,18 @@ static inline struct list_head *ptype_head(const struct packet_type *pt)
 void dev_add_pack(struct packet_type *pt)
 {
 	struct list_head *head = ptype_head(pt);
+	struct net *net;

 	if (WARN_ON_ONCE(!head))
 		return;

 	spin_lock(&ptype_lock);
+	net = pt->dev ? dev_net(pt->dev) : pt->af_packet_net; /* NULL for ptype_base handlers */
+	if (net && pt->type == htons(ETH_P_ALL))
+		net_ptype_all_set_cnt(net, net_ptype_all_cnt(net) + 1);
+	else if (net)
+		net_ptype_specific_set_cnt(net,
+					   net_ptype_specific_cnt(net) + 1);
 	list_add_rcu(&pt->list, head);
 	spin_unlock(&ptype_lock);
 }
@@ -630,6 +637,7 @@ void __dev_remove_pack(struct packet_type *pt)
 {
 	struct list_head *head = ptype_head(pt);
 	struct packet_type *pt1;
+	struct net *net;

 	if (!head)
 		return;
@@ -637,10 +645,16 @@ void __dev_remove_pack(struct packet_type *pt)
 	spin_lock(&ptype_lock);

 	list_for_each_entry(pt1, head, list) {
-		if (pt == pt1) {
-			list_del_rcu(&pt->list);
-			goto out;
-		}
+		if (pt != pt1)
+			continue;
+		list_del_rcu(&pt->list);
+		net = pt->dev ? dev_net(pt->dev) : pt->af_packet_net;
+		if (net && pt->type == htons(ETH_P_ALL))
+			net_ptype_all_set_cnt(net, net_ptype_all_cnt(net) - 1);
+		else if (net)
+			net_ptype_specific_set_cnt(net,
+					      net_ptype_specific_cnt(net) - 1);
+		goto out;
 	}

 	pr_warn("dev_remove_pack: %p not found\n", pt);
@@ -2483,8 +2497,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
  */
 bool dev_nit_active(struct net_device *dev)
 {
-	return !list_empty(&dev_net(dev)->ptype_all) ||
-	       !list_empty(&dev->ptype_all);
+	return net_ptype_all_cnt(dev_net(dev)) > 0;
 }
 EXPORT_SYMBOL_GPL(dev_nit_active);

@@ -5671,11 +5684,49 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 	return 0;
 }

+static int deliver_ptype_all_skb(struct sk_buff *skb, int ret,
+				 struct packet_type **ppt_prev,
+				 struct net_device *orig_dev)
+{
+	struct packet_type *ptype, *pt_prev = *ppt_prev;
+
+	list_for_each_entry_rcu(ptype, &dev_net(skb->dev)->ptype_all, list) {
+		if (pt_prev)
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+		pt_prev = ptype;
+	}
+
+	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+		if (pt_prev)
+			ret = deliver_skb(skb, pt_prev, orig_dev);
+		pt_prev = ptype;
+	}
+	*ppt_prev = pt_prev;
+	return ret;
+}
+
+static void deliver_ptype_specific_skb(struct sk_buff *skb, bool deliver_exact,
+				       struct packet_type **ppt_prev,
+				       struct net_device *orig_dev,
+				       __be16 type)
+{
+	if (!deliver_exact)
+		deliver_ptype_list_skb(skb, ppt_prev, orig_dev, type,
+				       &dev_net(skb->dev)->ptype_specific);
+
+	deliver_ptype_list_skb(skb, ppt_prev, orig_dev, type,
+			       &orig_dev->ptype_specific);
+
+	if (unlikely(skb->dev != orig_dev))
+		deliver_ptype_list_skb(skb, ppt_prev, orig_dev, type,
+				       &skb->dev->ptype_specific);
+}
+
 static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 				    struct packet_type **ppt_prev)
 {
-	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
+	struct packet_type *pt_prev;
 	struct sk_buff *skb = *pskb;
 	struct net_device *orig_dev;
 	bool deliver_exact = false;
@@ -5732,17 +5783,8 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 	if (pfmemalloc)
 		goto skip_taps;

-	list_for_each_entry_rcu(ptype, &dev_net(skb->dev)->ptype_all, list) {
-		if (pt_prev)
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-		pt_prev = ptype;
-	}
-
-	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
-		if (pt_prev)
-			ret = deliver_skb(skb, pt_prev, orig_dev);
-		pt_prev = ptype;
-	}
+	if (net_ptype_all_cnt(dev_net(skb->dev)) > 0)
+		ret = deliver_ptype_all_skb(skb, ret, &pt_prev, orig_dev);

 skip_taps:
 #ifdef CONFIG_NET_INGRESS
@@ -5840,21 +5882,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 	type = skb->protocol;

 	/* deliver only exact match when indicated */
-	if (likely(!deliver_exact)) {
+	if (likely(!deliver_exact))
 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 				       &ptype_base[ntohs(type) &
 						   PTYPE_HASH_MASK]);
-		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
-				       &dev_net(orig_dev)->ptype_specific);
-	}
-
-	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
-			       &orig_dev->ptype_specific);

-	if (unlikely(skb->dev != orig_dev)) {
-		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
-				       &skb->dev->ptype_specific);
-	}
+	if (net_ptype_specific_cnt(dev_net(skb->dev)) > 0)
+		deliver_ptype_specific_skb(skb, deliver_exact, &pt_prev,
+					   orig_dev, type);

 	if (pt_prev) {
 		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
@@ -12566,7 +12601,6 @@ static void __init net_dev_struct_check(void)
 	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);

 	/* RX read-mostly hotpath */
-	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
@@ -12581,7 +12615,7 @@ static void __init net_dev_struct_check(void)
 #ifdef CONFIG_NET_XGRESS
 	CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
 #endif
-	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
+	CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 76);
 }

 /*
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0dfdf791ece5..551355665aaee 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1174,7 +1174,7 @@ static void __init netns_ipv4_struct_check(void)
 	/* TXRX readonly hotpath cache lines */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
 				      sysctl_tcp_moderate_rcvbuf);
-	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);
+	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 7);

 	/* RX readonly hotpath cache line */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
@@ -1187,7 +1187,7 @@ static void __init netns_ipv4_struct_check(void)
 				      sysctl_tcp_reordering);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
 				      sysctl_tcp_rmem);
-	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
+	CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 24);
 }
 #endif


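FWIW, the size changes in the group asserts are consistent: dropping the
struct list_head from net_device_read_rx saves 16 bytes on 64 bit
(92 - 16 = 76), netns_ipv4_read_txrx grows from a lone u8 to
u8 + 2 byte hole + int (1 + 2 + 4 = 7), and netns_ipv4_read_rx gains an
int while its hole shrinks by 2 bytes (22 - 2 + 4 = 24).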