lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sun, 15 May 2022 21:24:55 -0700
From:   Eric Dumazet <eric.dumazet@...il.com>
To:     "David S . Miller" <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Paolo Abeni <pabeni@...hat.com>
Cc:     netdev <netdev@...r.kernel.org>,
        Eric Dumazet <edumazet@...gle.com>,
        Eric Dumazet <eric.dumazet@...il.com>
Subject: [PATCH net-next 3/4] net: add skb_defer_max sysctl

From: Eric Dumazet <edumazet@...gle.com>

commit 68822bdf76f1 ("net: generalize skb freeing
deferral to per-cpu lists") added another per-cpu
cache of skbs. It was expected to be small,
and an IPI was forced whenever the list reached 128
skbs.

We might need to be able to control more precisely
queue capacity and added latency.

An IPI is generated whenever queue reaches half capacity.

Default value of the new limit is 64.

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 Documentation/admin-guide/sysctl/net.rst |  8 ++++++++
 net/core/dev.c                           |  1 +
 net/core/dev.h                           |  2 +-
 net/core/skbuff.c                        | 15 +++++++++------
 net/core/sysctl_net_core.c               |  8 ++++++++
 5 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index f86b5e1623c6922b070fd7c62e83271ee9aee46c..fa4dcdb283cf8937df8414906b57949cc7a3c2bc 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -322,6 +322,14 @@ a leaked reference faster. A larger value may be useful to prevent false
 warnings on slow/loaded systems.
 Default value is 10, minimum 1, maximum 3600.
 
+skb_defer_max
+-------------
+
+Max size (in skbs) of the per-cpu list of skbs being freed
+by the cpu which allocated them. Used by TCP stack so far.
+
+Default: 64
+
 optmem_max
 ----------
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 35b6d79b0c51412534dc3b3374b8d797d212f2d8..ac22fedfeaf72dc0d46f4793bbd9b2d5dd301730 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4329,6 +4329,7 @@ int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
+unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int netdev_budget __read_mostly = 300;
 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
diff --git a/net/core/dev.h b/net/core/dev.h
index 328b37af90ba9465d83c833dffd18547ddef4028..cbb8a925175a257f8ce2a27eebb02e03041eebb8 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev);
 /* sysctls not referred to from outside net/core/ */
 extern int		netdev_budget;
 extern unsigned int	netdev_budget_usecs;
-
+extern unsigned int	sysctl_skb_defer_max;
 extern int		netdev_tstamp_prequeue;
 extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e2180682f2e94c45e3f26059af6d18be2d9f9d3..ecb8fe7045a2f9c080cd0299ff7c0c1ea88d996b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -80,6 +80,7 @@
 #include <linux/user_namespace.h>
 #include <linux/indirect_call_wrapper.h>
 
+#include "dev.h"
 #include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
@@ -6494,16 +6495,21 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	int cpu = skb->alloc_cpu;
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int defer_max;
 	bool kick;
 
 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
 	    !cpu_online(cpu) ||
 	    cpu == raw_smp_processor_id()) {
-		__kfree_skb(skb);
+nodefer:	__kfree_skb(skb);
 		return;
 	}
 
 	sd = &per_cpu(softnet_data, cpu);
+	defer_max = READ_ONCE(sysctl_skb_defer_max);
+	if (READ_ONCE(sd->defer_count) >= defer_max)
+		goto nodefer;
+
 	/* We do not send an IPI or any signal.
 	 * Remote cpu will eventually call skb_defer_free_flush()
 	 */
@@ -6513,11 +6519,8 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	WRITE_ONCE(sd->defer_list, skb);
 	sd->defer_count++;
 
-	/* kick every time queue length reaches 128.
-	 * This condition should hardly be hit under normal conditions,
-	 * unless cpu suddenly stopped to receive NIC interrupts.
-	 */
-	kick = sd->defer_count == 128;
+	/* Send an IPI every time queue reaches half capacity. */
+	kick = sd->defer_count == (defer_max >> 1);
 
 	spin_unlock_irqrestore(&sd->defer_lock, flags);
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 195ca5c2877125bf420128d3c6465ac216f459e5..ca8d38325e1e1d7775d61893fab94ff9499ef5f8 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -578,6 +578,14 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &int_3600,
 	},
+	{
+		.procname	= "skb_defer_max",
+		.data		= &sysctl_skb_defer_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{ }
 };
 
-- 
2.36.0.550.gb090851708-goog

Powered by blists - more mailing lists