lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20260129153458.4163797-1-edumazet@google.com>
Date: Thu, 29 Jan 2026 15:34:58 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Paolo Abeni <pabeni@...hat.com>
Cc: Simon Horman <horms@...nel.org>, Neal Cardwell <ncardwell@...gle.com>, 
	Kuniyuki Iwashima <kuniyu@...gle.com>, netdev@...r.kernel.org, eric.dumazet@...il.com, 
	Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH net-next] tcp: reduce tcp sockets size by one cache line

By default, when a kmem_cache is created with SLAB_TYPESAFE_BY_RCU,
slub has to use extra storage for the freelist pointer after each
object, because slub assumes that any bit in the object
can be used by RCU readers.

Because proto_register() is also using SLAB_HWCACHE_ALIGN,
this forces slub to use one extra cache line per object.

We can instead put the slub freelist anywhere in the object,
granted the concurrent RCU readers are not supposed to
use the pointer value.

Add a new (struct sock)sk_freeptr field, in an union
with sk_rcu: No RCU readers would need to look at sk_rcu,
which is only used at free phase.

Tested:

grep . /sys/kernel/slab/TCP/{object_size,slab_size,objs_per_slab}
grep . /sys/kernel/slab/TCPv6/{object_size,slab_size,objs_per_slab}

Before:

/sys/kernel/slab/TCP/object_size:2368
/sys/kernel/slab/TCP/slab_size:2432
/sys/kernel/slab/TCP/objs_per_slab:13

/sys/kernel/slab/TCPv6/object_size:2496
/sys/kernel/slab/TCPv6/slab_size:2560
/sys/kernel/slab/TCPv6/objs_per_slab:12

After this patch, we can pack one more TCPv6 object per slab,
and object_size == slab_size.

/sys/kernel/slab/TCP/object_size:2368
/sys/kernel/slab/TCP/slab_size:2368
/sys/kernel/slab/TCP/objs_per_slab:13

/sys/kernel/slab/TCPv6/object_size:2496
/sys/kernel/slab/TCPv6/slab_size:2496
/sys/kernel/slab/TCPv6/objs_per_slab:13

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 include/net/sock.h  | 11 ++++++++++-
 net/core/sock.c     | 16 ++++++++++------
 net/ipv4/tcp_ipv4.c |  2 ++
 net/ipv6/tcp_ipv6.c |  2 ++
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index aafe8bdb2c0f936bc3a179e394c2df6830419997..66b56288c1d3850439b2a0bed00be801d5770efa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -341,6 +341,7 @@ struct sk_filter;
   *	@sk_reuseport_cb: reuseport group container
   *	@sk_bpf_storage: ptr to cache and control for bpf_sk_storage
   *	@sk_rcu: used during RCU grace period
+  *	@sk_freeptr: used for SLAB_TYPESAFE_BY_RCU managed sockets
   *	@sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
   *	@sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
   *	@sk_txtime_report_errors: set report errors mode for SO_TXTIME
@@ -582,7 +583,14 @@ struct sock {
 	struct bpf_local_storage __rcu	*sk_bpf_storage;
 #endif
 	struct numa_drop_counters *sk_drop_counters;
-	struct rcu_head		sk_rcu;
+	/* sockets using SLAB_TYPESAFE_BY_RCU can use sk_freeptr.
+	 * By the time kfree() is called, sk_rcu can not be in
+	 * use and can be mangled.
+	 */
+	union {
+		struct rcu_head	sk_rcu;
+		freeptr_t	sk_freeptr;
+	};
 	netns_tracker		ns_tracker;
 	struct xarray		sk_user_frags;
 
@@ -1368,6 +1376,7 @@ struct proto {
 
 	struct kmem_cache	*slab;
 	unsigned int		obj_size;
+	unsigned int		freeptr_offset;
 	unsigned int		ipv6_pinfo_offset;
 	slab_flags_t		slab_flags;
 	unsigned int		useroffset;	/* Usercopy region offset */
diff --git a/net/core/sock.c b/net/core/sock.c
index a1c8b47b0d5662b882dc0f9257c54ed312c383b4..693e6d80f501ef552aa58928f28b78a578169536 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab)
 		return -EINVAL;
 	}
 	if (alloc_slab) {
-		prot->slab = kmem_cache_create_usercopy(prot->name,
-					prot->obj_size, 0,
-					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
-					prot->slab_flags,
-					prot->useroffset, prot->usersize,
-					NULL);
+		struct kmem_cache_args args = {
+			.useroffset	= prot->useroffset,
+			.usersize	= prot->usersize,
+			.freeptr_offset = prot->freeptr_offset,
+			.use_freeptr_offset = !!prot->freeptr_offset,
+		};
 
+		prot->slab = kmem_cache_create(prot->name, prot->obj_size,
+					&args,
+					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+					prot->slab_flags);
 		if (prot->slab == NULL) {
 			pr_crit("%s: Can't create sock SLAB cache!\n",
 				prot->name);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ffdf52fbf6463b41d7c712f3710b681ecdf6e2d7..0fc8a42921aabac27dcb7c6a9db811498edbb31c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3460,6 +3460,8 @@ struct proto tcp_prot = {
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
+	.freeptr_offset		= offsetof(struct tcp_sock,
+					   inet_conn.icsk_inet.sk.sk_freeptr),
 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
 	.twsk_prot		= &tcp_timewait_sock_ops,
 	.rsk_prot		= &tcp_request_sock_ops,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4ae664b05fa9171ed996bf8f3b6e7b2aaa63d5c9..8bf29186c15f99dd2ab63d2b0b3890ed0c68d514 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2332,6 +2332,8 @@ struct proto tcpv6_prot = {
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
+	.freeptr_offset		= offsetof(struct tcp6_sock,
+					   tcp.inet_conn.icsk_inet.sk.sk_freeptr),
 	.ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6),
 	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
 	.twsk_prot		= &tcp6_timewait_sock_ops,
-- 
2.53.0.rc1.217.geba53bf80e-goog


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ