[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20260129153458.4163797-1-edumazet@google.com>
Date: Thu, 29 Jan 2026 15:34:58 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
Cc: Simon Horman <horms@...nel.org>, Neal Cardwell <ncardwell@...gle.com>,
Kuniyuki Iwashima <kuniyu@...gle.com>, netdev@...r.kernel.org, eric.dumazet@...il.com,
Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH net-next] tcp: reduce tcp sockets size by one cache line
By default, when a kmem_cache is created with SLAB_TYPESAFE_BY_RCU,
slub has to use extra storage for the freelist pointer after each
object, because slub assumes that any bit in the object
can be used by RCU readers.
Because proto_register() is also using SLAB_HWCACHE_ALIGN,
this forces slub to use one extra cache line per object.
We can instead put the slub freelist anywhere in the object,
granted the concurrent RCU readers are not supposed to
use the pointer value.
Add a new (struct sock)sk_freeptr field, in an union
with sk_rcu: No RCU readers would need to look at sk_rcu,
which is only used at free phase.
Tested:
grep . /sys/kernel/slab/TCP/{object_size,slab_size,objs_per_slab}
grep . /sys/kernel/slab/TCPv6/{object_size,slab_size,objs_per_slab}
Before:
/sys/kernel/slab/TCP/object_size:2368
/sys/kernel/slab/TCP/slab_size:2432
/sys/kernel/slab/TCP/objs_per_slab:13
/sys/kernel/slab/TCPv6/object_size:2496
/sys/kernel/slab/TCPv6/slab_size:2560
/sys/kernel/slab/TCPv6/objs_per_slab:12
After this patch, we can pack one more TCPv6 object per slab,
and object_size == slab_size.
/sys/kernel/slab/TCP/object_size:2368
/sys/kernel/slab/TCP/slab_size:2368
/sys/kernel/slab/TCP/objs_per_slab:13
/sys/kernel/slab/TCPv6/object_size:2496
/sys/kernel/slab/TCPv6/slab_size:2496
/sys/kernel/slab/TCPv6/objs_per_slab:13
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
include/net/sock.h | 11 ++++++++++-
net/core/sock.c | 16 ++++++++++------
net/ipv4/tcp_ipv4.c | 2 ++
net/ipv6/tcp_ipv6.c | 2 ++
4 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index aafe8bdb2c0f936bc3a179e394c2df6830419997..66b56288c1d3850439b2a0bed00be801d5770efa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -341,6 +341,7 @@ struct sk_filter;
* @sk_reuseport_cb: reuseport group container
* @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
* @sk_rcu: used during RCU grace period
+ * @sk_freeptr: used for SLAB_TYPESAFE_BY_RCU managed sockets
* @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
@@ -582,7 +583,14 @@ struct sock {
struct bpf_local_storage __rcu *sk_bpf_storage;
#endif
struct numa_drop_counters *sk_drop_counters;
- struct rcu_head sk_rcu;
+ /* sockets using SLAB_TYPESAFE_BY_RCU can use sk_freeptr.
+ * By the time kfree() is called, sk_rcu can not be in
+ * use and can be mangled.
+ */
+ union {
+ struct rcu_head sk_rcu;
+ freeptr_t sk_freeptr;
+ };
netns_tracker ns_tracker;
struct xarray sk_user_frags;
@@ -1368,6 +1376,7 @@ struct proto {
struct kmem_cache *slab;
unsigned int obj_size;
+ unsigned int freeptr_offset;
unsigned int ipv6_pinfo_offset;
slab_flags_t slab_flags;
unsigned int useroffset; /* Usercopy region offset */
diff --git a/net/core/sock.c b/net/core/sock.c
index a1c8b47b0d5662b882dc0f9257c54ed312c383b4..693e6d80f501ef552aa58928f28b78a578169536 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab)
return -EINVAL;
}
if (alloc_slab) {
- prot->slab = kmem_cache_create_usercopy(prot->name,
- prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
- prot->slab_flags,
- prot->useroffset, prot->usersize,
- NULL);
+ struct kmem_cache_args args = {
+ .useroffset = prot->useroffset,
+ .usersize = prot->usersize,
+ .freeptr_offset = prot->freeptr_offset,
+ .use_freeptr_offset = !!prot->freeptr_offset,
+ };
+ prot->slab = kmem_cache_create(prot->name, prot->obj_size,
+ &args,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+ prot->slab_flags);
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ffdf52fbf6463b41d7c712f3710b681ecdf6e2d7..0fc8a42921aabac27dcb7c6a9db811498edbb31c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3460,6 +3460,8 @@ struct proto tcp_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .freeptr_offset = offsetof(struct tcp_sock,
+ inet_conn.icsk_inet.sk.sk_freeptr),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 4ae664b05fa9171ed996bf8f3b6e7b2aaa63d5c9..8bf29186c15f99dd2ab63d2b0b3890ed0c68d514 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2332,6 +2332,8 @@ struct proto tcpv6_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
+ .freeptr_offset = offsetof(struct tcp6_sock,
+ tcp.inet_conn.icsk_inet.sk.sk_freeptr),
.ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
--
2.53.0.rc1.217.geba53bf80e-goog
Powered by blists - more mailing lists