Message-ID: <20251007001120.2661442-3-kuniyu@google.com>
Date: Tue, 7 Oct 2025 00:07:27 +0000
From: Kuniyuki Iwashima <kuniyu@...gle.com>
To: Alexei Starovoitov <ast@...nel.org>, Andrii Nakryiko <andrii@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>, Martin KaFai Lau <martin.lau@...ux.dev>
Cc: John Fastabend <john.fastabend@...il.com>, Stanislav Fomichev <sdf@...ichev.me>,
"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
Neal Cardwell <ncardwell@...gle.com>, Willem de Bruijn <willemb@...gle.com>,
Mina Almasry <almasrymina@...gle.com>, Roman Gushchin <roman.gushchin@...ux.dev>,
Kuniyuki Iwashima <kuniyu@...gle.com>, Kuniyuki Iwashima <kuni1840@...il.com>, bpf@...r.kernel.org,
netdev@...r.kernel.org
Subject: [PATCH bpf-next/net 2/6] net: Allow opt-out from global protocol
memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_prot->memory_allocated.
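For illustration, here is a minimal userspace model of that charge path
(not kernel code; the names and limits below are made up) showing where a
per-socket bypass flag would short-circuit the protocol-wide counter:

  #include <stdatomic.h>
  #include <stdbool.h>

  /* Stand-ins for sk->sk_prot->memory_allocated and the sysctl limits. */
  static atomic_long proto_memory_allocated;               /* pages, protocol-wide */
  static const long proto_mem_limits[3] = { 1024, 2048, 4096 }; /* min, pressure, max */

  struct model_sock {
          bool bypass_prot_mem;           /* models sk->sk_bypass_prot_mem */
  };

  /* Roughly models __sk_mem_raise_allocated(): charge amt pages, fail over
   * the hard limit, and skip the global counter entirely when opted out. */
  static bool model_charge(struct model_sock *sk, long amt)
  {
          long allocated;

          if (sk->bypass_prot_mem)
                  return true;

          allocated = atomic_fetch_add(&proto_memory_allocated, amt) + amt;
          if (allocated > proto_mem_limits[2]) {
                  atomic_fetch_sub(&proto_memory_allocated, amt);
                  return false;
          }
          return true;
  }

  int main(void)
  {
          struct model_sock normal = { .bypass_prot_mem = false };
          struct model_sock bypass = { .bypass_prot_mem = true };

          model_charge(&normal, 8);       /* counted against the global limit */
          model_charge(&bypass, 8);       /* not counted at all */
          return 0;
  }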
Sometimes, system processes do not want to be subject to that limit. For
a similar purpose, there is SO_RESERVE_MEM for sockets under memcg.
Also, by opting out of the per-protocol accounting, sockets under memcg
can avoid paying the cost of two orthogonal memory accounting mechanisms.
A microbenchmark result is in the subsequent bpf patch.
Let's allow opting out of the per-protocol memory accounting when
sk->sk_bypass_prot_mem is true.
sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache
line, and sk_has_account() always fetches sk->sk_prot before accessing
sk->sk_bypass_prot_mem, so this patch does not add an extra cache miss.
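The layout point can be illustrated with a toy struct (only the field
names come from this patch; the real struct sock_common layout and offsets
differ):

  #include <stdio.h>
  #include <stddef.h>

  /* Toy layout: the new bit fills a hole in an existing bitfield, so it adds
   * no size and sits only a few bytes away from skc_prot in this model. */
  struct sock_common_toy {
          unsigned char skc_reuseport:1;
          unsigned char skc_ipv6only:1;
          unsigned char skc_net_refcnt:1;
          unsigned char skc_bypass_prot_mem:1;    /* new bit, no extra byte */
          int           skc_bound_dev_if;
          void         *skc_prot;                 /* stand-in for struct proto * */
  };

  int main(void)
  {
          printf("flag byte at offset 0, skc_prot at offset %zu, size %zu\n",
                 offsetof(struct sock_common_toy, skc_prot),
                 sizeof(struct sock_common_toy));
          return 0;
  }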
The following patches will set sk->sk_bypass_prot_mem to true, and the
per-protocol memory accounting will then be skipped.
Note that this does NOT disable memcg accounting, only the per-protocol
one.
Another option that avoids using the hole in struct sock_common would be
to create sk_prot variants like tcp_prot_bypass, but this would complicate
the SOCKMAP logic, tcp_bpf_prots, etc., as sketched below.
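For context, the rejected alternative would look roughly like the model
below (userspace sketch, all names invented for illustration): a second
proto table with no global counter, selected per socket by swapping the
prot pointer, which every prot-swapping path would then have to know about:

  #include <stdbool.h>
  #include <stddef.h>

  struct proto_model {
          long *memory_allocated;                 /* NULL => opted out */
  };

  static long tcp_pages;
  static struct proto_model tcp_prot_model        = { .memory_allocated = &tcp_pages };
  static struct proto_model tcp_prot_bypass_model = { .memory_allocated = NULL };

  struct sock_model {
          struct proto_model *sk_prot;
  };

  /* Analogue of sk_has_account(): accounting is on iff the counter exists. */
  static bool has_account(const struct sock_model *sk)
  {
          return sk->sk_prot->memory_allocated != NULL;
  }

  int main(void)
  {
          /* The cost: every path that rewrites sk_prot (the SOCKMAP /
           * tcp_bpf_prots analogue) must pick the right variant. */
          struct sock_model sk = { .sk_prot = &tcp_prot_bypass_model };

          return has_account(&sk) ? 1 : 0;
  }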
Signed-off-by: Kuniyuki Iwashima <kuniyu@...gle.com>
---
 include/net/proto_memory.h |  3 +++
 include/net/sock.h         |  3 +++
 include/net/tcp.h          |  3 +++
 net/core/sock.c            | 32 +++++++++++++++++++++++++-------
 net/ipv4/tcp.c             |  3 ++-
 net/ipv4/tcp_output.c      |  7 ++++++-
 net/mptcp/protocol.c       |  7 ++++---
 net/tls/tls_device.c       |  3 ++-
 8 files changed, 48 insertions(+), 13 deletions(-)
diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
index 8e91a8fa31b5..ad6d703ce6fe 100644
--- a/include/net/proto_memory.h
+++ b/include/net/proto_memory.h
@@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
mem_cgroup_sk_under_memory_pressure(sk))
return true;
+ if (sk->sk_bypass_prot_mem)
+ return false;
+
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
}
diff --git a/include/net/sock.h b/include/net/sock.h
index 60bcb13f045c..5cf8de6b6bf2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair;
* @skc_reuseport: %SO_REUSEPORT setting
* @skc_ipv6only: socket is IPV6 only
* @skc_net_refcnt: socket is using net ref counting
+ * @skc_bypass_prot_mem: bypass the per-protocol memory accounting
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
* @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
@@ -174,6 +175,7 @@ struct sock_common {
unsigned char skc_reuseport:1;
unsigned char skc_ipv6only:1;
unsigned char skc_net_refcnt:1;
+ unsigned char skc_bypass_prot_mem:1;
int skc_bound_dev_if;
union {
struct hlist_node skc_bind_node;
@@ -380,6 +382,7 @@ struct sock {
#define sk_reuseport __sk_common.skc_reuseport
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
+#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5ca230ed526a..d52ed4fe2335 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk)
mem_cgroup_sk_under_memory_pressure(sk))
return true;
+ if (sk->sk_bypass_prot_mem)
+ return false;
+
return READ_ONCE(tcp_memory_pressure);
}
/*
diff --git a/net/core/sock.c b/net/core/sock.c
index dc03d4b5909a..7de189ec2556 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1046,9 +1046,13 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
if (!charged)
return -ENOMEM;
+ if (sk->sk_bypass_prot_mem)
+ goto success;
+
/* pre-charge to forward_alloc */
sk_memory_allocated_add(sk, pages);
allocated = sk_memory_allocated(sk);
+
/* If the system goes into memory pressure with this
* precharge, give up and return error.
*/
@@ -1057,6 +1061,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
mem_cgroup_sk_uncharge(sk, pages);
return -ENOMEM;
}
+
+success:
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
WRITE_ONCE(sk->sk_reserved_mem,
@@ -3136,8 +3142,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
return true;
- sk_enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ sk_enter_memory_pressure(sk);
+
sk_stream_moderate_sndbuf(sk);
+
return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3254,10 +3263,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
bool memcg_enabled = false, charged = false;
struct proto *prot = sk->sk_prot;
- long allocated;
+ long allocated = 0;
- sk_memory_allocated_add(sk, amt);
- allocated = sk_memory_allocated(sk);
+ if (!sk->sk_bypass_prot_mem) {
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ }
if (mem_cgroup_sk_enabled(sk)) {
memcg_enabled = true;
@@ -3266,6 +3277,9 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
goto suppress_allocation;
}
+ if (!allocated)
+ return 1;
+
/* Under limit. */
if (allocated <= sk_prot_mem_limits(sk, 0)) {
sk_leave_memory_pressure(sk);
@@ -3344,7 +3358,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
- sk_memory_allocated_sub(sk, amt);
+ if (allocated)
+ sk_memory_allocated_sub(sk, amt);
if (charged)
mem_cgroup_sk_uncharge(sk, amt);
@@ -3383,11 +3398,14 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk, amount);
-
if (mem_cgroup_sk_enabled(sk))
mem_cgroup_sk_uncharge(sk, amount);
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_sub(sk, amount);
+
if (sk_under_global_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7949d16506a4..feb0aea23a59 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -928,7 +928,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
}
__kfree_skb(skb);
} else {
- sk->sk_prot->enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ tcp_enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bb3576ac0ad7..0fb2a3bb62cd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3732,12 +3732,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
delta = size - sk->sk_forward_alloc;
if (delta <= 0)
return;
+
amt = sk_mem_pages(delta);
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
- sk_memory_allocated_add(sk, amt);
if (mem_cgroup_sk_enabled(sk))
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
}
/* Send a FIN. The caller locks the socket for us.
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 0292162a14ee..94a5f6dcc577 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1065,11 +1065,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- if (first)
+ if (first && !ssk->sk_bypass_prot_mem) {
tcp_enter_memory_pressure(ssk);
- sk_stream_moderate_sndbuf(ssk);
+ first = false;
+ }
- first = false;
+ sk_stream_moderate_sndbuf(ssk);
}
__mptcp_sync_sndbuf(sk);
}
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a64ae15b1a60..caa2b5d24622 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -373,7 +373,8 @@ static int tls_do_allocation(struct sock *sk,
if (!offload_ctx->open_record) {
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
sk->sk_allocation))) {
- READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
return -ENOMEM;
}
--
2.51.0.710.ga91ca5db03-goog