Message-ID: <20250910192057.1045711-3-kuniyu@google.com>
Date: Wed, 10 Sep 2025 19:19:29 +0000
From: Kuniyuki Iwashima <kuniyu@...gle.com>
To: Alexei Starovoitov <ast@...nel.org>, Andrii Nakryiko <andrii@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>, Martin KaFai Lau <martin.lau@...ux.dev>
Cc: John Fastabend <john.fastabend@...il.com>, Stanislav Fomichev <sdf@...ichev.me>,
Johannes Weiner <hannes@...xchg.org>, Michal Hocko <mhocko@...nel.org>,
Roman Gushchin <roman.gushchin@...ux.dev>, Shakeel Butt <shakeel.butt@...ux.dev>,
"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
Neal Cardwell <ncardwell@...gle.com>, Willem de Bruijn <willemb@...gle.com>,
Mina Almasry <almasrymina@...gle.com>, Kuniyuki Iwashima <kuniyu@...gle.com>,
Kuniyuki Iwashima <kuni1840@...il.com>, bpf@...r.kernel.org, netdev@...r.kernel.org
Subject: [PATCH v8 bpf-next/net 2/6] net-memcg: Allow decoupling memcg from
global protocol memory accounting.
Some protocols (e.g., TCP, UDP) implement memory accounting for socket
buffers and charge memory to per-protocol global counters pointed to by
sk->sk_prot->memory_allocated.
If a socket has sk->sk_memcg, this memory is also charged to memcg as
"sock" in memory.stat.
We do not need to pay the cost of two orthogonal memory accounting
mechanisms. A microbenchmark result is in the subsequent bpf patch.
Let's decouple sockets under memcg from the global per-protocol memory
accounting if mem_cgroup_sk_exclusive() returns true.
Note that this does NOT disable memcg accounting, but rather the
per-protocol one.
mem_cgroup_sk_exclusive() will start to return true in the following
patches, at which point the per-protocol memory accounting will be
skipped.
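For example, with a memcg-exclusive socket the charge path
short-circuits before the global counter is touched; a sketch of the
new __sk_mem_raise_allocated() flow:

	if (mem_cgroup_sk_enabled(sk)) {
		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());

		/* Memcg-exclusive sockets bypass sk_prot->memory_allocated. */
		if (mem_cgroup_sk_exclusive(sk) && charged)
			return 1;
	}

	sk_memory_allocated_add(sk, amt);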
In __inet_accept(), we need to reclaim counts that have already been
charged to the per-protocol counter for child sockets, because we do
not allocate sk->sk_memcg until accept().
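Roughly (mirroring the __inet_accept() hunk below), the reclaimed
amount is the pages backing sk_forward_alloc plus the already-queued
receive memory:

	amt = sk_mem_pages(newsk->sk_forward_alloc +
			   atomic_read(&newsk->sk_rmem_alloc));
	if (amt) {
		/* Already charged to sk_prot->memory_allocated. */
		if (mem_cgroup_sk_exclusive(newsk))
			sk_memory_allocated_sub(newsk, amt);

		mem_cgroup_sk_charge(newsk, amt, gfp);
	}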
trace_sock_exceed_buf_limit() will always show 0 as the accounted
amount for memcg-exclusive sockets, but the same information is
available in memory.stat.
Signed-off-by: Kuniyuki Iwashima <kuniyu@...gle.com>
Nacked-by: Johannes Weiner <hannes@...xchg.org>
---
v7: Reorder before sysctl & bpf patches
v6: Update commit message
---
include/net/proto_memory.h | 15 ++++++--
include/net/sock.h | 10 ++++++
include/net/tcp.h | 10 ++++--
net/core/sock.c | 64 ++++++++++++++++++++++-----------
net/ipv4/af_inet.c | 12 ++++++-
net/ipv4/inet_connection_sock.c | 1 +
net/ipv4/tcp.c | 3 +-
net/ipv4/tcp_output.c | 10 ++++--
net/mptcp/protocol.c | 3 +-
net/tls/tls_device.c | 4 ++-
10 files changed, 100 insertions(+), 32 deletions(-)
diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
index 72d4ec413ab5..4383cb4cb2d2 100644
--- a/include/net/proto_memory.h
+++ b/include/net/proto_memory.h
@@ -31,13 +31,22 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
if (!sk->sk_prot->memory_pressure)
return false;
- if (mem_cgroup_sk_enabled(sk) &&
- mem_cgroup_sk_under_memory_pressure(sk))
- return true;
+ if (mem_cgroup_sk_enabled(sk)) {
+ if (mem_cgroup_sk_under_memory_pressure(sk))
+ return true;
+
+ if (mem_cgroup_sk_exclusive(sk))
+ return false;
+ }
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
}
+static inline bool sk_should_enter_memory_pressure(struct sock *sk)
+{
+ return !mem_cgroup_sk_enabled(sk) || !mem_cgroup_sk_exclusive(sk);
+}
+
static inline long
proto_memory_allocated(const struct proto *prot)
{
diff --git a/include/net/sock.h b/include/net/sock.h
index 63a6a48afb48..66501ab670eb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2607,6 +2607,11 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk);
}
+static inline bool mem_cgroup_sk_exclusive(const struct sock *sk)
+{
+ return false;
+}
+
static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
{
struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
@@ -2634,6 +2639,11 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
return false;
}
+static inline bool mem_cgroup_sk_exclusive(const struct sock *sk)
+{
+ return false;
+}
+
static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
{
return false;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2936b8175950..225f6bac06c3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -275,9 +275,13 @@ extern unsigned long tcp_memory_pressure;
/* optimized version of sk_under_memory_pressure() for TCP sockets */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
- if (mem_cgroup_sk_enabled(sk) &&
- mem_cgroup_sk_under_memory_pressure(sk))
- return true;
+ if (mem_cgroup_sk_enabled(sk)) {
+ if (mem_cgroup_sk_under_memory_pressure(sk))
+ return true;
+
+ if (mem_cgroup_sk_exclusive(sk))
+ return false;
+ }
return READ_ONCE(tcp_memory_pressure);
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 8002ac6293dc..814966309b0e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1046,17 +1046,21 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
if (!charged)
return -ENOMEM;
- /* pre-charge to forward_alloc */
- sk_memory_allocated_add(sk, pages);
- allocated = sk_memory_allocated(sk);
- /* If the system goes into memory pressure with this
- * precharge, give up and return error.
- */
- if (allocated > sk_prot_mem_limits(sk, 1)) {
- sk_memory_allocated_sub(sk, pages);
- mem_cgroup_sk_uncharge(sk, pages);
- return -ENOMEM;
+ if (!mem_cgroup_sk_exclusive(sk)) {
+ /* pre-charge to forward_alloc */
+ sk_memory_allocated_add(sk, pages);
+ allocated = sk_memory_allocated(sk);
+
+ /* If the system goes into memory pressure with this
+ * precharge, give up and return error.
+ */
+ if (allocated > sk_prot_mem_limits(sk, 1)) {
+ sk_memory_allocated_sub(sk, pages);
+ mem_cgroup_sk_uncharge(sk, pages);
+ return -ENOMEM;
+ }
}
+
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
WRITE_ONCE(sk->sk_reserved_mem,
@@ -3153,8 +3157,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
return true;
- sk_enter_memory_pressure(sk);
+ if (sk_should_enter_memory_pressure(sk))
+ sk_enter_memory_pressure(sk);
+
sk_stream_moderate_sndbuf(sk);
+
return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
@@ -3267,18 +3274,30 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
bool memcg_enabled = false, charged = false;
struct proto *prot = sk->sk_prot;
- long allocated;
-
- sk_memory_allocated_add(sk, amt);
- allocated = sk_memory_allocated(sk);
+ long allocated = 0;
if (mem_cgroup_sk_enabled(sk)) {
+ bool exclusive = mem_cgroup_sk_exclusive(sk);
+
memcg_enabled = true;
charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
- if (!charged)
+
+ if (exclusive && charged)
+ return 1;
+
+ if (!charged) {
+ if (!exclusive) {
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ }
+
goto suppress_allocation;
+ }
}
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+
/* Under limit. */
if (allocated <= sk_prot_mem_limits(sk, 0)) {
sk_leave_memory_pressure(sk);
@@ -3357,7 +3376,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
- sk_memory_allocated_sub(sk, amt);
+ if (allocated)
+ sk_memory_allocated_sub(sk, amt);
if (charged)
mem_cgroup_sk_uncharge(sk, amt);
@@ -3396,11 +3416,15 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk, amount);
-
- if (mem_cgroup_sk_enabled(sk))
+ if (mem_cgroup_sk_enabled(sk)) {
mem_cgroup_sk_uncharge(sk, amount);
+ if (mem_cgroup_sk_exclusive(sk))
+ return;
+ }
+
+ sk_memory_allocated_sub(sk, amount);
+
if (sk_under_global_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index d42757f74c6e..52d060bc9009 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -95,6 +95,7 @@
#include <net/checksum.h>
#include <net/ip.h>
#include <net/protocol.h>
+#include <net/proto_memory.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/ip_fib.h>
@@ -769,8 +770,17 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
*/
amt = sk_mem_pages(newsk->sk_forward_alloc +
atomic_read(&newsk->sk_rmem_alloc));
- if (amt)
+ if (amt) {
+ /* This amt is already charged globally to
+ * sk_prot->memory_allocated due to lack of
+ * sk_memcg until accept(), thus we need to
+		 * reclaim it here if newsk is memcg-exclusive.
+ */
+ if (mem_cgroup_sk_exclusive(newsk))
+ sk_memory_allocated_sub(newsk, amt);
+
mem_cgroup_sk_charge(newsk, amt, gfp);
+ }
}
kmem_cache_charge(newsk, gfp);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index ed10b959a906..f8dd53d40dcf 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -22,6 +22,7 @@
#include <net/tcp.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
+#include <net/proto_memory.h>
#if IS_ENABLED(CONFIG_IPV6)
/* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71a956fbfc55..dcbd49e2f8af 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -908,7 +908,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
}
__kfree_skb(skb);
} else {
- sk->sk_prot->enter_memory_pressure(sk);
+ if (sk_should_enter_memory_pressure(sk))
+ tcp_enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dfbac0876d96..4b6a7250a9c2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3574,12 +3574,18 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
delta = size - sk->sk_forward_alloc;
if (delta <= 0)
return;
+
amt = sk_mem_pages(delta);
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
- sk_memory_allocated_add(sk, amt);
- if (mem_cgroup_sk_enabled(sk))
+ if (mem_cgroup_sk_enabled(sk)) {
mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (mem_cgroup_sk_exclusive(sk))
+ return;
+ }
+
+ sk_memory_allocated_add(sk, amt);
}
/* Send a FIN. The caller locks the socket for us.
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9a287b75c1b3..f7487e22a3f8 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -16,6 +16,7 @@
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
+#include <net/proto_memory.h>
#include <net/tcp_states.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/transp_v6.h>
@@ -1016,7 +1017,7 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- if (first)
+ if (first && sk_should_enter_memory_pressure(ssk))
tcp_enter_memory_pressure(ssk);
sk_stream_moderate_sndbuf(ssk);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index f672a62a9a52..6696ef837116 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -35,6 +35,7 @@
#include <linux/netdevice.h>
#include <net/dst.h>
#include <net/inet_connection_sock.h>
+#include <net/proto_memory.h>
#include <net/tcp.h>
#include <net/tls.h>
#include <linux/skbuff_ref.h>
@@ -371,7 +372,8 @@ static int tls_do_allocation(struct sock *sk,
if (!offload_ctx->open_record) {
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
sk->sk_allocation))) {
- READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
+ if (sk_should_enter_memory_pressure(sk))
+ READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
return -ENOMEM;
}
--
2.51.0.384.g4c02a37b29-goog