[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220921170002.29557-11-laoar.shao@gmail.com>
Date: Wed, 21 Sep 2022 17:00:02 +0000
From: Yafang Shao <laoar.shao@...il.com>
To: ast@...nel.org, daniel@...earbox.net, andrii@...nel.org,
kafai@...com, songliubraving@...com, yhs@...com,
john.fastabend@...il.com, kpsingh@...nel.org, sdf@...gle.com,
haoluo@...gle.com, jolsa@...nel.org, hannes@...xchg.org,
mhocko@...nel.org, roman.gushchin@...ux.dev, shakeelb@...gle.com,
songmuchun@...edance.com, akpm@...ux-foundation.org, tj@...nel.org,
lizefan.x@...edance.com
Cc: cgroups@...r.kernel.org, netdev@...r.kernel.org,
bpf@...r.kernel.org, linux-mm@...ck.org,
Yafang Shao <laoar.shao@...il.com>
Subject: [RFC PATCH bpf-next 10/10] bpf, memcg: Add new item bpf into memory.stat
A new item 'bpf' is introduced into memory.stat, so that we can get the memory
consumed by bpf. Currently only the memory of bpf-map is accounted.
The accounting of this new item is implemented with scope-based accounting,
which is similar to set_active_memcg(). In this scope, the memory allocated
will be accounted or unaccounted to a specific item, which is specified by
set_active_memcg_item().
The result in cgroup v1 is as follows:
$ cat /sys/fs/cgroup/memory/foo/memory.stat | grep bpf
bpf 109056000
total_bpf 109056000
After the map is removed, the counter will become zero again.
$ cat /sys/fs/cgroup/memory/foo/memory.stat | grep bpf
bpf 0
total_bpf 0
The 'bpf' item may not be 0 after the bpf-map is destroyed, because there may be
cached objects.
Note that there's no kmemcg in the root memory cgroup, so the item 'bpf' will
always be 0 in the root memory cgroup. If a bpf-map is charged into root memcg
directly, its memory size will not be accounted, so the 'total_bpf' can't
be used to monitor system-wide bpf memory consumption yet.
Signed-off-by: Yafang Shao <laoar.shao@...il.com>
---
include/linux/bpf.h | 10 ++++++++--
include/linux/memcontrol.h | 1 +
include/linux/sched.h | 1 +
include/linux/sched/mm.h | 24 ++++++++++++++++++++++++
kernel/bpf/memalloc.c | 10 ++++++++++
kernel/bpf/ringbuf.c | 4 ++++
kernel/bpf/syscall.c | 40 ++++++++++++++++++++++++++++++++++++++--
kernel/fork.c | 1 +
mm/memcontrol.c | 20 ++++++++++++++++++++
9 files changed, 107 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f7a4cfc..9eda143 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1725,7 +1725,13 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
void bpf_map_kvfree(const void *ptr);
void bpf_map_free_percpu(void __percpu *ptr);
-#define bpf_map_kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)
+#define bpf_map_kfree_rcu(ptr, rhf...) { \
+ int old_item; \
+ \
+ old_item = set_active_memcg_item(MEMCG_BPF); \
+ kvfree_rcu(ptr, ## rhf); \
+ set_active_memcg_item(old_item); \
+}
#else
static inline void *
@@ -1771,7 +1777,7 @@ static inline void bpf_map_free_percpu(void __percpu *ptr)
#define bpf_map_kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf)
-#endif
+#endif /* CONFIG_MEMCG_KMEM */
extern int sysctl_unprivileged_bpf_disabled;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d4a0ad3..f345467 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -37,6 +37,7 @@ enum memcg_stat_item {
MEMCG_KMEM,
MEMCG_ZSWAP_B,
MEMCG_ZSWAPPED,
+ MEMCG_BPF,
MEMCG_NR_STAT,
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a..79362da 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1423,6 +1423,7 @@ struct task_struct {
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
+ int active_item;
#endif
#ifdef CONFIG_BLK_CGROUP
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a24361..3a334c7 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -363,6 +363,7 @@ static inline void memalloc_pin_restore(unsigned int flags)
#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+DECLARE_PER_CPU(int, int_active_item);
/**
* set_active_memcg - Starts the remote memcg charging scope.
* @memcg: memcg to charge.
@@ -389,12 +390,35 @@ static inline void memalloc_pin_restore(unsigned int flags)
return old;
}
+
+static inline int
+set_active_memcg_item(int item)
+{
+ int old_item;
+
+ if (!in_task()) {
+ old_item = this_cpu_read(int_active_item);
+ this_cpu_write(int_active_item, item);
+ } else {
+ old_item = current->active_item;
+ current->active_item = item;
+ }
+
+ return old_item;
+}
+
#else
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
return NULL;
}
+
+static inline int
+set_active_memcg_item(int item)
+{
+ return MEMCG_NR_STAT;
+}
#endif
#ifdef CONFIG_MEMBARRIER
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5f83be1..51d59d4 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -165,11 +165,14 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
{
struct mem_cgroup *memcg = NULL, *old_memcg;
unsigned long flags;
+ int old_item;
void *obj;
int i;
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
+ old_item = set_active_memcg_item(MEMCG_BPF);
+
for (i = 0; i < cnt; i++) {
obj = __alloc(c, node);
if (!obj)
@@ -194,19 +197,26 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_restore(flags);
}
+
+ set_active_memcg_item(old_item);
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
static void free_one(struct bpf_mem_cache *c, void *obj)
{
+ int old_item;
+
+ old_item = set_active_memcg_item(MEMCG_BPF);
if (c->percpu_size) {
free_percpu(((void **)obj)[1]);
kfree(obj);
+ set_active_memcg_item(old_item);
return;
}
kfree(obj);
+ set_active_memcg_item(old_item);
}
static void __free_rcu(struct rcu_head *head)
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 535e440..72435bd 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -61,7 +61,11 @@ struct bpf_ringbuf_hdr {
static inline void bpf_map_free_page(struct page *page)
{
+ int old_item;
+
+ old_item = set_active_memcg_item(MEMCG_BPF);
__free_page(page);
+ set_active_memcg_item(old_item);
}
static void bpf_ringbuf_pages_free(struct page **pages, int nr_pages)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b9250c8..703aa6a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -340,11 +340,14 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT;
unsigned int flags = 0;
unsigned long align = 1;
+ int old_item;
void *area;
+ void *ptr;
if (size >= SIZE_MAX)
return NULL;
+ old_item = set_active_memcg_item(MEMCG_BPF);
/* kmalloc()'ed memory can't be mmap()'ed */
if (mmapable) {
BUG_ON(!PAGE_ALIGNED(size));
@@ -353,13 +356,18 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)
} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY,
numa_node);
- if (area != NULL)
+ if (area != NULL) {
+ set_active_memcg_item(old_item);
return area;
+ }
}
- return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ ptr = __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
flags, numa_node, __builtin_return_address(0));
+
+ set_active_memcg_item(old_item);
+ return ptr;
}
void *bpf_map_area_alloc(u64 size, int numa_node, struct bpf_map *map)
@@ -386,9 +394,13 @@ void *bpf_map_area_mmapable_alloc(u64 size, int numa_node)
void bpf_map_area_free(void *area, struct bpf_map *map)
{
+ int old_item;
+
if (map)
bpf_map_release_memcg(map);
+ old_item = set_active_memcg_item(MEMCG_BPF);
kvfree(area);
+ set_active_memcg_item(old_item);
}
static u32 bpf_map_flags_retain_permanent(u32 flags)
@@ -464,11 +476,14 @@ void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
struct mem_cgroup *memcg, *old_memcg;
+ int old_item;
void *ptr;
memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
+ old_item = set_active_memcg_item(MEMCG_BPF);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
+ set_active_memcg_item(old_item);
set_active_memcg(old_memcg);
bpf_map_put_memcg(memcg);
@@ -479,10 +494,13 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
+ int old_item;
memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
+ old_item = set_active_memcg_item(MEMCG_BPF);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
+ set_active_memcg_item(old_item);
set_active_memcg(old_memcg);
bpf_map_put_memcg(memcg);
@@ -494,11 +512,14 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
{
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
+ int old_item;
memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
+ old_item = set_active_memcg_item(MEMCG_BPF);
ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ set_active_memcg_item(old_item);
bpf_map_put_memcg(memcg);
return ptr;
@@ -509,10 +530,13 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
{
struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
+ int old_item;
memcg = bpf_map_get_memcg(map);
old_memcg = set_active_memcg(memcg);
+ old_item = set_active_memcg_item(MEMCG_BPF);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
+ set_active_memcg_item(old_item);
set_active_memcg(old_memcg);
bpf_map_put_memcg(memcg);
@@ -521,17 +545,29 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
void bpf_map_kfree(const void *ptr)
{
+ int old_item;
+
+ old_item = set_active_memcg_item(MEMCG_BPF);
kfree(ptr);
+ set_active_memcg_item(old_item);
}
void bpf_map_kvfree(const void *ptr)
{
+ int old_item;
+
+ old_item = set_active_memcg_item(MEMCG_BPF);
kvfree(ptr);
+ set_active_memcg_item(old_item);
}
void bpf_map_free_percpu(void __percpu *ptr)
{
+ int old_item;
+
+ old_item = set_active_memcg_item(MEMCG_BPF);
free_percpu(ptr);
+ set_active_memcg_item(old_item);
}
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 90c85b1..dac2429 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1043,6 +1043,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
+ tsk->active_item = 0;
#endif
#ifdef CONFIG_CPU_SUP_INTEL
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c..9008417 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -82,6 +82,10 @@
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
+/* Active memcg stat item to use from an interrupt context */
+DEFINE_PER_CPU(int, int_active_item);
+EXPORT_PER_CPU_SYMBOL_GPL(int_active_item);
+
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket __ro_after_init;
@@ -923,6 +927,14 @@ static __always_inline struct mem_cgroup *active_memcg(void)
return current->active_memcg;
}
+static __always_inline int active_memcg_item(void)
+{
+ if (!in_task())
+ return this_cpu_read(int_active_item);
+
+ return current->active_item;
+}
+
/**
* get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
* @mm: mm from which memcg should be extracted. It can be NULL.
@@ -1436,6 +1448,7 @@ struct memory_stat {
{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
{ "workingset_restore_file", WORKINGSET_RESTORE_FILE },
{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
+ { "bpf", MEMCG_BPF },
};
/* Translate stat items to the correct unit for memory.stat output */
@@ -2993,6 +3006,11 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
+ int item = active_memcg_item();
+
+ WARN_ON_ONCE(item != 0 && (item < MEMCG_SWAP || item >= MEMCG_NR_STAT));
+ if (item)
+ mod_memcg_state(memcg, item, nr_pages);
mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
if (nr_pages > 0)
@@ -3976,6 +3994,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
NR_FILE_DIRTY,
NR_WRITEBACK,
MEMCG_SWAP,
+ MEMCG_BPF,
};
static const char *const memcg1_stat_names[] = {
@@ -3989,6 +4008,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
"dirty",
"writeback",
"swap",
+ "bpf",
};
/* Universal VM events cgroup1 shows, original sort order */
--
1.8.3.1
Powered by blists - more mailing lists