Message-Id: <20230624031333.96597-9-alexei.starovoitov@gmail.com>
Date: Fri, 23 Jun 2023 20:13:28 -0700
From: Alexei Starovoitov <alexei.starovoitov@...il.com>
To: daniel@...earbox.net,
andrii@...nel.org,
void@...ifault.com,
houtao@...weicloud.com,
paulmck@...nel.org
Cc: tj@...nel.org,
rcu@...r.kernel.org,
netdev@...r.kernel.org,
bpf@...r.kernel.org,
kernel-team@...com
Subject: [PATCH v2 bpf-next 08/13] bpf: Add a hint to allocated objects.
From: Alexei Starovoitov <ast@...nel.org>
To address an OOM issue when one cpu is allocating and another cpu is freeing, add
a target bpf_mem_cache hint to allocated objects, and when the local cpu free_llist
overflows, free into that bpf_mem_cache. The hint addresses the OOM while
maintaining the same performance for the common case when alloc/free are done on the
same cpu.
Signed-off-by: Alexei Starovoitov <ast@...nel.org>
---
kernel/bpf/memalloc.c | 46 ++++++++++++++++++++++++++-----------------
1 file changed, 28 insertions(+), 18 deletions(-)
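
Illustrative note (not part of the patch): the stand-alone user-space C sketch below
models the hint idea from the commit message. The fake_cache struct and the
fake_unit_alloc()/fake_unit_free() names are invented for illustration and are not
the kernel API; the only point is stashing the owning cache pointer in the object's
first word on alloc and reading it back on free, as the patch does with llnode.

#include <stdio.h>
#include <stdlib.h>

struct fake_cache {
	int cpu;
};

/* Alloc: stash a pointer to the owning cache in the object's first word. */
static void *fake_unit_alloc(struct fake_cache *c)
{
	void *obj = malloc(64);

	if (!obj)
		return NULL;
	*(struct fake_cache **)obj = c;	/* the hint */
	return obj;
}

/*
 * Free: read the hint back, so an overflowing local free list could be
 * drained into the cache that produced the object instead of the local one.
 */
static void fake_unit_free(struct fake_cache *c, void *obj)
{
	struct fake_cache *tgt = *(struct fake_cache **)obj;

	if (tgt != c)
		printf("cross-cpu free: allocated by cpu%d, freed on cpu%d\n",
		       tgt->cpu, c->cpu);
	free(obj);
}

int main(void)
{
	struct fake_cache c0 = { .cpu = 0 }, c1 = { .cpu = 1 };
	void *obj = fake_unit_alloc(&c0);

	/* Simulates alloc on one cpu and free on another. */
	fake_unit_free(&c1, obj);
	return 0;
}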
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index d68a854f45ee..692a9a30c1dc 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -99,6 +99,7 @@ struct bpf_mem_cache {
int low_watermark, high_watermark, batch;
int percpu_size;
bool draining;
+ struct bpf_mem_cache *tgt;
/* list of objects to be freed after RCU tasks trace GP */
struct llist_head free_by_rcu_ttrace;
@@ -190,18 +191,11 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
for (i = 0; i < cnt; i++) {
/*
- * free_by_rcu_ttrace is only manipulated by irq work refill_work().
- * IRQ works on the same CPU are called sequentially, so it is
- * safe to use __llist_del_first() here. If alloc_bulk() is
- * invoked by the initial prefill, there will be no running
- * refill_work(), so __llist_del_first() is fine as well.
- *
- * In most cases, objects on free_by_rcu_ttrace are from the same CPU.
- * If some objects come from other CPUs, it doesn't incur any
- * harm because NUMA_NO_NODE means the preference for current
- * numa node and it is not a guarantee.
+ * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
+ * done only by one CPU == current CPU. Other CPUs might
+ * llist_add() and llist_del_all() in parallel.
*/
- obj = __llist_del_first(&c->free_by_rcu_ttrace);
+ obj = llist_del_first(&c->free_by_rcu_ttrace);
if (!obj)
break;
add_obj_to_free_list(c, obj);
@@ -278,7 +272,7 @@ static void enque_to_free(struct bpf_mem_cache *c, void *obj)
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
* Nothing races to add to free_by_rcu_ttrace list.
*/
- if (__llist_add(llnode, &c->free_by_rcu_ttrace))
+ if (llist_add(llnode, &c->free_by_rcu_ttrace))
c->free_by_rcu_ttrace_tail = llnode;
}
@@ -290,7 +284,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
return;
WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
- llnode = __llist_del_all(&c->free_by_rcu_ttrace);
+ llnode = llist_del_all(&c->free_by_rcu_ttrace);
if (llnode)
/* There is no concurrent __llist_add(waiting_for_gp_ttrace) access.
* It doesn't race with llist_del_all either.
@@ -303,16 +297,22 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
* If RCU Tasks Trace grace period implies RCU grace period, free
* these elements directly, else use call_rcu() to wait for normal
* progs to finish and finally do free_one() on each element.
+ *
+ * call_rcu_tasks_trace() enqueues to a global queue, so it's ok
+ * that current cpu bpf_mem_cache != target bpf_mem_cache.
*/
call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}
static void free_bulk(struct bpf_mem_cache *c)
{
+ struct bpf_mem_cache *tgt = c->tgt;
struct llist_node *llnode, *t;
unsigned long flags;
int cnt;
+ WARN_ON_ONCE(tgt->unit_size != c->unit_size);
+
do {
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_save(flags);
@@ -326,13 +326,13 @@ static void free_bulk(struct bpf_mem_cache *c)
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_restore(flags);
if (llnode)
- enque_to_free(c, llnode);
+ enque_to_free(tgt, llnode);
} while (cnt > (c->high_watermark + c->low_watermark) / 2);
/* and drain free_llist_extra */
llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
- enque_to_free(c, llnode);
- do_call_rcu_ttrace(c);
+ enque_to_free(tgt, llnode);
+ do_call_rcu_ttrace(tgt);
}
static void bpf_mem_refill(struct irq_work *work)
@@ -431,6 +431,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
+ c->tgt = c;
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
@@ -453,6 +454,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
+ c->tgt = c;
prefill_mem_cache(c, cpu);
}
}
@@ -471,7 +473,7 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
* Except for waiting_for_gp_ttrace list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
- free_all(__llist_del_all(&c->free_by_rcu_ttrace), percpu);
+ free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
free_all(__llist_del_all(&c->free_llist), percpu);
free_all(__llist_del_all(&c->free_llist_extra), percpu);
@@ -605,8 +607,10 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
llnode = __llist_del_first(&c->free_llist);
- if (llnode)
+ if (llnode) {
cnt = --c->free_cnt;
+ *(struct bpf_mem_cache **)llnode = c;
+ }
}
local_dec(&c->active);
local_irq_restore(flags);
@@ -630,6 +634,12 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+ /*
+ * Remember bpf_mem_cache that allocated this object.
+ * The hint is not accurate.
+ */
+ c->tgt = *(struct bpf_mem_cache **)llnode;
+
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
__llist_add(llnode, &c->free_llist);
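
Illustrative note on the llist rule mentioned in the alloc_bulk() comment above
(llist_del_first() on a given list is done by only one CPU, while llist_add() and
llist_del_all() may run in parallel): the stand-alone C11 sketch below models why
that single-consumer restriction exists. The lnode/lhead names are invented
stand-ins for the kernel's llist_node/llist_head API, not the real implementation.

#include <stdatomic.h>
#include <stddef.h>

struct lnode {
	struct lnode *next;
};

struct lhead {
	_Atomic(struct lnode *) first;
};

/* Lock-free push; safe for any number of concurrent callers. */
static void lnode_add(struct lnode *n, struct lhead *h)
{
	struct lnode *old = atomic_load(&h->first);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&h->first, &old, n));
}

/* Atomically detach the whole list; also safe for concurrent callers. */
static struct lnode *lnode_del_all(struct lhead *h)
{
	return atomic_exchange(&h->first, NULL);
}

/*
 * Pop one node. Only safe with a single popping CPU/thread: two concurrent
 * poppers can both read the same 'first', one frees or reuses it, and the
 * other's CAS then installs a stale 'first->next' (the classic ABA problem).
 * That single-consumer rule is what the free_by_rcu_ttrace comment is
 * relying on when it keeps llist_del_first() confined to the current CPU.
 */
static struct lnode *lnode_del_first(struct lhead *h)
{
	struct lnode *first = atomic_load(&h->first);

	do {
		if (!first)
			return NULL;
	} while (!atomic_compare_exchange_weak(&h->first, &first, first->next));
	return first;
}

int main(void)
{
	static struct lnode nodes[3];
	static struct lhead h;

	for (int i = 0; i < 3; i++)
		lnode_add(&nodes[i], &h);
	while (lnode_del_first(&h))
		;
	lnode_del_all(&h);
	return 0;
}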
--
2.34.1