Message-ID: <e114418f-a71c-4538-af5e-4a39bc42c04c@huaweicloud.com>
Date: Mon, 24 Nov 2025 11:52:51 +0800
From: Chen Ridong <chenridong@...weicloud.com>
To: akpm@...ux-foundation.org, david@...nel.org, lorenzo.stoakes@...cle.com,
Liam.Howlett@...cle.com, vbabka@...e.cz, rppt@...nel.org, surenb@...gle.com,
mhocko@...e.com, axelrasmussen@...gle.com, yuanchu@...gle.com,
weixugc@...gle.com, hannes@...xchg.org, zhengqi.arch@...edance.com,
shakeel.butt@...ux.dev
Cc: linux-mm@...ck.org, linux-kernel@...r.kernel.org, lujialin4@...wei.com,
chenridong@...wei.com
Subject: Re: [RFC -next] memcg: Optimize creation performance when LRU_GEN is
enabled
On 2025/11/19 16:37, Chen Ridong wrote:
> From: Chen Ridong <chenridong@...wei.com>
>
> With LRU_GEN=y and LRU_GEN_ENABLED=n, a performance regression occurs
> when creating a large number of memory cgroups (memcgs):
>
> # time mkdir testcg_{1..10000}
>
> real 0m7.167s
> user 0m0.037s
> sys 0m6.773s
>
> # time mkdir testcg_{1..20000}
>
> real 0m27.158s
> user 0m0.079s
> sys 0m26.270s
>
> In contrast, with LRU_GEN=n, creation of the same number of memcgs
> performs better:
>
> # time mkdir testcg_{1..10000}
>
> real 0m3.386s
> user 0m0.044s
> sys 0m3.009s
>
> # time mkdir testcg_{1..20000}
>
> real 0m6.876s
> user 0m0.075s
> sys 0m6.121s
>
> The root cause is that onlining a memcg uses hlist_nulls_add_tail_rcu(),
> which traverses the entire list to find the tail. Each creation therefore
> scans every memcg already on the list, so creating N memcgs costs O(N^2)
> in total; consistent with that, doubling the count from 10000 to 20000
> roughly quadruples the time above. The traversal happens even when
> lru_gen is disabled at runtime.
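>
> For reference, hlist_nulls_add_tail_rcu() has no tail pointer to work
> with, so it must walk from the head on every insert; roughly
> (paraphrased from <linux/rculist_nulls.h>, details trimmed):
>
> 	static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
> 						    struct hlist_nulls_head *h)
> 	{
> 		struct hlist_nulls_node *i, *last = NULL;
>
> 		/* O(n): visits every node already on the list */
> 		for (i = h->first; !is_a_nulls(i); i = i->next)
> 			last = i;
>
> 		if (last) {
> 			n->next = last->next;
> 			n->pprev = &last->next;
> 			rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
> 		} else {
> 			hlist_nulls_add_head_rcu(n, h);
> 		}
> 	}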
>
> Fix this by caching a tail pointer per generation and bin in
> struct lru_gen_memcg. Appending a new node now uses the cached tail
> directly, eliminating the full-list traversal.
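>
> To show the bookkeeping in isolation, below is a minimal userspace
> sketch of the same technique (illustrative names only; a plain singly
> linked list with no RCU or nulls markers, and the search loop in the
> delete path is just for brevity, since the kernel list unlinks in O(1)
> via pprev). The cached tail makes appends O(1), and deletion repairs
> the cache the way memcg_lru_del_locked() below does:
>
> 	#include <stdio.h>
>
> 	struct node {
> 		int id;
> 		struct node *next;
> 	};
>
> 	struct list {
> 		struct node *head;
> 		struct node *tail;	/* cached tail: NULL iff list is empty */
> 	};
>
> 	/* O(1) append: no traversal, the pattern the patch adopts */
> 	static void add_tail_cached(struct list *l, struct node *n)
> 	{
> 		n->next = NULL;
> 		if (l->tail)
> 			l->tail->next = n;
> 		else
> 			l->head = n;
> 		l->tail = n;
> 	}
>
> 	/* deleting the tail node must repair the cached pointer */
> 	static void del_cached(struct list *l, struct node *n)
> 	{
> 		struct node **pp = &l->head, *prev = NULL;
>
> 		while (*pp != n) {
> 			prev = *pp;
> 			pp = &(*pp)->next;
> 		}
> 		*pp = n->next;
> 		if (l->tail == n)
> 			l->tail = prev;	/* NULL again if n was the only node */
> 	}
>
> 	int main(void)
> 	{
> 		struct list l = { NULL, NULL };
> 		struct node n[3];
>
> 		for (int i = 0; i < 3; i++) {
> 			n[i].id = i;
> 			add_tail_cached(&l, &n[i]);
> 		}
> 		del_cached(&l, &n[2]);		/* tail removal repairs l.tail */
> 		add_tail_cached(&l, &n[2]);	/* still O(1) */
>
> 		for (struct node *p = l.head; p; p = p->next)
> 			printf("%d\n", p->id);	/* prints 0 1 2 */
> 		return 0;
> 	}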
>
> After this patch, memcg creation performance with LRU_GEN=y (and
> LRU_GEN_ENABLED=n) matches the LRU_GEN=n baseline:
>
> # time mkdir testcg_{1..10000}
>
> real 0m3.368s
> user 0m0.025s
> sys 0m3.012s
>
> # time mkdir testcg_{1..20000}
>
> real 0m6.742s
> user 0m0.085s
> sys 0m5.995s
>
> Signed-off-by: Chen Ridong <chenridong@...wei.com>
> ---
> include/linux/mmzone.h | 4 +++
> mm/vmscan.c | 78 ++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 75 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 4398e027f450..bdee57b35126 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -513,6 +513,8 @@ struct lru_gen_folio {
> u8 gen;
> /* the list segment this lru_gen_folio belongs to */
> u8 seg;
> + /* the bin index this lru_gen_folio is queued on */
> + u8 bin;
> /* per-node lru_gen_folio list for global reclaim */
> struct hlist_nulls_node list;
> };
> @@ -610,6 +612,8 @@ struct lru_gen_memcg {
> unsigned long nr_memcgs[MEMCG_NR_GENS];
> /* per-node lru_gen_folio list for global reclaim */
> struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
> + /* cached tails to speed up enqueueing */
> + struct hlist_nulls_node *tails[MEMCG_NR_GENS][MEMCG_NR_BINS];
> /* protects the above */
> spinlock_t lock;
> };
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 8890f4b58673..6c2665e48f19 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -4299,6 +4299,66 @@ enum {
> MEMCG_LRU_YOUNG,
> };
>
> +static void memcg_lru_add_head_locked(struct pglist_data *pgdat,
> + struct lruvec *lruvec, int gen, int bin)
> +{
> + struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
> + struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
> + struct hlist_nulls_node *node = &lruvec->lrugen.list;
> + bool empty = !memcg_lru->tails[gen][bin];
> +
> + hlist_nulls_add_head_rcu(node, head);
> + lruvec->lrugen.bin = bin;
> +
> + if (empty)
> + memcg_lru->tails[gen][bin] = node;
> +}
> +
> +static void memcg_lru_add_tail_locked(struct pglist_data *pgdat,
> + struct lruvec *lruvec, int gen, int bin)
> +{
> + struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
> + struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
> + struct hlist_nulls_node *node = &lruvec->lrugen.list;
> + struct hlist_nulls_node *tail = memcg_lru->tails[gen][bin];
> +
> + if (tail) {
> + WRITE_ONCE(node->next, tail->next);
> + WRITE_ONCE(node->pprev, &tail->next);
> + rcu_assign_pointer(hlist_nulls_next_rcu(tail), node);
> + } else {
> + hlist_nulls_add_head_rcu(node, head);
> + }
> +
> + memcg_lru->tails[gen][bin] = node;
> + lruvec->lrugen.bin = bin;
> +}
> +
> +static void memcg_lru_del_locked(struct pglist_data *pgdat, struct lruvec *lruvec,
> + bool reinit)
> +{
> + int gen = lruvec->lrugen.gen;
> + int bin = lruvec->lrugen.bin;
> + struct lru_gen_memcg *memcg_lru = &pgdat->memcg_lru;
> + struct hlist_nulls_head *head = &memcg_lru->fifo[gen][bin];
> + struct hlist_nulls_node *node = &lruvec->lrugen.list;
> + struct hlist_nulls_node *prev = NULL;
> +
> + if (hlist_nulls_unhashed(node))
> + return;
> +
> + if (memcg_lru->tails[gen][bin] == node) {
> + if (node->pprev != &head->first)
> + prev = container_of(node->pprev, struct hlist_nulls_node, next);
> + memcg_lru->tails[gen][bin] = prev;
> + }
> +
> + if (reinit)
> + hlist_nulls_del_init_rcu(node);
> + else
> + hlist_nulls_del_rcu(node);
> +}
> +
> static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
> {
> int seg;
> @@ -4326,15 +4386,15 @@ static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
> else
> VM_WARN_ON_ONCE(true);
>
> + memcg_lru_del_locked(pgdat, lruvec, false);
> +
> WRITE_ONCE(lruvec->lrugen.seg, seg);
> WRITE_ONCE(lruvec->lrugen.gen, new);
>
> - hlist_nulls_del_rcu(&lruvec->lrugen.list);
> -
> if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
> - hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
> + memcg_lru_add_head_locked(pgdat, lruvec, new, bin);
> else
> - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
> + memcg_lru_add_tail_locked(pgdat, lruvec, new, bin);
>
> pgdat->memcg_lru.nr_memcgs[old]--;
> pgdat->memcg_lru.nr_memcgs[new]++;
> @@ -4365,7 +4425,7 @@ void lru_gen_online_memcg(struct mem_cgroup *memcg)
>
> lruvec->lrugen.gen = gen;
>
> - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
> + memcg_lru_add_tail_locked(pgdat, lruvec, gen, bin);
> pgdat->memcg_lru.nr_memcgs[gen]++;
>
> spin_unlock_irq(&pgdat->memcg_lru.lock);
> @@ -4399,7 +4459,7 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
>
> gen = lruvec->lrugen.gen;
>
> - hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
> + memcg_lru_del_locked(pgdat, lruvec, true);
> pgdat->memcg_lru.nr_memcgs[gen]--;
>
> if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
> @@ -5664,8 +5724,10 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat)
> spin_lock_init(&pgdat->memcg_lru.lock);
>
> for (i = 0; i < MEMCG_NR_GENS; i++) {
> - for (j = 0; j < MEMCG_NR_BINS; j++)
> + for (j = 0; j < MEMCG_NR_BINS; j++) {
> INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
> + pgdat->memcg_lru.tails[i][j] = NULL;
> + }
> }
> }
>
> @@ -5687,6 +5749,8 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
>
> if (mm_state)
> mm_state->seq = MIN_NR_GENS;
> +
> + lrugen->bin = 0;
> }
>
> #ifdef CONFIG_MEMCG
Hello all,

Is anyone interested in this issue?
Any better ideas or suggestions would be welcome.
--
Best regards,
Ridong