Message-ID: <oxe2ksih27igcstfgyyvbzirn5i55qlxkrsfdtitlx6ltpdp4s@vax2vt5433yh>
Date: Tue, 22 Jul 2025 08:23:27 -0700
From: Shakeel Butt <shakeel.butt@...ux.dev>
To: Bertrand Wlodarczyk <bertrand.wlodarczyk@...el.com>
Cc: tj@...nel.org, hannes@...xchg.org, mkoutny@...e.com,
edumazet@...gle.com, gthelen@...gle.com, yosry.ahmed@...ux.dev,
cgroups@...r.kernel.org, linux-kernel@...r.kernel.org, inwardvessel@...il.com
Subject: Re: [PATCH] cgroup/rstat: move css_rstat_lock outside cpu loop
On Tue, Jul 22, 2025 at 02:43:19PM +0200, Bertrand Wlodarczyk wrote:
> By moving the lock outside the CPU loop, we reduce the frequency
> of costly lock acquisition.
> This adjustment slightly improves performance in scenarios where
> multiple CPUs concurrently attempt to acquire the lock for
> the same css.
Did you see commit 0efc297a3c497 ("cgroup/rstat: avoid disabling
irqs for O(num_cpu)")? It explains why the lock is taken and released
per CPU.
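
For context, what that commit does is take and drop the irq-disabling
lock once per CPU, so IRQs are only disabled for a single CPU's worth
of flushing at a time. Simplified (not verbatim) from the current
css_rstat_flush():

	for_each_possible_cpu(cpu) {
		__css_rstat_lock(css, cpu);	/* spin_lock_irq() */
		/* flush the updated css list for this cpu */
		__css_rstat_unlock(css, cpu);	/* spin_unlock_irq() */
		if (!cond_resched())
			cpu_relax();
	}

Hoisting the lock out of the loop reintroduces the long irq-off section
that commit was specifically written to avoid.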
>
> The cpu argument to __css_rstat_lock(), previously used only by the
> trace events, has been removed since it no longer has any users.
> ---
> Results:
>
> QEMU vm
> +-------+---------+
> | main | patched |
> +-------+---------+
> | 9.17s | 2.36s |
> +-------+---------+
> ext4 raw image with debian:
> qemu-system-x86_64 -enable-kvm -cpu host -smp 102 -m 16G -kernel linux-cgroup/arch/x86/boot/bzImage -drive file=rootfs.ext4,if=virtio,format=raw -append "rootwait root=/dev/vda console=tty1 console=ttyS0 nokaslr cgroup_enable=memory cgroup_memory=1" -net nic,model=virtio -net user -nographic
>
> Benchmark code: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
> musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
> ---
> include/trace/events/cgroup.h | 22 ++++++++++------------
> kernel/cgroup/rstat.c | 20 +++++++++-----------
> 2 files changed, 19 insertions(+), 23 deletions(-)
>
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index ba9229af9a34..eb674eef8b99 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -206,15 +206,14 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>
> DECLARE_EVENT_CLASS(cgroup_rstat,
>
> - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> + TP_PROTO(struct cgroup *cgrp, bool contended),
>
> - TP_ARGS(cgrp, cpu, contended),
> + TP_ARGS(cgrp, contended),
>
> TP_STRUCT__entry(
> __field( int, root )
> __field( int, level )
> __field( u64, id )
> - __field( int, cpu )
> __field( bool, contended )
> ),
>
> @@ -222,13 +221,12 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
> __entry->root = cgrp->root->hierarchy_id;
> __entry->id = cgroup_id(cgrp);
> __entry->level = cgrp->level;
> - __entry->cpu = cpu;
> __entry->contended = contended;
> ),
>
> - TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
> + TP_printk("root=%d id=%llu level=%d lock contended:%d",
> __entry->root, __entry->id, __entry->level,
> - __entry->cpu, __entry->contended)
> + __entry->contended)
> );
>
> /*
> @@ -238,23 +236,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
> */
> DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>
> - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> + TP_PROTO(struct cgroup *cgrp, bool contended),
>
> - TP_ARGS(cgrp, cpu, contended)
> + TP_ARGS(cgrp, contended)
> );
>
> DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
>
> - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> + TP_PROTO(struct cgroup *cgrp, bool contended),
>
> - TP_ARGS(cgrp, cpu, contended)
> + TP_ARGS(cgrp, contended)
> );
>
> DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>
> - TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> + TP_PROTO(struct cgroup *cgrp, bool contended),
>
> - TP_ARGS(cgrp, cpu, contended)
> + TP_ARGS(cgrp, contended)
> );
>
> #endif /* _TRACE_CGROUP_H */
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index c8a48cf83878..dd312fe1896d 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -326,8 +326,7 @@ __bpf_hook_end();
> * value -1 is used when obtaining the main lock else this is the CPU
> * number processed last.
> */
> -static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
> - int cpu_in_loop)
> +static inline void __css_rstat_lock(struct cgroup_subsys_state *css)
> __acquires(ss_rstat_lock(css->ss))
> {
> struct cgroup *cgrp = css->cgroup;
> @@ -337,21 +336,20 @@ static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
> lock = ss_rstat_lock(css->ss);
> contended = !spin_trylock_irq(lock);
> if (contended) {
> - trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
> + trace_cgroup_rstat_lock_contended(cgrp, contended);
> spin_lock_irq(lock);
> }
> - trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
> + trace_cgroup_rstat_locked(cgrp, contended);
> }
>
> -static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
> - int cpu_in_loop)
> +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css)
> __releases(ss_rstat_lock(css->ss))
> {
> struct cgroup *cgrp = css->cgroup;
> spinlock_t *lock;
>
> lock = ss_rstat_lock(css->ss);
> - trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
> + trace_cgroup_rstat_unlock(cgrp, false);
> spin_unlock_irq(lock);
> }
>
> @@ -381,11 +379,11 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
> return;
>
> might_sleep();
> + __css_rstat_lock(css);
> for_each_possible_cpu(cpu) {
> struct cgroup_subsys_state *pos;
>
> /* Reacquire for each CPU to avoid disabling IRQs too long */
> - __css_rstat_lock(css, cpu);
> pos = css_rstat_updated_list(css, cpu);
> for (; pos; pos = pos->rstat_flush_next) {
> if (is_self) {
> @@ -395,10 +393,10 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
> } else
> pos->ss->css_rstat_flush(pos, cpu);
> }
> - __css_rstat_unlock(css, cpu);
> if (!cond_resched())
cond_resched() with the spin lock held?
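
With the lock hoisted out of the loop, the flush path becomes, roughly:

	__css_rstat_lock(css);		/* spin_lock_irq() */
	for_each_possible_cpu(cpu) {
		/* ... flush this cpu ... */
		if (!cond_resched())	/* may schedule while holding a
					 * spinlock with IRQs disabled */
			cpu_relax();
	}
	__css_rstat_unlock(css);	/* spin_unlock_irq() */

cond_resched() can schedule, which is not allowed in that atomic context.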
> cpu_relax();
> }
> + __css_rstat_unlock(css);
> }
>
> int css_rstat_init(struct cgroup_subsys_state *css)
> @@ -685,11 +683,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
>
> if (cgroup_parent(cgrp)) {
> css_rstat_flush(&cgrp->self);
> - __css_rstat_lock(&cgrp->self, -1);
> + __css_rstat_lock(&cgrp->self);
> bstat = cgrp->bstat;
> cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
> &bstat.cputime.utime, &bstat.cputime.stime);
> - __css_rstat_unlock(&cgrp->self, -1);
> + __css_rstat_unlock(&cgrp->self);
> } else {
> root_cgroup_cputime(&bstat);
> }
> --
> 2.49.0
>