lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <oxe2ksih27igcstfgyyvbzirn5i55qlxkrsfdtitlx6ltpdp4s@vax2vt5433yh>
Date: Tue, 22 Jul 2025 08:23:27 -0700
From: Shakeel Butt <shakeel.butt@...ux.dev>
To: Bertrand Wlodarczyk <bertrand.wlodarczyk@...el.com>
Cc: tj@...nel.org, hannes@...xchg.org, mkoutny@...e.com, 
	edumazet@...gle.com, gthelen@...gle.com, yosry.ahmed@...ux.dev, 
	cgroups@...r.kernel.org, linux-kernel@...r.kernel.org, inwardvessel@...il.com
Subject: Re: [PATCH] cgroup/rstat: move css_rstat_lock outside cpu loop

On Tue, Jul 22, 2025 at 02:43:19PM +0200, Bertrand Wlodarczyk wrote:
> Moving the lock acquisition outside the CPU loop reduces how often
> the costly lock must be taken.
> This improves performance in scenarios where multiple CPUs
> concurrently attempt to acquire the lock for the same css.

Did you see the commit 0efc297a3c497 ("cgroup/rstat: avoid disabling
irqs for O(num_cpu)") for the reasoning on why we are doing this?

> 
> The cpu argument of __css_rstat_lock, previously consumed by the
> tracing events, is removed since it is no longer used.
> ---
> Results:
>  
> QEMU vm
> +-------+---------+
> | main  | patched |
> +-------+---------+
> | 9.17s | 2.36s   |
> +-------+---------+
> ext4 raw image with debian:
> qemu-system-x86_64 -enable-kvm -cpu host -smp 102 -m 16G -kernel linux-cgroup/arch/x86/boot/bzImage -drive file=rootfs.ext4,if=virtio,format=raw -append "rootwait root=/dev/vda console=tty1 console=ttyS0 nokaslr cgroup_enable=memory cgroup_memory=1" -net nic,model=virtio -net user -nographic
> 
> Benchmark code: https://gist.github.com/bwlodarcz/c955b36b5667f0167dffcff23953d1da
> musl-gcc -o benchmark -static -g3 -DNUM_THREADS=10 -DNUM_ITER=10000 -O2 -Wall benchmark.c -pthread
> ---
>  include/trace/events/cgroup.h | 22 ++++++++++------------
>  kernel/cgroup/rstat.c         | 20 +++++++++-----------
>  2 files changed, 19 insertions(+), 23 deletions(-)
> 
> diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
> index ba9229af9a34..eb674eef8b99 100644
> --- a/include/trace/events/cgroup.h
> +++ b/include/trace/events/cgroup.h
> @@ -206,15 +206,14 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
>  
>  DECLARE_EVENT_CLASS(cgroup_rstat,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended),
> +	TP_ARGS(cgrp, contended),
>  
>  	TP_STRUCT__entry(
>  		__field(	int,		root			)
>  		__field(	int,		level			)
>  		__field(	u64,		id			)
> -		__field(	int,		cpu			)
>  		__field(	bool,		contended		)
>  	),
>  
> @@ -222,13 +221,12 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>  		__entry->root = cgrp->root->hierarchy_id;
>  		__entry->id = cgroup_id(cgrp);
>  		__entry->level = cgrp->level;
> -		__entry->cpu = cpu;
>  		__entry->contended = contended;
>  	),
>  
> -	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
> +	TP_printk("root=%d id=%llu level=%d lock contended:%d",
>  		  __entry->root, __entry->id, __entry->level,
> -		  __entry->cpu, __entry->contended)
> +		  __entry->contended)
>  );
>  
>  /*
> @@ -238,23 +236,23 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
>   */
>  DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended)
> +	TP_ARGS(cgrp, contended)
>  );
>  
>  DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended)
> +	TP_ARGS(cgrp, contended)
>  );
>  
>  DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
>  
> -	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
> +	TP_PROTO(struct cgroup *cgrp, bool contended),
>  
> -	TP_ARGS(cgrp, cpu, contended)
> +	TP_ARGS(cgrp, contended)
>  );
>  
>  #endif /* _TRACE_CGROUP_H */
> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
> index c8a48cf83878..dd312fe1896d 100644
> --- a/kernel/cgroup/rstat.c
> +++ b/kernel/cgroup/rstat.c
> @@ -326,8 +326,7 @@ __bpf_hook_end();
>   * value -1 is used when obtaining the main lock else this is the CPU
>   * number processed last.
>   */
> -static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
> -		int cpu_in_loop)
> +static inline void __css_rstat_lock(struct cgroup_subsys_state *css)
>  	__acquires(ss_rstat_lock(css->ss))
>  {
>  	struct cgroup *cgrp = css->cgroup;
> @@ -337,21 +336,20 @@ static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
>  	lock = ss_rstat_lock(css->ss);
>  	contended = !spin_trylock_irq(lock);
>  	if (contended) {
> -		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
> +		trace_cgroup_rstat_lock_contended(cgrp, contended);
>  		spin_lock_irq(lock);
>  	}
> -	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
> +	trace_cgroup_rstat_locked(cgrp, contended);
>  }
>  
> -static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
> -				      int cpu_in_loop)
> +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css)
>  	__releases(ss_rstat_lock(css->ss))
>  {
>  	struct cgroup *cgrp = css->cgroup;
>  	spinlock_t *lock;
>  
>  	lock = ss_rstat_lock(css->ss);
> -	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
> +	trace_cgroup_rstat_unlock(cgrp, false);
>  	spin_unlock_irq(lock);
>  }
>  
> @@ -381,11 +379,11 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
>  		return;
>  
>  	might_sleep();
> +	__css_rstat_lock(css);
>  	for_each_possible_cpu(cpu) {
>  		struct cgroup_subsys_state *pos;
>  
>  		/* Reacquire for each CPU to avoid disabling IRQs too long */
> -		__css_rstat_lock(css, cpu);
>  		pos = css_rstat_updated_list(css, cpu);
>  		for (; pos; pos = pos->rstat_flush_next) {
>  			if (is_self) {
> @@ -395,10 +393,10 @@ __bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
>  			} else
>  				pos->ss->css_rstat_flush(pos, cpu);
>  		}
> -		__css_rstat_unlock(css, cpu);
>  		if (!cond_resched())

cond_resched() with the spin lock held?

>  			cpu_relax();
>  	}
> +	__css_rstat_unlock(css);
>  }
>  
>  int css_rstat_init(struct cgroup_subsys_state *css)
> @@ -685,11 +683,11 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
>  
>  	if (cgroup_parent(cgrp)) {
>  		css_rstat_flush(&cgrp->self);
> -		__css_rstat_lock(&cgrp->self, -1);
> +		__css_rstat_lock(&cgrp->self);
>  		bstat = cgrp->bstat;
>  		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
>  			       &bstat.cputime.utime, &bstat.cputime.stime);
> -		__css_rstat_unlock(&cgrp->self, -1);
> +		__css_rstat_unlock(&cgrp->self);
>  	} else {
>  		root_cgroup_cputime(&bstat);
>  	}
> -- 
> 2.49.0
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ