linux-kernel - Re: [PATCH v2 1/4] seqlock: introduce scoped_seqlock_read() and scoped_seqlock_read_irqsave()

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251010080327.GF4067720@noisy.programming.kicks-ass.net>
Date: Fri, 10 Oct 2025 10:03:27 +0200
From: Peter Zijlstra <peterz@...radead.org>
To: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Oleg Nesterov <oleg@...hat.com>,
	Alexander Viro <viro@...iv.linux.org.uk>,
	Boqun Feng <boqun.feng@...il.com>,
	David Howells <dhowells@...hat.com>, Ingo Molnar <mingo@...hat.com>,
	Li RongQing <lirongqing@...du.com>,
	Waiman Long <longman@...hat.com>, Will Deacon <will@...nel.org>,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 1/4] seqlock: introduce scoped_seqlock_read() and
 scoped_seqlock_read_irqsave()

On Thu, Oct 09, 2025 at 03:55:15PM -0700, Linus Torvalds wrote:
> On Thu, 9 Oct 2025 at 15:12, Peter Zijlstra <peterz@...radead.org> wrote:
> >
> > Sure; otoh compiler should be able to tell the same using liveness
> > analysis I suppose, but perhaps they're not *that* clever.
> 
> They are that clever, but only if they end up unrolling the loop
> statically. If the loop remains a loop, the two variables end up live
> in the same code.
> 
> And while a compiler _could_ in theory still see that they aren't
> actually live in the same _iteration_, I don't think any compiler
> actually ends up being that clever in practice.
> 
> So making it a union then hopefully gets the compiler to basically use
> that explicit information.

Right, so I had to use -Os to not make it unroll the thing, but then
indeed, sharing the variable helps it.

> > So I thought they were fine; we handle all the enum cases with 'return'
> > so it's impossible to not exit the switch() but the silly compiler was
> > complaining about possible fall-through, so clearly it was getting
> > confused.
> 
> Yeah, I found the same thing with the 0/1/2 approach - the compiler
> wouldn't realize that the range was limited until I added a very
> explicit limit check that "shouldn't matter", but did.
> 
> This might obviously end up depending on compiler version and other
> random things, but in general the whole value range analysis tends to
> be a pretty fragile thing.
> 
> In practice, compilers tend to be good at doing value range analysis
> if they see particular patterns (like initializing it to some value,
> always incrementing it by one, and comparing against another value).
> 
> But when it's written more like a state machine like this, it's
> clearly very hit and miss.

I reordered the code, it is happier now.

Anyway, the below seems to generate decent code for
{-O2,-Os}x{gcc-14,clang-22}. Yay for optimizing compilers I suppose :-)

---
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5ce48eab7a2a..45fab026f7d6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1209,4 +1209,83 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
 	if (seq & 1)
 		read_sequnlock_excl_irqrestore(lock, flags);
 }
+
+enum ss_state {			/* mode of a scoped_seqlock_read() pass; also used as the retry target */
+	ss_done = 0,		/* ends the scoped for-loop (its condition is state != ss_done) */
+	ss_lock,		/* pass runs after spin_lock() on the seqlock's spinlock */
+	ss_lock_irqsave,	/* pass runs after spin_lock_irqsave() on the seqlock's spinlock */
+	ss_lockless,		/* pass is a lockless read, checked afterwards with read_seqretry() */
+};
+
+struct ss_tmp {			/* hidden loop variable of scoped_seqlock_read() */
+	enum ss_state	state;		/* mode of the pass currently executing */
+	unsigned long	data;		/* read_seqbegin() snapshot, or the flags word of spin_lock_irqsave() */
+	spinlock_t	*lock;		/* set by the ss_lock retry path; NULL otherwise */
+	spinlock_t	*lock_irqsave;	/* set by the ss_lock_irqsave retry path; NULL otherwise */
+};
+
+static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst)	/* __cleanup() handler: drop whichever spinlock a locked pass took */
+{
+	if (sst->lock)
+		spin_unlock(sst->lock);
+	if (sst->lock_irqsave)		/* irqsave mode keeps its lock here; ->lock is still NULL in that mode */
+		spin_unlock_irqrestore(sst->lock_irqsave, sst->data);
+}
+
+extern void __scoped_seqlock_invalid_target(void);	/* referenced when ss_done is passed as the target */
+extern void __scoped_seqlock_bug(void);			/* referenced when an already-done loop is stepped */
+
+static inline void	/* loop step: finish the pass, or set up a retry in @target mode */
+__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target)
+{
+	switch (sst->state) {
+	case ss_done:
+		__scoped_seqlock_bug();
+		return;
+
+	case ss_lock:
+	case ss_lock_irqsave:		/* a pass made under the lock is never retried */
+		sst->state = ss_done;
+		return;
+
+	case ss_lockless:
+		if (!read_seqretry(lock, sst->data)) {	/* no retry needed: the lockless pass stands */
+			sst->state = ss_done;
+			return;
+		}
+		break;			/* lockless pass failed; continue to the retry setup below */
+	}
+
+	switch (target) {
+	case ss_done:
+		__scoped_seqlock_invalid_target();
+		return;
+
+	case ss_lock:
+		sst->lock = &lock->lock;
+		spin_lock(sst->lock);
+		sst->state = ss_lock;
+		return;
+
+	case ss_lock_irqsave:
+		sst->lock_irqsave = &lock->lock;
+		spin_lock_irqsave(sst->lock_irqsave, sst->data);	/* must be ->lock_irqsave (->lock is NULL here); ->data now holds the IRQ flags */
+		sst->state = ss_lock_irqsave;
+		return;
+
+	case ss_lockless:
+		sst->data = read_seqbegin(lock);	/* state stays ss_lockless: retry without the lock */
+		return;
+	}
+}
+
+#define __scoped_seqlock_read(_seqlock, _target, _s)	/* first pass is lockless; a failed pass is retried in _target mode */ \
+	for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) =	\
+	     { .state = ss_lockless, .data = read_seqbegin(_seqlock) };	\
+	     _s.state != ss_done;					\
+	     __scoped_seqlock_next(&_s, _seqlock, _target))	/* note: _seqlock is expanded in both the init and the step */
+
+#define scoped_seqlock_read(_seqlock, _target)	/* wrapper: passes a __UNIQUE_ID() name for the hidden loop variable */ \
+	__scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock))
+
 #endif /* __LINUX_SEQLOCK_H */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7097de2c8cda..d2b3f987c888 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -313,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct signal_struct *sig = tsk->signal;
-	u64 utime, stime;
 	struct task_struct *t;
-	unsigned int seq, nextseq;
-	unsigned long flags;
+	u64 utime, stime;
 
 	/*
 	 * Update current task runtime to account pending time since last
@@ -329,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 	if (same_thread_group(current, tsk))
 		(void) task_sched_runtime(current);
 
-	rcu_read_lock();
-	/* Attempt a lockless read on the first round. */
-	nextseq = 0;
-	do {
-		seq = nextseq;
-		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+	guard(rcu)();
+	scoped_seqlock_read(&sig->stats_lock, ss_lock_irqsave) {
 		times->utime = sig->utime;
 		times->stime = sig->stime;
 		times->sum_exec_runtime = sig->sum_sched_runtime;
 
-		for_each_thread(tsk, t) {
+		__for_each_thread(sig, t) {
 			task_cputime(t, &utime, &stime);
 			times->utime += utime;
 			times->stime += stime;
 			times->sum_exec_runtime += read_sum_exec_runtime(t);
 		}
-		/* If lockless access failed, take the lock. */
-		nextseq = 1;
-	} while (need_seqretry(&sig->stats_lock, seq));
-	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
-	rcu_read_unlock();
+	}
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING