Message-ID: <20251127092226.1439196-12-ardb+git@google.com>
Date: Thu, 27 Nov 2025 10:22:31 +0100
From: Ard Biesheuvel <ardb+git@...gle.com>
To: linux-hardening@...r.kernel.org
Cc: linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org, 
	Ard Biesheuvel <ardb@...nel.org>, Kees Cook <kees@...nel.org>, Ryan Roberts <ryan.roberts@....com>, 
	Will Deacon <will@...nel.org>, Arnd Bergmann <arnd@...db.de>, Jeremy Linton <jeremy.linton@....com>, 
	Catalin Marinas <Catalin.Marinas@....com>, Mark Rutland <mark.rutland@....com>, 
	"Jason A. Donenfeld" <Jason@...c4.com>
Subject: [RFC/RFT PATCH 4/6] random: Use a lockless fast path for get_random_uXX()

From: Ard Biesheuvel <ardb@...nel.org>

Currently, the implementations of the get_random_uXX() API protect their
critical section by taking a local lock and disabling interrupts, to
ensure that the code does not race with itself when called from
interrupt context.

Given that the fast path does nothing more than read a single uXX
quantity from a linear buffer and bump the position counter, poking the
hardware registers to disable and re-enable interrupts is
disproportionately costly, and best avoided.

There are two conditions under which the batched entropy buffer is
replenished, and it is this refill that forms the critical section:
- the buffer is exhausted;
- the base_crng generation counter has been incremented.

By combining the position and generation counters into a single u64, we
can use compare and exchange to implement the fast path without taking
the local lock or disabling interrupts. By constructing the expected and
next values carefully, the compare and exchange will only succeed if:
- we did not race with ourselves, i.e., the compare and exchange
  increments the position counter by exactly 1;
- the buffer is not exhausted;
- the generation counter equals the base_crng generation counter.
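
To make the above concrete, here is a minimal userspace sketch of the
combined position/generation word and the lockless fast path. It is an
illustration only, not the kernel code: it uses C11 atomics instead of
cmpxchg64_local() and the local lock, the names (struct batch, refill(),
get_random_u8_sketch(), base_generation) are made up, and it checks the
exhausted-buffer and stale-generation cases explicitly before the
compare and exchange rather than folding them into the expected value
the way the patch below does.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BATCH_BYTES 32

struct batch {
	uint8_t entropy[BATCH_BYTES];
	/* low 32 bits: position, high 32 bits: generation */
	_Atomic uint64_t posgen;
};

static uint32_t base_generation;	/* stand-in for base_crng.generation */

static void refill(struct batch *b, uint32_t gen)
{
	/* Pretend to generate fresh entropy; position resets to 0. */
	memset(b->entropy, 0xaa, sizeof(b->entropy));
	atomic_store(&b->posgen, (uint64_t)gen << 32);
}

static uint8_t get_random_u8_sketch(struct batch *b)
{
	for (;;) {
		uint64_t cur = atomic_load(&b->posgen);
		uint32_t pos = (uint32_t)cur;
		uint32_t gen = (uint32_t)(cur >> 32);

		if (pos >= BATCH_BYTES || gen != base_generation) {
			/* Slow path: the kernel takes the local lock here. */
			refill(b, base_generation);
			continue;
		}

		uint8_t ret = b->entropy[pos];
		/* Succeeds only if neither position nor generation changed. */
		if (atomic_compare_exchange_strong(&b->posgen, &cur, cur + 1))
			return ret;
	}
}

int main(void)
{
	struct batch b = { .posgen = 1ULL << 32 };	/* stale generation */

	base_generation = 2;	/* forces one refill on the first call */
	printf("%u %u\n", get_random_u8_sketch(&b), get_random_u8_sketch(&b));
	return 0;
}

In the patch itself, the union with the LOHI() helper keeps the position
in the low 32 bits and the generation in the high 32 bits of posgen
regardless of endianness, so that a single cmpxchg64_local() can perform
the equivalent check and increment in one step.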

Only if the compare and exchange fails is the original slow path taken,
and only in that case do we take the local lock. This results in a
considerable speedup (3-5x) when benchmarking get_random_u8() in a tight
loop.
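
For reference, the 3-5x figure refers to this kind of tight loop. The
sketch below is illustrative only and not part of this series: an
out-of-tree module with made-up names (rnd_bench_*), an arbitrary
iteration count, and no statistical treatment, just timing a run of
back-to-back get_random_u8() calls.

#include <linux/module.h>
#include <linux/random.h>
#include <linux/ktime.h>

static int __init rnd_bench_init(void)
{
	u64 t0, t1;
	u8 acc = 0;
	int i;

	t0 = ktime_get_ns();
	/* Accumulate into acc so the results are visibly used. */
	for (i = 0; i < 1000000; i++)
		acc ^= get_random_u8();
	t1 = ktime_get_ns();

	pr_info("rnd_bench: 1M x get_random_u8() took %llu ns (acc=%u)\n",
		(unsigned long long)(t1 - t0), (unsigned int)acc);
	return 0;
}

static void __exit rnd_bench_exit(void)
{
}

module_init(rnd_bench_init);
module_exit(rnd_bench_exit);
MODULE_DESCRIPTION("Illustrative get_random_u8() microbenchmark");
MODULE_LICENSE("GPL");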

Signed-off-by: Ard Biesheuvel <ardb@...nel.org>
---
 drivers/char/random.c | 44 ++++++++++++++------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 0e04bc60d034..71bd74871540 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -496,6 +496,12 @@ static ssize_t get_random_bytes_user(struct iov_iter *iter)
  * should be called and return 0 at least once at any point prior.
  */
 
+#ifdef __LITTLE_ENDIAN
+#define LOHI(lo, hi)	lo, hi
+#else
+#define LOHI(lo, hi)	hi, lo
+#endif
+
 #define DEFINE_BATCHED_ENTROPY(type)						\
 struct batch_ ##type {								\
 	/*									\
@@ -507,8 +513,12 @@ struct batch_ ##type {								\
 	 */									\
 	type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))];		\
 	local_lock_t lock;							\
-	unsigned int generation;						\
-	unsigned int position;							\
+	union {									\
+		struct {							\
+			unsigned int LOHI(position, generation);		\
+		};								\
+		u64 posgen;							\
+	};									\
 };										\
 										\
 static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = {	\
@@ -522,6 +532,7 @@ type get_random_ ##type(void)							\
 	unsigned long flags;							\
 	struct batch_ ##type *batch;						\
 	unsigned int next_gen;							\
+	u64 next;								\
 										\
 	warn_unseeded_randomness();						\
 										\
@@ -530,21 +541,28 @@ type get_random_ ##type(void)							\
 		return ret;							\
 	}									\
 										\
-	local_lock_irqsave(&batched_entropy_ ##type.lock, flags);		\
-	batch = raw_cpu_ptr(&batched_entropy_##type);				\
+	batch = &get_cpu_var(batched_entropy_##type);				\
 										\
 	next_gen = (unsigned int)READ_ONCE(base_crng.generation);		\
-	if (batch->position >= ARRAY_SIZE(batch->entropy) ||			\
-	    next_gen != batch->generation) {					\
-		_get_random_bytes(batch->entropy, sizeof(batch->entropy));	\
-		batch->position = 0;						\
-		batch->generation = next_gen;					\
+	next = (u64)next_gen << 32;						\
+	if (likely(batch->position < ARRAY_SIZE(batch->entropy))) {		\
+		next |=	batch->position + 1; /* next-1 is bogus otherwise */	\
+		ret = batch->entropy[batch->position];				\
+	}									\
+	if (cmpxchg64_local(&batch->posgen, next - 1, next) != next - 1) {	\
+		local_lock_irqsave(&batched_entropy_ ##type.lock, flags);	\
+		if (batch->position >= ARRAY_SIZE(batch->entropy) ||		\
+		    next_gen != batch->generation) {				\
+			_get_random_bytes(batch->entropy, sizeof(batch->entropy));\
+			batch->position = 0;					\
+			batch->generation = next_gen;				\
+		}								\
+		ret = batch->entropy[batch->position++];			\
+		local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags);	\
 	}									\
 										\
-	ret = batch->entropy[batch->position];					\
-	batch->entropy[batch->position] = 0;					\
-	++batch->position;							\
-	local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags);		\
+	batch->entropy[batch->position - 1] = 0;				\
+	put_cpu_var(batched_entropy_##type);					\
 	return ret;								\
 }										\
 EXPORT_SYMBOL(get_random_ ##type);
-- 
2.52.0.107.ga0afd4fd5b-goog

