Message-ID: <aUO9KRaP0FXpm_l9@tardis-2.local>
Date: Thu, 18 Dec 2025 17:36:57 +0900
From: Boqun Feng <boqun.feng@...il.com>
To: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: Joel Fernandes <joel@...lfernandes.org>,
"Paul E. McKenney" <paulmck@...nel.org>, linux-kernel@...r.kernel.org,
Nicholas Piggin <npiggin@...il.com>,
Michael Ellerman <mpe@...erman.id.au>,
Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Will Deacon <will@...nel.org>, Peter Zijlstra <peterz@...radead.org>,
Alan Stern <stern@...land.harvard.edu>, John Stultz <jstultz@...gle.com>,
Neeraj Upadhyay <Neeraj.Upadhyay@....com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Frederic Weisbecker <frederic@...nel.org>,
Josh Triplett <josh@...htriplett.org>,
Uladzislau Rezki <urezki@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Lai Jiangshan <jiangshanlai@...il.com>,
Zqiang <qiang.zhang1211@...il.com>, Ingo Molnar <mingo@...hat.com>,
Waiman Long <longman@...hat.com>, Mark Rutland <mark.rutland@....com>,
Thomas Gleixner <tglx@...utronix.de>, Vlastimil Babka <vbabka@...e.cz>,
maged.michael@...il.com, Mateusz Guzik <mjguzik@...il.com>,
Jonas Oberhauser <jonas.oberhauser@...weicloud.com>, rcu@...r.kernel.org,
linux-mm@...ck.org, lkmm@...ts.linux.dev
Subject: Re: [RFC PATCH v4 3/4] hazptr: Implement Hazard Pointers
On Wed, Dec 17, 2025 at 08:45:30PM -0500, Mathieu Desnoyers wrote:
[...]
> +static inline
> +struct hazptr_slot *hazptr_get_free_percpu_slot(void)
> +{
> + struct hazptr_percpu_slots *percpu_slots = this_cpu_ptr(&hazptr_percpu_slots);
> + unsigned int idx;
> +
> + for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> + struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> + if (!READ_ONCE(slot->addr))
> + return slot;
> + }
> + /* All slots are in use. */
> + return NULL;
> +}
> +
> +static inline
> +bool hazptr_slot_is_backup(struct hazptr_ctx *ctx, struct hazptr_slot *slot)
> +{
> + return slot == &ctx->backup_slot.slot;
> +}
> +
> +/*
> + * hazptr_acquire: Load pointer at address and protect with hazard pointer.
> + *
> + * Load @addr_p, and protect the loaded pointer with hazard pointer.
> + *
> + * Returns a non-NULL protected address if the loaded pointer is non-NULL.
> + * Returns NULL if the loaded pointer is NULL.
> + *
> + * On success the protected hazptr slot is stored in @ctx->slot.
> + */
> +static inline
> +void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
> +{
> + struct hazptr_slot *slot = NULL;
> + void *addr, *addr2;
> +
> + /*
> + * Load @addr_p to know which address should be protected.
> + */
> + addr = READ_ONCE(*addr_p);
> + for (;;) {
> + if (!addr)
> + return NULL;
> + guard(preempt)();
> + if (likely(!hazptr_slot_is_backup(ctx, slot))) {
> + slot = hazptr_get_free_percpu_slot();
I need to continue to share my concerns about this "allocating a slot
while protecting" pattern. Realistically, we will walk over a few of
the per-CPU hazard pointer slots *every time* instead of directly
using a pre-allocated hazard pointer slot. Could you use this [1] to
compare the reader-side performance against RCU/SRCU?
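For comparison, with a single pre-allocated slot per CPU the reader
fast path has no search loop at all; roughly something like the
following (an illustrative sketch only, not the actual shazptr code;
the names are made up, and it can only protect one pointer per CPU at
a time):

	/* Illustrative only: single pre-allocated slot per CPU. */
	static DEFINE_PER_CPU(void *, my_hazptr_slot);

	/* Caller has preemption disabled, as in the quoted hazptr_acquire(). */
	static inline void *one_slot_acquire(void * const *addr_p)
	{
		void *addr, *addr2;

		addr = READ_ONCE(*addr_p);
		for (;;) {
			if (!addr)
				return NULL;
			/* Publish into the CPU's single slot: no search. */
			WRITE_ONCE(*this_cpu_ptr(&my_hazptr_slot), addr);
			smp_mb();			/* Store B before Load A. */
			addr2 = READ_ONCE(*addr_p);	/* Load A */
			if (ptr_eq(addr2, addr))
				return addr2;
			WRITE_ONCE(*this_cpu_ptr(&my_hazptr_slot), NULL);
			if (!addr2)
				return NULL;
			addr = addr2;
		}
	}
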
> + /*
> + * If all the per-CPU slots are already in use, fallback
> + * to the backup slot.
> + */
> + if (unlikely(!slot))
> + slot = hazptr_chain_backup_slot(ctx);
> + }
> + WRITE_ONCE(slot->addr, addr); /* Store B */
> +
> + /* Memory ordering: Store B before Load A. */
> + smp_mb();
> +
> + /*
> + * Re-load @addr_p after storing it to the hazard pointer slot.
> + */
> + addr2 = READ_ONCE(*addr_p); /* Load A */
> + if (likely(ptr_eq(addr2, addr)))
> + break;
> + /*
> + * If @addr_p content has changed since the first load,
> + * release the hazard pointer and try again.
> + */
> + WRITE_ONCE(slot->addr, NULL);
> + if (!addr2) {
> + if (hazptr_slot_is_backup(ctx, slot))
> + hazptr_unchain_backup_slot(ctx);
> + return NULL;
> + }
> + addr = addr2;
> + }
> + ctx->slot = slot;
> + /*
> + * Use addr2 loaded from the second READ_ONCE() to preserve
> + * address dependency ordering.
> + */
> + return addr2;
> +}
> +
> +/* Release the protected hazard pointer from @slot. */
> +static inline
> +void hazptr_release(struct hazptr_ctx *ctx, void *addr)
> +{
> + struct hazptr_slot *slot;
> +
> + if (!addr)
> + return;
> + slot = ctx->slot;
> + WARN_ON_ONCE(slot->addr != addr);
> + smp_store_release(&slot->addr, NULL);
> + if (unlikely(hazptr_slot_is_backup(ctx, slot)))
> + hazptr_unchain_backup_slot(ctx);
> +}
> +
> +void hazptr_init(void);
> +
> +#endif /* _LINUX_HAZPTR_H */
> diff --git a/init/main.c b/init/main.c
> index 07a3116811c5..858eaa87bde7 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -104,6 +104,7 @@
> #include <linux/pidfs.h>
> #include <linux/ptdump.h>
> #include <linux/time_namespace.h>
> +#include <linux/hazptr.h>
> #include <net/net_namespace.h>
>
> #include <asm/io.h>
> @@ -1002,6 +1003,7 @@ void start_kernel(void)
> workqueue_init_early();
>
> rcu_init();
> + hazptr_init();
> kvfree_rcu_init();
>
> /* Trace events are available after this */
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 9fe722305c9b..1178907fe0ea 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \
> cpu.o exit.o softirq.o resource.o \
> sysctl.o capability.o ptrace.o user.o \
> signal.o sys.o umh.o workqueue.o pid.o task_work.o \
> - extable.o params.o \
> + extable.o params.o hazptr.o \
> kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
> notifier.o ksysfs.o cred.o reboot.o \
> async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
> diff --git a/kernel/hazptr.c b/kernel/hazptr.c
> new file mode 100644
> index 000000000000..2ec288bc1132
> --- /dev/null
> +++ b/kernel/hazptr.c
> @@ -0,0 +1,150 @@
> +// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
> +//
> +// SPDX-License-Identifier: LGPL-2.1-or-later
> +
> +/*
> + * hazptr: Hazard Pointers
> + */
> +
> +#include <linux/hazptr.h>
> +#include <linux/percpu.h>
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <linux/export.h>
> +
> +struct overflow_list {
> + raw_spinlock_t lock; /* Lock protecting overflow list and list generation. */
> + struct list_head head; /* Overflow list head. */
> + uint64_t gen; /* Overflow list generation. */
> +};
> +
> +static DEFINE_PER_CPU(struct overflow_list, percpu_overflow_list);
> +
> +DEFINE_PER_CPU(struct hazptr_percpu_slots, hazptr_percpu_slots);
> +EXPORT_PER_CPU_SYMBOL_GPL(hazptr_percpu_slots);
> +
> +/*
> + * Perform piecewise iteration on overflow list waiting until "addr" is
> + * not present. Raw spinlock is released and taken between each list
> + * item and busy loop iteration. The overflow list generation is checked
> + * each time the lock is taken to validate that the list has not changed
> + * before resuming iteration or busy wait. If the generation has
> + * changed, retry the entire list traversal.
> + */
> +static
> +void hazptr_synchronize_overflow_list(struct overflow_list *overflow_list, void *addr)
> +{
> + struct hazptr_backup_slot *backup_slot;
> + uint64_t snapshot_gen;
> +
> + raw_spin_lock(&overflow_list->lock);
> +retry:
> + snapshot_gen = overflow_list->gen;
> + list_for_each_entry(backup_slot, &overflow_list->head, node) {
> + /* Busy-wait if node is found. */
> + while (smp_load_acquire(&backup_slot->slot.addr) == addr) { /* Load B */
> + raw_spin_unlock(&overflow_list->lock);
> + cpu_relax();
I think we should prioritize the scan thread solution [2] over
busy-waiting in hazard pointer updaters, because once we have multiple
hazard pointer users we will want to consolidate the scans on the
updater side. If so, the whole ->gen mechanism can be avoided.
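The rough shape of what I mean by consolidating scans is below (an
illustrative sketch only, not the actual code in [2]; struct
hazptr_waiter, the kthread and the other names are made up, the memory
ordering between unpublishing an address and the scan is glossed over,
and a real version would also have to cover the backup slots in the
overflow lists): waiters register the address they are waiting on and
sleep, and one scan thread walks every slot once per iteration on
behalf of all of them, so no updater busy-waits on its own address.

	struct hazptr_waiter {
		struct list_head entry;
		void *addr;		/* Address the waiter waits on. */
		bool busy;		/* Still found in a slot this pass? */
		struct completion done;
	};

	static LIST_HEAD(hazptr_waiters);
	static DEFINE_MUTEX(hazptr_waiters_lock);

	static int hazptr_scan_kthread(void *unused)
	{
		while (!kthread_should_stop()) {
			struct hazptr_waiter *w, *tmp;
			unsigned int idx;
			int cpu;

			mutex_lock(&hazptr_waiters_lock);
			list_for_each_entry(w, &hazptr_waiters, entry)
				w->busy = false;

			/* One pass over every slot serves all pending waiters. */
			for_each_possible_cpu(cpu) {
				struct hazptr_percpu_slots *s =
					per_cpu_ptr(&hazptr_percpu_slots, cpu);

				for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
					void *addr = smp_load_acquire(&s->slots[idx].addr);

					if (!addr)
						continue;
					list_for_each_entry(w, &hazptr_waiters, entry)
						if (w->addr == addr)
							w->busy = true;
				}
			}

			/* Wake every waiter whose address is no longer protected. */
			list_for_each_entry_safe(w, tmp, &hazptr_waiters, entry) {
				if (!w->busy) {
					list_del(&w->entry);
					complete(&w->done);
				}
			}
			mutex_unlock(&hazptr_waiters_lock);

			schedule_timeout_interruptible(1);
		}
		return 0;
	}

hazptr_synchronize() would then just queue a waiter and
wait_for_completion() on it instead of scanning and busy-waiting
itself.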
However, this ->gen idea does seem to resolve another issue for me.
I'm trying to make shazptr critical sections preemptible by using a
per-task backup slot (if you recall, this is your idea from the
hallway discussions we had during LPC 2024), and so far I could not
make it work because of the following sequence:

1. CPU 0 already has one pointer protected.
2. CPU 1 begins the updater scan; it scans the list of preempted
   hazard pointer readers and finds no reader.
3. CPU 0 does a context switch: it stores the current hazard pointer
   value to the current task's ->hazard_slot (let's say the task is
   task A), and adds it to the list of preempted hazard pointer
   readers.
4. CPU 0 clears its percpu hazptr_slots for the next task (B).
5. CPU 1 continues the updater scan; it scans the percpu slots of
   CPU 0 and finds no reader.

In this situation, the updater will miss a reader. But if we add a
generation snapshot at step 2 and a generation increment at step 3, I
think it'll work.
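Roughly what I have in mind (all names below are hypothetical and this
is not working shazptr code; ->hazard_node is a made-up per-task list
node next to ->hazard_slot, and nesting/error handling are ignored):

	static DEFINE_PER_CPU(void *, percpu_slot);
	static LIST_HEAD(preempted_readers);
	static DEFINE_RAW_SPINLOCK(preempted_readers_lock);
	static u64 preempted_readers_gen;

	/* Context switch out: move the slot to the task, then clear it (steps 3/4). */
	static void reader_switch_out(struct task_struct *prev)
	{
		void *addr = this_cpu_read(percpu_slot);

		if (!addr)
			return;
		raw_spin_lock(&preempted_readers_lock);
		prev->hazard_slot = addr;
		list_add(&prev->hazard_node, &preempted_readers);
		preempted_readers_gen++;
		raw_spin_unlock(&preempted_readers_lock);
		/* Order the list add and gen increment before the clear. */
		smp_store_release(this_cpu_ptr(&percpu_slot), NULL);
	}

	/* Updater scan (steps 2 and 5). */
	static void updater_scan(void *addr)
	{
		u64 snap;
		int cpu;

	retry:
		raw_spin_lock(&preempted_readers_lock);
		snap = preempted_readers_gen;
		/* ... wait until no preempted reader's ->hazard_slot holds addr ... */
		raw_spin_unlock(&preempted_readers_lock);

		for_each_possible_cpu(cpu)
			smp_cond_load_acquire(per_cpu_ptr(&percpu_slot, cpu), VAL != addr);

		/* A reader may have moved slot -> list after the snapshot; rescan. */
		if (READ_ONCE(preempted_readers_gen) != snap)
			goto retry;
	}

The acquire on the per-CPU slot load plus the release on the clear
means a scan that observes the cleared slot (step 4) also observes the
gen increment from step 3, so it retries and cannot miss task A.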
IMO, if we can make this work, it's better than the current backup
slot mechanism, because we only need to acquire the lock when a
context switch happens. I will look into implementing this, and if I
can get it done, I will send it in my next version of shazptr. I
mention it here just to add this option to the discussion.
[1]: https://lore.kernel.org/lkml/20250625031101.12555-3-boqun.feng@gmail.com/
[2]: https://lore.kernel.org/lkml/20250625031101.12555-5-boqun.feng@gmail.com/
Regards,
Boqun
> + raw_spin_lock(&overflow_list->lock);
> + if (overflow_list->gen != snapshot_gen)
> + goto retry;
> + }
> + raw_spin_unlock(&overflow_list->lock);
> + /*
> + * Release raw spinlock, validate generation after
> + * re-acquiring the lock.
> + */
> + raw_spin_lock(&overflow_list->lock);
> + if (overflow_list->gen != snapshot_gen)
> + goto retry;
> + }
> + raw_spin_unlock(&overflow_list->lock);
> +}
> +
> +static
> +void hazptr_synchronize_cpu_slots(int cpu, void *addr)
> +{
> + struct hazptr_percpu_slots *percpu_slots = per_cpu_ptr(&hazptr_percpu_slots, cpu);
> + unsigned int idx;
> +
> + for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> + struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> + /* Busy-wait if node is found. */
> + smp_cond_load_acquire(&slot->addr, VAL != addr); /* Load B */
> + }
> +}
> +
> +/*
> + * hazptr_synchronize: Wait until @addr is released from all slots.
> + *
> + * Wait to observe that each slot contains a value that differs from
> + * @addr before returning.
> + * Should be called from preemptible context.
> + */
> +void hazptr_synchronize(void *addr)
> +{
> + int cpu;
> +
> + /*
> + * Busy-wait should only be done from preemptible context.
> + */
> + lockdep_assert_preemption_enabled();
> +
> + /*
> + * Store A precedes hazptr_scan(): it unpublishes addr (sets it to
> + * NULL or to a different value), and thus hides it from hazard
> + * pointer readers.
> + */
> + if (!addr)
> + return;
> + /* Memory ordering: Store A before Load B. */
> + smp_mb();
> + /* Scan all CPUs slots. */
> + for_each_possible_cpu(cpu) {
> + /* Scan CPU slots. */
> + hazptr_synchronize_cpu_slots(cpu, addr);
> + /* Scan backup slots in percpu overflow list. */
> + hazptr_synchronize_overflow_list(per_cpu_ptr(&percpu_overflow_list, cpu), addr);
> + }
> +}
> +EXPORT_SYMBOL_GPL(hazptr_synchronize);
> +
> +struct hazptr_slot *hazptr_chain_backup_slot(struct hazptr_ctx *ctx)
> +{
> + struct overflow_list *overflow_list = this_cpu_ptr(&percpu_overflow_list);
> + struct hazptr_slot *slot = &ctx->backup_slot.slot;
> +
> + slot->addr = NULL;
> +
> + raw_spin_lock(&overflow_list->lock);
> + overflow_list->gen++;
> + list_add(&ctx->backup_slot.node, &overflow_list->head);
> + ctx->backup_slot.cpu = smp_processor_id();
> + raw_spin_unlock(&overflow_list->lock);
> + return slot;
> +}
> +EXPORT_SYMBOL_GPL(hazptr_chain_backup_slot);
> +
> +void hazptr_unchain_backup_slot(struct hazptr_ctx *ctx)
> +{
> + struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, ctx->backup_slot.cpu);
> +
> + raw_spin_lock(&overflow_list->lock);
> + overflow_list->gen++;
> + list_del(&ctx->backup_slot.node);
> + raw_spin_unlock(&overflow_list->lock);
> +}
> +EXPORT_SYMBOL_GPL(hazptr_unchain_backup_slot);
> +
> +void __init hazptr_init(void)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, cpu);
> +
> + raw_spin_lock_init(&overflow_list->lock);
> + INIT_LIST_HEAD(&overflow_list->head);
> + }
> +}
> --
> 2.39.5
>