Message-ID: <aUO9KRaP0FXpm_l9@tardis-2.local>
Date: Thu, 18 Dec 2025 17:36:57 +0900
From: Boqun Feng <boqun.feng@...il.com>
To: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: Joel Fernandes <joel@...lfernandes.org>,
"Paul E. McKenney" <paulmck@...nel.org>, linux-kernel@...r.kernel.org,
Nicholas Piggin <npiggin@...il.com>,
Michael Ellerman <mpe@...erman.id.au>,
Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
Will Deacon <will@...nel.org>, Peter Zijlstra <peterz@...radead.org>,
Alan Stern <stern@...land.harvard.edu>, John Stultz <jstultz@...gle.com>,
Neeraj Upadhyay <Neeraj.Upadhyay@....com>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Frederic Weisbecker <frederic@...nel.org>,
Josh Triplett <josh@...htriplett.org>,
Uladzislau Rezki <urezki@...il.com>,
Steven Rostedt <rostedt@...dmis.org>,
Lai Jiangshan <jiangshanlai@...il.com>,
Zqiang <qiang.zhang1211@...il.com>, Ingo Molnar <mingo@...hat.com>,
Waiman Long <longman@...hat.com>, Mark Rutland <mark.rutland@....com>,
Thomas Gleixner <tglx@...utronix.de>, Vlastimil Babka <vbabka@...e.cz>,
maged.michael@...il.com, Mateusz Guzik <mjguzik@...il.com>,
Jonas Oberhauser <jonas.oberhauser@...weicloud.com>, rcu@...r.kernel.org,
linux-mm@...ck.org, lkmm@...ts.linux.dev
Subject: Re: [RFC PATCH v4 3/4] hazptr: Implement Hazard Pointers
On Wed, Dec 17, 2025 at 08:45:30PM -0500, Mathieu Desnoyers wrote:
[...]
> +static inline
> +struct hazptr_slot *hazptr_get_free_percpu_slot(void)
> +{
> + struct hazptr_percpu_slots *percpu_slots = this_cpu_ptr(&hazptr_percpu_slots);
> + unsigned int idx;
> +
> + for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> + struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> + if (!READ_ONCE(slot->addr))
> + return slot;
> + }
> + /* All slots are in use. */
> + return NULL;
> +}
> +
> +static inline
> +bool hazptr_slot_is_backup(struct hazptr_ctx *ctx, struct hazptr_slot *slot)
> +{
> + return slot == &ctx->backup_slot.slot;
> +}
> +
> +/*
> + * hazptr_acquire: Load pointer at address and protect with hazard pointer.
> + *
> + * Load @addr_p, and protect the loaded pointer with hazard pointer.
> + *
> + * Returns a non-NULL protected address if the loaded pointer is non-NULL.
> + * Returns NULL if the loaded pointer is NULL.
> + *
> + * On success the protected hazptr slot is stored in @ctx->slot.
> + */
> +static inline
> +void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
> +{
> + struct hazptr_slot *slot = NULL;
> + void *addr, *addr2;
> +
> + /*
> + * Load @addr_p to know which address should be protected.
> + */
> + addr = READ_ONCE(*addr_p);
> + for (;;) {
> + if (!addr)
> + return NULL;
> + guard(preempt)();
> + if (likely(!hazptr_slot_is_backup(ctx, slot))) {
> + slot = hazptr_get_free_percpu_slot();
I need to continue to share my concerns about this "allocating a slot
while protecting" pattern. Realistically, we will walk over a few of
the per-CPU hazard pointer slots *every time* instead of directly
using a pre-allocated hazard pointer slot. Could you use this [1] to
compare the reader-side performance against RCU/SRCU?
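For comparison, with a single pre-allocated slot per CPU the reader
fast path has no search loop at all; roughly something like the
following (an illustrative sketch only, not the actual shazptr code;
the names are made up, and it can only protect one pointer per CPU at
a time):

	/* Illustrative only: single pre-allocated slot per CPU. */
	static DEFINE_PER_CPU(void *, my_hazptr_slot);

	/* Caller has preemption disabled, as in the quoted hazptr_acquire(). */
	static inline void *one_slot_acquire(void * const *addr_p)
	{
		void *addr, *addr2;

		addr = READ_ONCE(*addr_p);
		for (;;) {
			if (!addr)
				return NULL;
			/* Publish into the CPU's single slot: no search. */
			WRITE_ONCE(*this_cpu_ptr(&my_hazptr_slot), addr);
			smp_mb();			/* Store B before Load A. */
			addr2 = READ_ONCE(*addr_p);	/* Load A */
			if (ptr_eq(addr2, addr))
				return addr2;
			WRITE_ONCE(*this_cpu_ptr(&my_hazptr_slot), NULL);
			if (!addr2)
				return NULL;
			addr = addr2;
		}
	}
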
> + /*
> + * If all the per-CPU slots are already in use, fallback
> + * to the backup slot.
> + */
> + if (unlikely(!slot))
> + slot = hazptr_chain_backup_slot(ctx);
> + }
> + WRITE_ONCE(slot->addr, addr); /* Store B */
> +
> + /* Memory ordering: Store B before Load A. */
> + smp_mb();
> +
> + /*
> + * Re-load @addr_p after storing it to the hazard pointer slot.
> + */
> + addr2 = READ_ONCE(*addr_p); /* Load A */
> + if (likely(ptr_eq(addr2, addr)))
> + break;
> + /*
> + * If @addr_p content has changed since the first load,
> + * release the hazard pointer and try again.
> + */
> + WRITE_ONCE(slot->addr, NULL);
> + if (!addr2) {
> + if (hazptr_slot_is_backup(ctx, slot))
> + hazptr_unchain_backup_slot(ctx);
> + return NULL;
> + }
> + addr = addr2;
> + }
> + ctx->slot = slot;
> + /*
> + * Use addr2 loaded from the second READ_ONCE() to preserve
> + * address dependency ordering.
> + */
> + return addr2;
> +}
> +
> +/* Release the protected hazard pointer from @slot. */
> +static inline
> +void hazptr_release(struct hazptr_ctx *ctx, void *addr)
> +{
> + struct hazptr_slot *slot;
> +
> + if (!addr)
> + return;
> + slot = ctx->slot;
> + WARN_ON_ONCE(slot->addr != addr);
> + smp_store_release(&slot->addr, NULL);
> + if (unlikely(hazptr_slot_is_backup(ctx, slot)))
> + hazptr_unchain_backup_slot(ctx);
> +}
> +
> +void hazptr_init(void);
> +
> +#endif /* _LINUX_HAZPTR_H */
> diff --git a/init/main.c b/init/main.c
> index 07a3116811c5..858eaa87bde7 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -104,6 +104,7 @@
> #include <linux/pidfs.h>
> #include <linux/ptdump.h>
> #include <linux/time_namespace.h>
> +#include <linux/hazptr.h>
> #include <net/net_namespace.h>
>
> #include <asm/io.h>
> @@ -1002,6 +1003,7 @@ void start_kernel(void)
> workqueue_init_early();
>
> rcu_init();
> + hazptr_init();
> kvfree_rcu_init();
>
> /* Trace events are available after this */
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 9fe722305c9b..1178907fe0ea 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \
> cpu.o exit.o softirq.o resource.o \
> sysctl.o capability.o ptrace.o user.o \
> signal.o sys.o umh.o workqueue.o pid.o task_work.o \
> - extable.o params.o \
> + extable.o params.o hazptr.o \
> kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
> notifier.o ksysfs.o cred.o reboot.o \
> async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
> diff --git a/kernel/hazptr.c b/kernel/hazptr.c
> new file mode 100644
> index 000000000000..2ec288bc1132
> --- /dev/null
> +++ b/kernel/hazptr.c
> @@ -0,0 +1,150 @@
> +// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
> +//
> +// SPDX-License-Identifier: LGPL-2.1-or-later
> +
> +/*
> + * hazptr: Hazard Pointers
> + */
> +
> +#include <linux/hazptr.h>
> +#include <linux/percpu.h>
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <linux/export.h>
> +
> +struct overflow_list {
> + raw_spinlock_t lock; /* Lock protecting overflow list and list generation. */
> + struct list_head head; /* Overflow list head. */
> + uint64_t gen; /* Overflow list generation. */
> +};
> +
> +static DEFINE_PER_CPU(struct overflow_list, percpu_overflow_list);
> +
> +DEFINE_PER_CPU(struct hazptr_percpu_slots, hazptr_percpu_slots);
> +EXPORT_PER_CPU_SYMBOL_GPL(hazptr_percpu_slots);
> +
> +/*
> + * Perform piecewise iteration on overflow list waiting until "addr" is
> + * not present. Raw spinlock is released and taken between each list
> + * item and busy loop iteration. The overflow list generation is checked
> + * each time the lock is taken to validate that the list has not changed
> + * before resuming iteration or busy wait. If the generation has
> + * changed, retry the entire list traversal.
> + */
> +static
> +void hazptr_synchronize_overflow_list(struct overflow_list *overflow_list, void *addr)
> +{
> + struct hazptr_backup_slot *backup_slot;
> + uint64_t snapshot_gen;
> +
> + raw_spin_lock(&overflow_list->lock);
> +retry:
> + snapshot_gen = overflow_list->gen;
> + list_for_each_entry(backup_slot, &overflow_list->head, node) {
> + /* Busy-wait if node is found. */
> + while (smp_load_acquire(&backup_slot->slot.addr) == addr) { /* Load B */
> + raw_spin_unlock(&overflow_list->lock);
> + cpu_relax();
I think we should prioritize the scan thread solution [2] over
busy-waiting in hazard pointer updaters, because once we have multiple
hazard pointer users we will want to consolidate the scans on the
updater side. If so, the whole ->gen mechanism can be avoided.
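The rough shape of what I mean by consolidating scans is below (an
illustrative sketch only, not the actual code in [2]; struct
hazptr_waiter, the kthread and the other names are made up, the memory
ordering between unpublishing an address and the scan is glossed over,
and a real version would also have to cover the backup slots in the
overflow lists): waiters register the address they are waiting on and
sleep, and one scan thread walks every slot once per iteration on
behalf of all of them, so no updater busy-waits on its own address.

	struct hazptr_waiter {
		struct list_head entry;
		void *addr;		/* Address the waiter waits on. */
		bool busy;		/* Still found in a slot this pass? */
		struct completion done;
	};

	static LIST_HEAD(hazptr_waiters);
	static DEFINE_MUTEX(hazptr_waiters_lock);

	static int hazptr_scan_kthread(void *unused)
	{
		while (!kthread_should_stop()) {
			struct hazptr_waiter *w, *tmp;
			unsigned int idx;
			int cpu;

			mutex_lock(&hazptr_waiters_lock);
			list_for_each_entry(w, &hazptr_waiters, entry)
				w->busy = false;

			/* One pass over every slot serves all pending waiters. */
			for_each_possible_cpu(cpu) {
				struct hazptr_percpu_slots *s =
					per_cpu_ptr(&hazptr_percpu_slots, cpu);

				for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
					void *addr = smp_load_acquire(&s->slots[idx].addr);

					if (!addr)
						continue;
					list_for_each_entry(w, &hazptr_waiters, entry)
						if (w->addr == addr)
							w->busy = true;
				}
			}

			/* Wake every waiter whose address is no longer protected. */
			list_for_each_entry_safe(w, tmp, &hazptr_waiters, entry) {
				if (!w->busy) {
					list_del(&w->entry);
					complete(&w->done);
				}
			}
			mutex_unlock(&hazptr_waiters_lock);

			schedule_timeout_interruptible(1);
		}
		return 0;
	}

hazptr_synchronize() would then just queue a waiter and
wait_for_completion() on it instead of scanning and busy-waiting
itself.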
However, this ->gen idea does seem to resolve another issue for me.
I'm trying to make shazptr critical sections preemptible by using a
per-task backup slot (if you recall, this is your idea from the
hallway discussions we had during LPC 2024), and so far I could not
make it work because of the following sequence:

1. CPU 0 already has one pointer protected.
2. CPU 1 begins the updater scan; it scans the list of preempted
   hazard pointer readers and finds no reader.
3. CPU 0 does a context switch: it stores the current hazard pointer
   value to the current task's ->hazard_slot (let's say the task is
   task A), and adds it to the list of preempted hazard pointer
   readers.
4. CPU 0 clears its percpu hazptr_slots for the next task (B).
5. CPU 1 continues the updater scan; it scans the percpu slots of
   CPU 0 and finds no reader.

In this situation, the updater will miss a reader. But if we add a
generation snapshot at step 2 and a generation increment at step 3, I
think it'll work.
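Roughly what I have in mind (all names below are hypothetical and this
is not working shazptr code; ->hazard_node is a made-up per-task list
node next to ->hazard_slot, and nesting/error handling are ignored):

	static DEFINE_PER_CPU(void *, percpu_slot);
	static LIST_HEAD(preempted_readers);
	static DEFINE_RAW_SPINLOCK(preempted_readers_lock);
	static u64 preempted_readers_gen;

	/* Context switch out: move the slot to the task, then clear it (steps 3/4). */
	static void reader_switch_out(struct task_struct *prev)
	{
		void *addr = this_cpu_read(percpu_slot);

		if (!addr)
			return;
		raw_spin_lock(&preempted_readers_lock);
		prev->hazard_slot = addr;
		list_add(&prev->hazard_node, &preempted_readers);
		preempted_readers_gen++;
		raw_spin_unlock(&preempted_readers_lock);
		/* Order the list add and gen increment before the clear. */
		smp_store_release(this_cpu_ptr(&percpu_slot), NULL);
	}

	/* Updater scan (steps 2 and 5). */
	static void updater_scan(void *addr)
	{
		u64 snap;
		int cpu;

	retry:
		raw_spin_lock(&preempted_readers_lock);
		snap = preempted_readers_gen;
		/* ... wait until no preempted reader's ->hazard_slot holds addr ... */
		raw_spin_unlock(&preempted_readers_lock);

		for_each_possible_cpu(cpu)
			smp_cond_load_acquire(per_cpu_ptr(&percpu_slot, cpu), VAL != addr);

		/* A reader may have moved slot -> list after the snapshot; rescan. */
		if (READ_ONCE(preempted_readers_gen) != snap)
			goto retry;
	}

The acquire on the per-CPU slot load plus the release on the clear
means a scan that observes the cleared slot (step 4) also observes the
gen increment from step 3, so it retries and cannot miss task A.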
IMO, if we can make this work, it's better than the current backup
slot mechanism, because we only need to acquire the lock when a
context switch happens. I will look into implementing this, and if I
can get it done, I will send it in my next version of shazptr. I
mention it here just to add this option to the discussion.
[1]: https://lore.kernel.org/lkml/20250625031101.12555-3-boqun.feng@gmail.com/
[2]: https://lore.kernel.org/lkml/20250625031101.12555-5-boqun.feng@gmail.com/
Regards,
Boqun
> + raw_spin_lock(&overflow_list->lock);
> + if (overflow_list->gen != snapshot_gen)
> + goto retry;
> + }
> + raw_spin_unlock(&overflow_list->lock);
> + /*
> + * Release raw spinlock, validate generation after
> + * re-acquiring the lock.
> + */
> + raw_spin_lock(&overflow_list->lock);
> + if (overflow_list->gen != snapshot_gen)
> + goto retry;
> + }
> + raw_spin_unlock(&overflow_list->lock);
> +}
> +
> +static
> +void hazptr_synchronize_cpu_slots(int cpu, void *addr)
> +{
> + struct hazptr_percpu_slots *percpu_slots = per_cpu_ptr(&hazptr_percpu_slots, cpu);
> + unsigned int idx;
> +
> + for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> + struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> + /* Busy-wait if node is found. */
> + smp_cond_load_acquire(&slot->addr, VAL != addr); /* Load B */
> + }
> +}
> +
> +/*
> + * hazptr_synchronize: Wait until @addr is released from all slots.
> + *
> + * Wait to observe that each slot contains a value that differs from
> + * @addr before returning.
> + * Should be called from preemptible context.
> + */
> +void hazptr_synchronize(void *addr)
> +{
> + int cpu;
> +
> + /*
> + * Busy-wait should only be done from preemptible context.
> + */
> + lockdep_assert_preemption_enabled();
> +
> + /*
> + * Store A precedes hazptr_scan(): it unpublishes addr (sets it to
> + * NULL or to a different value), and thus hides it from hazard
> + * pointer readers.
> + */
> + if (!addr)
> + return;
> + /* Memory ordering: Store A before Load B. */
> + smp_mb();
> + /* Scan all CPUs slots. */
> + for_each_possible_cpu(cpu) {
> + /* Scan CPU slots. */
> + hazptr_synchronize_cpu_slots(cpu, addr);
> + /* Scan backup slots in percpu overflow list. */
> + hazptr_synchronize_overflow_list(per_cpu_ptr(&percpu_overflow_list, cpu), addr);
> + }
> +}
> +EXPORT_SYMBOL_GPL(hazptr_synchronize);
> +
> +struct hazptr_slot *hazptr_chain_backup_slot(struct hazptr_ctx *ctx)
> +{
> + struct overflow_list *overflow_list = this_cpu_ptr(&percpu_overflow_list);
> + struct hazptr_slot *slot = &ctx->backup_slot.slot;
> +
> + slot->addr = NULL;
> +
> + raw_spin_lock(&overflow_list->lock);
> + overflow_list->gen++;
> + list_add(&ctx->backup_slot.node, &overflow_list->head);
> + ctx->backup_slot.cpu = smp_processor_id();
> + raw_spin_unlock(&overflow_list->lock);
> + return slot;
> +}
> +EXPORT_SYMBOL_GPL(hazptr_chain_backup_slot);
> +
> +void hazptr_unchain_backup_slot(struct hazptr_ctx *ctx)
> +{
> + struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, ctx->backup_slot.cpu);
> +
> + raw_spin_lock(&overflow_list->lock);
> + overflow_list->gen++;
> + list_del(&ctx->backup_slot.node);
> + raw_spin_unlock(&overflow_list->lock);
> +}
> +EXPORT_SYMBOL_GPL(hazptr_unchain_backup_slot);
> +
> +void __init hazptr_init(void)
> +{
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, cpu);
> +
> + raw_spin_lock_init(&overflow_list->lock);
> + INIT_LIST_HEAD(&overflow_list->head);
> + }
> +}
> --
> 2.39.5
>