[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAGudoHET9YLe9E9j5kTadhFtF=7j+9jNJj6TaQG4vukRp9ivfw@mail.gmail.com>
Date: Fri, 23 Jan 2026 17:17:54 +0100
From: Mateusz Guzik <mjguzik@...il.com>
To: Christian Brauner <brauner@...nel.org>
Cc: viro@...iv.linux.org.uk, jack@...e.cz, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org
Subject: Re: [PATCH] pidfs: implement ino allocation without the pidmap lock
On Fri, Jan 23, 2026 at 1:06 PM Christian Brauner <brauner@...nel.org> wrote:
>
> On Tue, Jan 20, 2026 at 07:45:39PM +0100, Mateusz Guzik wrote:
> > This paves the way for scalable PID allocation later.
> >
> > The 32 bit variant merely takes a spinlock for simplicity, the 64 bit
> > variant uses a scalable scheme.
> >
> > Signed-off-by: Mateusz Guzik <mjguzik@...il.com>
> > ---
> >
> > this patch assumes the rb -> rhashtable conversion landed
> >
> > i booted the 32 bit code on the 64 bit kernel, i take it its fine
> >
> > I'm slightly worried about error handling. It seems pid->pidfs_hash.next
> > = NULL is supposed to sort it out.
>
> r
> >
> > Given that ino of 0 is not legal, I think it should be used as a
> > sentinel value for presence in the table instead.
> >
> > so something like:
> >
> > alloc_pid:
> > pid->ino = 0;
> > ....
> >
> > then:
> >
> > void pidfs_remove_pid(struct pid *pid)
> > {
> > if (unlikely(!pid->ino))
> > return;
> > rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> > pidfs_ino_ht_params);
> > }
>
> All fine, but we should probably just use DEFINE_COOKIE() and accept
> that numbering starts at 1 for 64-bit. Does the below look good to you
> too?
>
> From aaaa5cbb6e6920854aaee0ed59382d71614e785e Mon Sep 17 00:00:00 2001
> From: Mateusz Guzik <mjguzik@...il.com>
> Date: Tue, 20 Jan 2026 19:45:39 +0100
> Subject: [PATCH] pidfs: implement ino allocation without the pidmap lock
>
> This paves the way for scalable PID allocation later.
>
> The 32 bit variant merely takes a spinlock for simplicity, the 64 bit
> variant uses a scalable scheme.
>
> Signed-off-by: Mateusz Guzik <mjguzik@...il.com>
> Link: https://patch.msgid.link/20260120184539.1480930-1-mjguzik@gmail.com
> Signed-off-by: Christian Brauner <brauner@...nel.org>
> ---
> fs/pidfs.c | 115 ++++++++++++++++++++++++++++++++++-----------------
> kernel/pid.c | 3 +-
> 2 files changed, 78 insertions(+), 40 deletions(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index ee0e36dd29d2..3e7e7bdda780 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -23,6 +23,7 @@
> #include <linux/coredump.h>
> #include <linux/rhashtable.h>
> #include <linux/xattr.h>
> +#include <linux/cookie.h>
>
> #include "internal.h"
> #include "mount.h"
> @@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
> .automatic_shrinking = true,
> };
>
> +/*
> + * inode number handling
> + *
> + * On 64 bit nothing special happens. The 64bit number assigned
> + * to struct pid is the inode number.
> + *
> + * On 32 bit the 64 bit number assigned to struct pid is split
> + * into two 32 bit numbers. The lower 32 bits are used as the
> + * inode number and the upper 32 bits are used as the inode
> + * generation number.
> + *
> + * On 32 bit pidfs_ino() will return the lower 32 bit. When
> + * pidfs_ino() returns zero a wrap around happened. When a
> + * wraparound happens the 64 bit number will be incremented by 2
> + * so inode numbering starts at 2 again.
> + *
> + * On 64 bit comparing two pidfds is as simple as comparing
> + * inode numbers.
> + *
> + * When a wraparound happens on 32 bit multiple pidfds with the
> + * same inode number are likely to exist (This isn't a problem
> + * since before pidfs pidfds used the anonymous inode meaning
> + * all pidfds had the same inode number.). Userspace can
> + * reconstruct the 64 bit identifier by retrieving both the
> + * inode number and the inode generation number to compare or
> + * use file handles.
> + */
> +
> #if BITS_PER_LONG == 32
> +
> +DEFINE_SPINLOCK(pidfs_ino_lock);
> +static u64 pidfs_ino_nr = 2;
> +
> static inline unsigned long pidfs_ino(u64 ino)
> {
> return lower_32_bits(ino);
> @@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
> return upper_32_bits(ino);
> }
>
> +static inline u64 pidfs_alloc_ino(void)
> +{
> + u64 ino;
> +
> + spin_lock(&pidfs_ino_lock);
> + if (pidfs_ino(pidfs_ino_nr) == 0)
> + pidfs_ino_nr += 2;
this should + 1 for consistency with the 64 bit variant
> + ino = pidfs_ino_nr++;
> + spin_unlock(&pidfs_ino_lock);
> + return ino;
> +}
> +
> #else
>
> /* On 64 bit simply return ino. */
> @@ -90,61 +135,55 @@ static inline u32 pidfs_gen(u64 ino)
> {
> return 0;
> }
> +
> +DEFINE_COOKIE(pidfs_ino_cookie);
> +
> +static u64 pidfs_alloc_ino(void)
> +{
> + u64 ino;
> +
> + preempt_disable();
> + ino = gen_cookie_next(&pidfs_ino_cookie);
> + preempt_enable();
> +
> + VFS_WARN_ON_ONCE(ino < 1);
> + return ino;
> +}
> +
> #endif
>
> /*
> - * Allocate inode number and initialize pidfs fields.
> - * Called with pidmap_lock held.
> + * Initialize pidfs fields.
> */
> void pidfs_prepare_pid(struct pid *pid)
> {
> - static u64 pidfs_ino_nr = 2;
> -
> - /*
> - * On 64 bit nothing special happens. The 64bit number assigned
> - * to struct pid is the inode number.
> - *
> - * On 32 bit the 64 bit number assigned to struct pid is split
> - * into two 32 bit numbers. The lower 32 bits are used as the
> - * inode number and the upper 32 bits are used as the inode
> - * generation number.
> - *
> - * On 32 bit pidfs_ino() will return the lower 32 bit. When
> - * pidfs_ino() returns zero a wrap around happened. When a
> - * wraparound happens the 64 bit number will be incremented by 2
> - * so inode numbering starts at 2 again.
> - *
> - * On 64 bit comparing two pidfds is as simple as comparing
> - * inode numbers.
> - *
> - * When a wraparound happens on 32 bit multiple pidfds with the
> - * same inode number are likely to exist (This isn't a problem
> - * since before pidfs pidfds used the anonymous inode meaning
> - * all pidfds had the same inode number.). Userspace can
> - * reconstruct the 64 bit identifier by retrieving both the
> - * inode number and the inode generation number to compare or
> - * use file handles.
> - */
> - if (pidfs_ino(pidfs_ino_nr) == 0)
> - pidfs_ino_nr += 2;
> -
> - pid->ino = pidfs_ino_nr;
> pid->pidfs_hash.next = NULL;
> pid->stashed = NULL;
> pid->attr = NULL;
> - pidfs_ino_nr++;
> }
>
> int pidfs_add_pid(struct pid *pid)
> {
> - return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> - pidfs_ino_ht_params);
> + int ret;
> +
> + pid->ino = pidfs_alloc_ino();
> + ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> + pidfs_ino_ht_params);
> + /*
> + * This is fine. The pid will not have a task attached to it so
> + * no pidfd can be created for it. So we can unset the inode
> + * number.
> + */
> + if (unlikely(ret))
> + pid->ino = 0;
I would pid->ino = 0 unconditoinally in pidfs_prepare_pid. this
gurantees safe execution of pidfs_remove_pid even with pidfs_add_pid
did not get called
this removes the above comment as well
then pidfs_prepare_pid can also stop nullyifying the hash thing
then that's something i'm happy ti sing off on
> + return ret;
> }
>
> void pidfs_remove_pid(struct pid *pid)
> {
> - rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> - pidfs_ino_ht_params);
> + if (likely(pid->ino))
> + rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> + pidfs_ino_ht_params);
> }
>
> void pidfs_free_pid(struct pid *pid)
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 6077da774652..dfa545a97c00 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
> INIT_HLIST_HEAD(&pid->tasks[type]);
> init_waitqueue_head(&pid->wait_pidfd);
> INIT_HLIST_HEAD(&pid->inodes);
> + pidfs_prepare_pid(pid);
>
> /*
> * 2. perm check checkpoint_restore_ns_capable()
> @@ -313,8 +314,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
> retval = -ENOMEM;
> if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
> goto out_free;
> - pidfs_prepare_pid(pid);
> -
> for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
> /* Make the PID visible to find_pid_ns. */
> idr_replace(&upid->ns->idr, pid, upid->nr);
> --
> 2.47.3
>
Powered by blists - more mailing lists