[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAOQ4uxjiHT1F3EXTkxwg1iUGJ583g9j+txe71VjqCbT-4rzV+Q@mail.gmail.com>
Date: Thu, 28 Nov 2024 18:19:08 +0100
From: Amir Goldstein <amir73il@...il.com>
To: Christian Brauner <brauner@...nel.org>
Cc: Erin Shepherd <erin.shepherd@....eu>, Jeff Layton <jlayton@...nel.org>,
Alexander Viro <viro@...iv.linux.org.uk>, Jan Kara <jack@...e.cz>,
Chuck Lever <chuck.lever@...cle.com>, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-nfs@...r.kernel.org
Subject: Re: [PATCH RFC 1/2] pidfs: rework inode number allocation
On Thu, Nov 28, 2024 at 1:34 PM Christian Brauner <brauner@...nel.org> wrote:
>
> Recently we received a patchset that aims to enable file handle encoding
> and decoding via name_to_handle_at(2) and open_by_handle_at(2).
>
> A crucical step in the patch series is how to go from inode number to
> struct pid without leaking information into unprivileged contexts. The
> issue is that in order to find a struct pid the pid number in the
> initial pid namespace must be encoded into the file handle via
> name_to_handle_at(2). This can be used by containers using a separate
> pid namespace to learn what the pid number of a given process in the
> initial pid namespace is. While this is a weak information leak it could
> be used in various exploits and in general is an ugly wart in the design.
>
> To solve this problem a new way is needed to lookup a struct pid based
> on the inode number allocated for that struct pid. The other part is to
> remove the custom inode number allocation on 32bit systems that is also
> an ugly wart that should go away.
>
> So, a new scheme is used that I was discusssing with Tejun some time
> back. A cyclic ida is used for the lower 32 bits and a the high 32 bits
> are used for the generation number. This gives a 64 bit inode number
> that is unique on both 32 bit and 64 bit. The lower 32 bit number is
> recycled slowly and can be used to lookup struct pids.
>
> Signed-off-by: Christian Brauner <brauner@...nel.org>
> ---
> fs/pidfs.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/pidfs.h | 2 ++
> kernel/pid.c | 11 +++---
> 3 files changed, 98 insertions(+), 7 deletions(-)
>
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 618abb1fa1b84cf31282c922374e28d60cd49d00..09a0c8ac805301927a94758b3f7d1e513826daf9 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -23,6 +23,88 @@
> #include "internal.h"
> #include "mount.h"
>
> +static u32 pidfs_ino_highbits;
> +static u32 pidfs_ino_last_ino_lowbits;
> +
> +static DEFINE_IDR(pidfs_ino_idr);
> +
> +static inline ino_t pidfs_ino(u64 ino)
> +{
> + /* On 32 bit low 32 bits are the inode. */
> + if (sizeof(ino_t) < sizeof(u64))
> + return (u32)ino;
> +
> + /* On 64 bit simply return ino. */
> + return ino;
> +}
> +
> +static inline u32 pidfs_gen(u64 ino)
> +{
> + /* On 32 bit the generation number are the upper 32 bits. */
> + if (sizeof(ino_t) < sizeof(u64))
> + return ino >> 32;
> +
> + /* On 64 bit the generation number is 1. */
> + return 1;
> +}
> +
> +/*
> + * Construct an inode number for struct pid in a way that we can use the
> + * lower 32bit to lookup struct pid independent of any pid numbers that
> + * could be leaked into userspace (e.g., via file handle encoding).
> + */
> +int pidfs_add_pid(struct pid *pid)
> +{
> + u32 ino_highbits;
> + int ret;
> +
> + ret = idr_alloc_cyclic(&pidfs_ino_idr, pid, 1, 0, GFP_ATOMIC);
> + if (ret >= 0 && ret < pidfs_ino_last_ino_lowbits)
> + pidfs_ino_highbits++;
> + ino_highbits = pidfs_ino_highbits;
> + pidfs_ino_last_ino_lowbits = ret;
> + if (ret < 0)
> + return ret;
> +
This code looks "too useful" to be in a random filesystem.
Maybe work a generation counter into idr_alloc_cyclic or
just let it let the caller know that the range has been cycled?
Only if this is not too complicated to do.
> + pid->ino = (u64)ino_highbits << 32 | ret;
> + pid->stashed = NULL;
> + return 0;
> +}
> +
> +void pidfs_remove_pid(struct pid *pid)
> +{
> + idr_remove(&pidfs_ino_idr, (u32)pidfs_ino(pid->ino));
> +}
> +
> +/* Find a struct pid based on the inode number. */
> +static __maybe_unused struct pid *pidfs_ino_get_pid(u64 ino)
> +{
> + ino_t pid_ino = pidfs_ino(ino);
> + u32 gen = pidfs_gen(ino);
> + struct pid *pid;
> +
> + guard(rcu)();
> +
> + /* Handle @pid lookup carefully so there's no risk of UAF. */
> + pid = idr_find(&pidfs_ino_idr, (u32)ino);
> + if (!pid)
> + return NULL;
> +
> + if (sizeof(ino_t) < sizeof(u64)) {
> + if (gen && pidfs_gen(pid->ino) != gen)
> + pid = NULL;
> + } else {
> + if (pidfs_ino(pid->ino) != pid_ino)
> + pid = NULL;
> + }
> +
> + /* Within our pid namespace hierarchy? */
> + if (pid_vnr(pid) == 0)
> + pid = NULL;
> +
> + return get_pid(pid);
> +}
> +
> #ifdef CONFIG_PROC_FS
> /**
> * pidfd_show_fdinfo - print information about a pidfd
> @@ -491,6 +573,16 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
>
> void __init pidfs_init(void)
> {
> + /*
> + * On 32 bit systems the lower 32 bits are the inode number and
> + * the higher 32 bits are the generation number. The starting
> + * value for the inode number and the generation number is one.
> + */
> + if (sizeof(ino_t) < sizeof(u64))
> + pidfs_ino_highbits = 1;
> + else
> + pidfs_ino_highbits = 0;
> +
> pidfs_mnt = kern_mount(&pidfs_type);
> if (IS_ERR(pidfs_mnt))
> panic("Failed to mount pidfs pseudo filesystem");
> diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
> index 75bdf9807802a5d1a9699c99aa42648c2bd34170..2958652bb108b8a2e02128e17317be4545b40a01 100644
> --- a/include/linux/pidfs.h
> +++ b/include/linux/pidfs.h
> @@ -4,5 +4,7 @@
>
> struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
> void __init pidfs_init(void);
> +int pidfs_add_pid(struct pid *pid);
> +void pidfs_remove_pid(struct pid *pid);
>
> #endif /* _LINUX_PID_FS_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 115448e89c3e9e664d0d51c8d853e8167ba0540c..9f321c6456d24af705c28f0256ca4de771f5e681 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT;
>
> int pid_max_min = RESERVED_PIDS + 1;
> int pid_max_max = PID_MAX_LIMIT;
> -/*
> - * Pseudo filesystems start inode numbering after one. We use Reserved
> - * PIDs as a natural offset.
> - */
> -static u64 pidfs_ino = RESERVED_PIDS;
>
> /*
> * PID-map pages start out as NULL, they get allocated upon
> @@ -157,6 +152,7 @@ void free_pid(struct pid *pid)
> }
>
> idr_remove(&ns->idr, upid->nr);
> + pidfs_remove_pid(pid);
> }
> spin_unlock_irqrestore(&pidmap_lock, flags);
>
> @@ -276,8 +272,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> spin_lock_irq(&pidmap_lock);
> if (!(ns->pid_allocated & PIDNS_ADDING))
> goto out_unlock;
> - pid->stashed = NULL;
> - pid->ino = ++pidfs_ino;
> + retval = pidfs_add_pid(pid);
I hope this does not create a scalability issue compared to the prev method?
> + if (retval)
> + goto out_unlock;
> for ( ; upid >= pid->numbers; --upid) {
> /* Make the PID visible to find_pid_ns. */
> idr_replace(&upid->ns->idr, pid, upid->nr);
>
> --
> 2.45.2
>
Nice idea!
If there is something wrong with it, then I could not find it ;)
Thanks,
Amir.
Powered by blists - more mailing lists