[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2025-07-24.1753364886-sleek-serves-calm-fiat-canned-carnivals-jmzVpR@cyphar.com>
Date: Fri, 25 Jul 2025 12:13:03 +1000
From: Aleksa Sarai <cyphar@...har.com>
To: Christian Brauner <brauner@...nel.org>
Cc: Alexander Viro <viro@...iv.linux.org.uk>, Jan Kara <jack@...e.cz>,
Jonathan Corbet <corbet@....net>, Shuah Khan <shuah@...nel.org>, linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org, linux-api@...r.kernel.org, linux-doc@...r.kernel.org,
linux-kselftest@...r.kernel.org
Subject: Re: [PATCH RFC v2 2/4] procfs: add "pidns" mount option
On 2025-07-24, Christian Brauner <brauner@...nel.org> wrote:
> On Wed, Jul 23, 2025 at 09:18:52AM +1000, Aleksa Sarai wrote:
> > Since the introduction of pid namespaces, their interaction with procfs
> > has been entirely implicit in ways that require a lot of dancing around
> > by programs that need to construct sandboxes with different PID
> > namespaces.
> >
> > Being able to explicitly specify the pid namespace to use when
> > constructing a procfs super block will allow programs to no longer need
> > to fork off a process which does then does unshare(2) / setns(2) and
> > forks again in order to construct a procfs in a pidns.
> >
> > So, provide a "pidns" mount option which allows such users to just
> > explicitly state which pid namespace they want that procfs instance to
> > use. This interface can be used with fsconfig(2) either with a file
> > descriptor or a path:
> >
> > fsconfig(procfd, FSCONFIG_SET_FD, "pidns", NULL, nsfd);
> > fsconfig(procfd, FSCONFIG_SET_STRING, "pidns", "/proc/self/ns/pid", 0);
>
> Fwiw, namespace mount options could just be VFS generic mount options.
> But it's not something that we need to solve right now.
Yeah if we add this to sysfs it probably should be made generic, but
let's punt this to later. :D
> > or with classic mount(2) / mount(8):
> >
> > // mount -t proc -o pidns=/proc/self/ns/pid proc /tmp/proc
> > mount("proc", "/tmp/proc", "proc", MS_..., "pidns=/proc/self/ns/pid");
> >
> > As this new API is effectively shorthand for setns(2) followed by
> > mount(2), the permission model for this mirrors pidns_install() to avoid
> > opening up new attack surfaces by loosening the existing permission
> > model.
> >
> > Note that the mount infrastructure also allows userspace to reconfigure
> > the pidns of an existing procfs mount, which may or may not be useful to
> > some users.
> >
> > Signed-off-by: Aleksa Sarai <cyphar@...har.com>
> > ---
> > Documentation/filesystems/proc.rst | 6 +++
> > fs/proc/root.c | 90 +++++++++++++++++++++++++++++++++++---
> > 2 files changed, 90 insertions(+), 6 deletions(-)
> >
> > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> > index 5236cb52e357..c520b9f8a3fd 100644
> > --- a/Documentation/filesystems/proc.rst
> > +++ b/Documentation/filesystems/proc.rst
> > @@ -2360,6 +2360,7 @@ The following mount options are supported:
> > hidepid= Set /proc/<pid>/ access mode.
> > gid= Set the group authorized to learn processes information.
> > subset= Show only the specified subset of procfs.
> > + pidns= Specify a the namespace used by this procfs.
> > ========= ========================================================
> >
> > hidepid=off or hidepid=0 means classic mode - everybody may access all
> > @@ -2392,6 +2393,11 @@ information about processes information, just add identd to this group.
> > subset=pid hides all top level files and directories in the procfs that
> > are not related to tasks.
> >
> > +pidns= specifies a pid namespace (either as a string path to something like
> > +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
> > +will be used by the procfs instance when translating pids. By default, procfs
> > +will use the calling process's active pid namespace.
> > +
> > Chapter 5: Filesystem behavior
> > ==============================
> >
> > diff --git a/fs/proc/root.c b/fs/proc/root.c
> > index ed86ac710384..057c8a125c6e 100644
> > --- a/fs/proc/root.c
> > +++ b/fs/proc/root.c
> > @@ -38,12 +38,18 @@ enum proc_param {
> > Opt_gid,
> > Opt_hidepid,
> > Opt_subset,
> > +#ifdef CONFIG_PID_NS
> > + Opt_pidns,
> > +#endif
> > };
> >
> > static const struct fs_parameter_spec proc_fs_parameters[] = {
> > - fsparam_u32("gid", Opt_gid),
> > + fsparam_u32("gid", Opt_gid),
> > fsparam_string("hidepid", Opt_hidepid),
> > fsparam_string("subset", Opt_subset),
> > +#ifdef CONFIG_PID_NS
> > + fsparam_file_or_string("pidns", Opt_pidns),
> > +#endif
> > {}
> > };
> >
> > @@ -109,11 +115,67 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
> > return 0;
> > }
> >
> > +#ifdef CONFIG_PID_NS
> > +static int proc_parse_pidns_param(struct fs_context *fc,
> > + struct fs_parameter *param,
> > + struct fs_parse_result *result)
> > +{
> > + struct proc_fs_context *ctx = fc->fs_private;
> > + struct pid_namespace *target, *active = task_active_pid_ns(current);
> > + struct ns_common *ns;
> > + struct file *ns_filp __free(fput) = NULL;
> > +
> > + switch (param->type) {
> > + case fs_value_is_file:
> > + /* came throug fsconfig, steal the file reference */
> > + ns_filp = param->file;
> > + param->file = NULL;
>
> This can be shortened to:
>
> ns_filp = no_free_ptr(param->file);
I really need to take a closer look at <linux/cleanup.h>, each time I
look at it I learn about another handy helper.
> > + break;
> > + case fs_value_is_string:
> > + ns_filp = filp_open(param->string, O_RDONLY, 0);
> > + break;
> > + default:
> > + WARN_ON_ONCE(true);
> > + break;
> > + }
> > + if (!ns_filp)
> > + ns_filp = ERR_PTR(-EBADF);
> > + if (IS_ERR(ns_filp)) {
> > + errorfc(fc, "could not get file from pidns argument");
> > + return PTR_ERR(ns_filp);
> > + }
> > +
> > + if (!proc_ns_file(ns_filp))
> > + return invalfc(fc, "pidns argument is not an nsfs file");
> > + ns = get_proc_ns(file_inode(ns_filp));
> > + if (ns->ops->type != CLONE_NEWPID)
> > + return invalfc(fc, "pidns argument is not a pidns file");
> > + target = container_of(ns, struct pid_namespace, ns);
> > +
> > + /*
> > + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
> > + * permission model should be the same as pidns_install().
> > + */
> > + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
> > + errorfc(fc, "insufficient permissions to set pidns");
> > + return -EPERM;
> > + }
> > + if (!pidns_is_ancestor(target, active))
> > + return invalfc(fc, "cannot set pidns to non-descendant pidns");
>
> This made me think. If one rewrote this as:
>
> if (!ns_capable(task_active_pidns(current)->user_ns, CAP_SYS_ADMIN))
>
> if (!pidns_is_ancestor(target, active))
>
> that would also work, right? IOW, you'd be checking whether you're
> capable over your current pid namespace owning userns and if the target
> pidns is an ancestor it's also implied by the first check that you're
> capable over it.
>
> The only way this would not be true is if a descendant pidns would be
> owned by a userns over which you don't hold privileges and I wondered
> whether that's even possible? I don't think it is but maybe you see a
> way.
Well, if you run a setuid binary, it could create a pidns that is a
child but is owned by a more privileged userns than you. My main goal
here was to just mirror pidns_install() exactly, to make sure that the
permission model was identical.
> > +
> > + put_pid_ns(ctx->pid_ns);
> > + ctx->pid_ns = get_pid_ns(target);
> > + put_user_ns(fc->user_ns);
> > + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
> > + return 0;
> > +}
> > +#endif /* CONFIG_PID_NS */
> > +
> > static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
> > {
> > struct proc_fs_context *ctx = fc->fs_private;
> > struct fs_parse_result result;
> > - int opt;
> > + int opt, err;
> >
> > opt = fs_parse(fc, proc_fs_parameters, param, &result);
> > if (opt < 0)
> > @@ -125,14 +187,24 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
> > break;
> >
> > case Opt_hidepid:
> > - if (proc_parse_hidepid_param(fc, param))
> > - return -EINVAL;
> > + err = proc_parse_hidepid_param(fc, param);
> > + if (err)
> > + return err;
> > break;
> >
> > case Opt_subset:
> > - if (proc_parse_subset_param(fc, param->string) < 0)
> > - return -EINVAL;
> > + err = proc_parse_subset_param(fc, param->string);
> > + if (err)
> > + return err;
> > + break;
> > +
> > +#ifdef CONFIG_PID_NS
> > + case Opt_pidns:
>
> I think it would be easier if we returned EOPNOTSUPP when !CONFIG_PID_NS
> instead of EINVALing this?
>
> > + err = proc_parse_pidns_param(fc, param, &result);
> > + if (err)
> > + return err;
> > break;
> > +#endif
> >
> > default:
> > return -EINVAL;
> > @@ -154,6 +226,12 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
> > fs_info->hide_pid = ctx->hidepid;
> > if (ctx->mask & (1 << Opt_subset))
> > fs_info->pidonly = ctx->pidonly;
> > +#ifdef CONFIG_PID_NS
> > + if (ctx->mask & (1 << Opt_pidns)) {
> > + put_pid_ns(fs_info->pid_ns);
> > + fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
> > + }
> > +#endif
> > }
> >
> > static int proc_fill_super(struct super_block *s, struct fs_context *fc)
> >
> > --
> > 2.50.0
> >
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
https://www.cyphar.com/
Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)
Powered by blists - more mailing lists