[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250306-esskultur-sitzheizung-d482c4a35f80@brauner>
Date: Thu, 6 Mar 2025 09:59:13 +0100
From: Christian Brauner <brauner@...nel.org>
To: Michal Koutný <mkoutny@...e.com>
Cc: Alexander Mikhalitsyn <alexander@...alicyn.com>,
linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org, Jonathan Corbet <corbet@....net>, Kees Cook <kees@...nel.org>,
Andrew Morton <akpm@...ux-foundation.org>, "Eric W . Biederman" <ebiederm@...ssion.com>,
Oleg Nesterov <oleg@...hat.com>
Subject: Re: [PATCH 2/2] pid: Optional first-fit pid allocation
On Fri, Feb 21, 2025 at 06:02:49PM +0100, Michal Koutný wrote:
> Noone would need to use this allocation strategy (it's slower, pid
> numbers collide sooner). Its primary purpose are pid namespaces in
> conjunction with pids.max cgroup limit which keeps (virtual) pid numbers
> below the given limit. This is for 32-bit userspace programs that may
> not work well with pid numbers above 65536.
>
> Link: https://lore.kernel.org/r/20241122132459.135120-1-aleksandr.mikhalitsyn@canonical.com/
> Signed-off-by: Michal Koutný <mkoutny@...e.com>
> ---
> Documentation/admin-guide/sysctl/kernel.rst | 2 ++
> include/linux/pid_namespace.h | 3 +++
> kernel/pid.c | 12 +++++++--
> kernel/pid_namespace.c | 28 +++++++++++++++------
> 4 files changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index a43b78b4b6464..f5e68d1c8849f 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1043,6 +1043,8 @@ The last pid allocated in the current (the one task using this sysctl
> lives in) pid namespace. When selecting a pid for a next task on fork
> kernel tries to allocate a number starting from this one.
>
> +When set to -1, first-fit pid numbering is used instead of the next-fit.
I strongly disagree with this approach. This is way worse then making
pid_max per pid namespace.
I'm fine if you come up with something else that's purely based on
cgroups somehow and is uniform across 64-bit and 32-bit. Allowing to
change the pid allocation strategy just for 32-bit is not the solution
and not mergable.
> +
>
> powersave-nap (PPC only)
> ========================
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index f9f9931e02d6a..10bf66ca78590 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -41,6 +41,9 @@ struct pid_namespace {
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> int memfd_noexec_scope;
> #endif
> +#ifdef CONFIG_IA32_EMULATION
> + bool pid_noncyclic;
> +#endif
> } __randomize_layout;
>
> extern struct pid_namespace init_pid_ns;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index aa2a7d4da4555..e9da1662b8821 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -191,6 +191,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>
> for (i = ns->level; i >= 0; i--) {
> int tid = 0;
> + bool pid_noncyclic = 0;
> +#ifdef CONFIG_IA32_EMULATION
> + pid_noncyclic = READ_ONCE(tmp->pid_noncyclic);
> +#endif
>
> if (set_tid_size) {
> tid = set_tid[ns->level - i];
> @@ -235,8 +239,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> * Store a null pointer so find_pid_ns does not find
> * a partially initialized PID (see below).
> */
> - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + if (likely(!pid_noncyclic))
> + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> + pid_max, GFP_ATOMIC);
> + else
> + nr = idr_alloc(&tmp->idr, NULL, pid_min,
> + pid_max, GFP_ATOMIC);
> }
> spin_unlock_irq(&pidmap_lock);
> idr_preload_end();
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 0f23285be4f92..ceda94a064294 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -113,6 +113,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> ns->pid_allocated = PIDNS_ADDING;
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> +#endif
> +#ifdef CONFIG_IA32_EMULATION
> + ns->pid_noncyclic = READ_ONCE(parent_pid_ns->pid_noncyclic);
> #endif
> return ns;
>
> @@ -260,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> return;
> }
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
> static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
> void *buffer, size_t *lenp, loff_t *ppos)
> {
> @@ -271,12 +274,23 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
> if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
> return -EPERM;
>
> - next = idr_get_cursor(&pid_ns->idr) - 1;
> + next = -1;
> +#ifdef CONFIG_IA32_EMULATION
> + if (!pid_ns->pid_noncyclic)
> +#endif
> + next += idr_get_cursor(&pid_ns->idr);
>
> tmp.data = &next;
> ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> - if (!ret && write)
> - idr_set_cursor(&pid_ns->idr, next + 1);
> + if (!ret && write) {
> + if (next > -1)
> + idr_set_cursor(&pid_ns->idr, next + 1);
> + else if (!IS_ENABLED(CONFIG_IA32_EMULATION))
> + ret = -EINVAL;
> +#ifdef CONFIG_IA32_EMULATION
> + WRITE_ONCE(pid_ns->pid_noncyclic, next == -1);
> +#endif
> + }
>
> return ret;
> }
> @@ -288,11 +302,11 @@ static const struct ctl_table pid_ns_ctl_table[] = {
> .maxlen = sizeof(int),
> .mode = 0666, /* permissions are checked in the handler */
> .proc_handler = pid_ns_ctl_handler,
> - .extra1 = SYSCTL_ZERO,
> + .extra1 = SYSCTL_NEG_ONE,
> .extra2 = &pid_max,
> },
> };
> -#endif /* CONFIG_CHECKPOINT_RESTORE */
> +#endif /* CONFIG_CHECKPOINT_RESTORE || CONFIG_IA32_EMULATION */
>
> int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
> {
> @@ -449,7 +463,7 @@ static __init int pid_namespaces_init(void)
> {
> pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
> register_sysctl_init("kernel", pid_ns_ctl_table);
> #endif
>
> --
> 2.48.1
>
Powered by blists - more mailing lists