[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAJqdLrpjx6nRRMO+349T8W4xFyoYav7N+ys+AvLGWFsOLfsHeg@mail.gmail.com>
Date: Tue, 25 Feb 2025 18:30:55 +0100
From: Alexander Mikhalitsyn <alexander@...alicyn.com>
To: Michal Koutný <mkoutny@...e.com>
Cc: Christian Brauner <brauner@...nel.org>, linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-trace-kernel@...r.kernel.org, Jonathan Corbet <corbet@....net>,
Kees Cook <kees@...nel.org>, Andrew Morton <akpm@...ux-foundation.org>,
"Eric W . Biederman" <ebiederm@...ssion.com>, Oleg Nesterov <oleg@...hat.com>
Subject: Re: [PATCH 2/2] pid: Optional first-fit pid allocation
Am Fr., 21. Feb. 2025 um 18:02 Uhr schrieb Michal Koutný <mkoutny@...e.com>:
>
> Noone would need to use this allocation strategy (it's slower, pid
> numbers collide sooner). Its primary purpose are pid namespaces in
> conjunction with pids.max cgroup limit which keeps (virtual) pid numbers
> below the given limit. This is for 32-bit userspace programs that may
> not work well with pid numbers above 65536.
>
> Link: https://lore.kernel.org/r/20241122132459.135120-1-aleksandr.mikhalitsyn@canonical.com/
> Signed-off-by: Michal Koutný <mkoutny@...e.com>
Dear Michal,
sorry for such a long delay with reply on your patches.
> ---
> Documentation/admin-guide/sysctl/kernel.rst | 2 ++
> include/linux/pid_namespace.h | 3 +++
> kernel/pid.c | 12 +++++++--
> kernel/pid_namespace.c | 28 +++++++++++++++------
> 4 files changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index a43b78b4b6464..f5e68d1c8849f 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1043,6 +1043,8 @@ The last pid allocated in the current (the one task using this sysctl
> lives in) pid namespace. When selecting a pid for a next task on fork
> kernel tries to allocate a number starting from this one.
>
> +When set to -1, first-fit pid numbering is used instead of the next-fit.
> +
>
> powersave-nap (PPC only)
> ========================
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index f9f9931e02d6a..10bf66ca78590 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -41,6 +41,9 @@ struct pid_namespace {
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> int memfd_noexec_scope;
> #endif
> +#ifdef CONFIG_IA32_EMULATION
Unfortunately, this does not work for our use case as it's x86-specific.
In the original cover letter [1] it was written:
>In any case, there are workloads that have expections about how large
>pid numbers they accept. Either for historical reasons or architectural
>reasons. One concreate example is the 32-bit version of Android's bionic
>libc which requires pid numbers less than 65536. There are workloads
>where it is run in a 32-bit container on a 64-bit kernel. If the host
And I have just confirmed with folks from Canonical, who work on Anbox
(Android in container project),
that they use Arm machines (both armhf/arm64). And one of the reasons
to add this feature is to
make legacy 32-bit Android Bionic libc to work [2].
[1] https://lore.kernel.org/all/20241122132459.135120-1-aleksandr.mikhalitsyn@canonical.com/
[2] https://android.googlesource.com/platform/bionic.git/+/HEAD/docs/32-bit-abi.md#is-too-small-for-large-pids
Kind regards,
Alex
> + bool pid_noncyclic;
> +#endif
> } __randomize_layout;
>
> extern struct pid_namespace init_pid_ns;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index aa2a7d4da4555..e9da1662b8821 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -191,6 +191,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>
> for (i = ns->level; i >= 0; i--) {
> int tid = 0;
> + bool pid_noncyclic = 0;
> +#ifdef CONFIG_IA32_EMULATION
> + pid_noncyclic = READ_ONCE(tmp->pid_noncyclic);
> +#endif
>
> if (set_tid_size) {
> tid = set_tid[ns->level - i];
> @@ -235,8 +239,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
> * Store a null pointer so find_pid_ns does not find
> * a partially initialized PID (see below).
> */
> - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> - pid_max, GFP_ATOMIC);
> + if (likely(!pid_noncyclic))
> + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> + pid_max, GFP_ATOMIC);
> + else
> + nr = idr_alloc(&tmp->idr, NULL, pid_min,
> + pid_max, GFP_ATOMIC);
> }
> spin_unlock_irq(&pidmap_lock);
> idr_preload_end();
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 0f23285be4f92..ceda94a064294 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -113,6 +113,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
> ns->pid_allocated = PIDNS_ADDING;
> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
> ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> +#endif
> +#ifdef CONFIG_IA32_EMULATION
> + ns->pid_noncyclic = READ_ONCE(parent_pid_ns->pid_noncyclic);
> #endif
> return ns;
>
> @@ -260,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> return;
> }
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
> static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
> void *buffer, size_t *lenp, loff_t *ppos)
> {
> @@ -271,12 +274,23 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
> if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
> return -EPERM;
>
> - next = idr_get_cursor(&pid_ns->idr) - 1;
> + next = -1;
> +#ifdef CONFIG_IA32_EMULATION
> + if (!pid_ns->pid_noncyclic)
> +#endif
> + next += idr_get_cursor(&pid_ns->idr);
>
> tmp.data = &next;
> ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> - if (!ret && write)
> - idr_set_cursor(&pid_ns->idr, next + 1);
> + if (!ret && write) {
> + if (next > -1)
> + idr_set_cursor(&pid_ns->idr, next + 1);
> + else if (!IS_ENABLED(CONFIG_IA32_EMULATION))
> + ret = -EINVAL;
> +#ifdef CONFIG_IA32_EMULATION
> + WRITE_ONCE(pid_ns->pid_noncyclic, next == -1);
> +#endif
> + }
>
> return ret;
> }
> @@ -288,11 +302,11 @@ static const struct ctl_table pid_ns_ctl_table[] = {
> .maxlen = sizeof(int),
> .mode = 0666, /* permissions are checked in the handler */
> .proc_handler = pid_ns_ctl_handler,
> - .extra1 = SYSCTL_ZERO,
> + .extra1 = SYSCTL_NEG_ONE,
> .extra2 = &pid_max,
> },
> };
> -#endif /* CONFIG_CHECKPOINT_RESTORE */
> +#endif /* CONFIG_CHECKPOINT_RESTORE || CONFIG_IA32_EMULATION */
>
> int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
> {
> @@ -449,7 +463,7 @@ static __init int pid_namespaces_init(void)
> {
> pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
> register_sysctl_init("kernel", pid_ns_ctl_table);
> #endif
>
> --
> 2.48.1
>
Powered by blists - more mailing lists