lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAJqdLrpjx6nRRMO+349T8W4xFyoYav7N+ys+AvLGWFsOLfsHeg@mail.gmail.com>
Date: Tue, 25 Feb 2025 18:30:55 +0100
From: Alexander Mikhalitsyn <alexander@...alicyn.com>
To: Michal Koutný <mkoutny@...e.com>
Cc: Christian Brauner <brauner@...nel.org>, linux-doc@...r.kernel.org, 
	linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org, 
	linux-trace-kernel@...r.kernel.org, Jonathan Corbet <corbet@....net>, 
	Kees Cook <kees@...nel.org>, Andrew Morton <akpm@...ux-foundation.org>, 
	"Eric W . Biederman" <ebiederm@...ssion.com>, Oleg Nesterov <oleg@...hat.com>
Subject: Re: [PATCH 2/2] pid: Optional first-fit pid allocation

Am Fr., 21. Feb. 2025 um 18:02 Uhr schrieb Michal Koutný <mkoutny@...e.com>:
>
> Noone would need to use this allocation strategy (it's slower, pid
> numbers collide sooner). Its primary purpose are pid namespaces in
> conjunction with pids.max cgroup limit which keeps (virtual) pid numbers
> below the given limit. This is for 32-bit userspace programs that may
> not work well with pid numbers above 65536.
>
> Link: https://lore.kernel.org/r/20241122132459.135120-1-aleksandr.mikhalitsyn@canonical.com/
> Signed-off-by: Michal Koutný <mkoutny@...e.com>

Dear Michal,

sorry for such a long delay with reply on your patches.

> ---
>  Documentation/admin-guide/sysctl/kernel.rst |  2 ++
>  include/linux/pid_namespace.h               |  3 +++
>  kernel/pid.c                                | 12 +++++++--
>  kernel/pid_namespace.c                      | 28 +++++++++++++++------
>  4 files changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index a43b78b4b6464..f5e68d1c8849f 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1043,6 +1043,8 @@ The last pid allocated in the current (the one task using this sysctl
>  lives in) pid namespace. When selecting a pid for a next task on fork
>  kernel tries to allocate a number starting from this one.
>
> +When set to -1, first-fit pid numbering is used instead of the next-fit.
> +
>
>  powersave-nap (PPC only)
>  ========================
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index f9f9931e02d6a..10bf66ca78590 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -41,6 +41,9 @@ struct pid_namespace {
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>         int memfd_noexec_scope;
>  #endif
> +#ifdef CONFIG_IA32_EMULATION

Unfortunately, this does not work for our use case as it's x86-specific.

In the original cover letter [1] it was written:

>In any case, there are workloads that have expections about how large
>pid numbers they accept. Either for historical reasons or architectural
>reasons. One concreate example is the 32-bit version of Android's bionic
>libc which requires pid numbers less than 65536. There are workloads
>where it is run in a 32-bit container on a 64-bit kernel. If the host

And I have just confirmed with folks from Canonical, who work on Anbox
(Android in container project),
that they use Arm machines (both armhf/arm64). And one of the reasons
to add this feature is to
make legacy 32-bit Android Bionic libc to work [2].

[1] https://lore.kernel.org/all/20241122132459.135120-1-aleksandr.mikhalitsyn@canonical.com/
[2] https://android.googlesource.com/platform/bionic.git/+/HEAD/docs/32-bit-abi.md#is-too-small-for-large-pids

Kind regards,
Alex

> +       bool pid_noncyclic;
> +#endif
>  } __randomize_layout;
>
>  extern struct pid_namespace init_pid_ns;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index aa2a7d4da4555..e9da1662b8821 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -191,6 +191,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>
>         for (i = ns->level; i >= 0; i--) {
>                 int tid = 0;
> +               bool pid_noncyclic = 0;
> +#ifdef CONFIG_IA32_EMULATION
> +               pid_noncyclic = READ_ONCE(tmp->pid_noncyclic);
> +#endif
>
>                 if (set_tid_size) {
>                         tid = set_tid[ns->level - i];
> @@ -235,8 +239,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>                          * Store a null pointer so find_pid_ns does not find
>                          * a partially initialized PID (see below).
>                          */
> -                       nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> -                                             pid_max, GFP_ATOMIC);
> +                       if (likely(!pid_noncyclic))
> +                               nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> +                                                     pid_max, GFP_ATOMIC);
> +                       else
> +                               nr = idr_alloc(&tmp->idr, NULL, pid_min,
> +                                                     pid_max, GFP_ATOMIC);
>                 }
>                 spin_unlock_irq(&pidmap_lock);
>                 idr_preload_end();
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 0f23285be4f92..ceda94a064294 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -113,6 +113,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>         ns->pid_allocated = PIDNS_ADDING;
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>         ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> +#endif
> +#ifdef CONFIG_IA32_EMULATION
> +       ns->pid_noncyclic = READ_ONCE(parent_pid_ns->pid_noncyclic);
>  #endif
>         return ns;
>
> @@ -260,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>         return;
>  }
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
>  static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>                 void *buffer, size_t *lenp, loff_t *ppos)
>  {
> @@ -271,12 +274,23 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>         if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
>                 return -EPERM;
>
> -       next = idr_get_cursor(&pid_ns->idr) - 1;
> +       next = -1;
> +#ifdef CONFIG_IA32_EMULATION
> +       if (!pid_ns->pid_noncyclic)
> +#endif
> +               next += idr_get_cursor(&pid_ns->idr);
>
>         tmp.data = &next;
>         ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> -       if (!ret && write)
> -               idr_set_cursor(&pid_ns->idr, next + 1);
> +       if (!ret && write) {
> +               if (next > -1)
> +                       idr_set_cursor(&pid_ns->idr, next + 1);
> +               else if (!IS_ENABLED(CONFIG_IA32_EMULATION))
> +                       ret = -EINVAL;
> +#ifdef CONFIG_IA32_EMULATION
> +               WRITE_ONCE(pid_ns->pid_noncyclic, next == -1);
> +#endif
> +       }
>
>         return ret;
>  }
> @@ -288,11 +302,11 @@ static const struct ctl_table pid_ns_ctl_table[] = {
>                 .maxlen = sizeof(int),
>                 .mode = 0666, /* permissions are checked in the handler */
>                 .proc_handler = pid_ns_ctl_handler,
> -               .extra1 = SYSCTL_ZERO,
> +               .extra1 = SYSCTL_NEG_ONE,
>                 .extra2 = &pid_max,
>         },
>  };
> -#endif /* CONFIG_CHECKPOINT_RESTORE */
> +#endif /* CONFIG_CHECKPOINT_RESTORE || CONFIG_IA32_EMULATION */
>
>  int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>  {
> @@ -449,7 +463,7 @@ static __init int pid_namespaces_init(void)
>  {
>         pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
>
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
>         register_sysctl_init("kernel", pid_ns_ctl_table);
>  #endif
>
> --
> 2.48.1
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ