lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250306-esskultur-sitzheizung-d482c4a35f80@brauner>
Date: Thu, 6 Mar 2025 09:59:13 +0100
From: Christian Brauner <brauner@...nel.org>
To: Michal Koutný <mkoutny@...e.com>
Cc: Alexander Mikhalitsyn <alexander@...alicyn.com>, 
	linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org, 
	linux-trace-kernel@...r.kernel.org, Jonathan Corbet <corbet@....net>, Kees Cook <kees@...nel.org>, 
	Andrew Morton <akpm@...ux-foundation.org>, "Eric W . Biederman" <ebiederm@...ssion.com>, 
	Oleg Nesterov <oleg@...hat.com>
Subject: Re: [PATCH 2/2] pid: Optional first-fit pid allocation

On Fri, Feb 21, 2025 at 06:02:49PM +0100, Michal Koutný wrote:
> Noone would need to use this allocation strategy (it's slower, pid
> numbers collide sooner). Its primary purpose are pid namespaces in
> conjunction with pids.max cgroup limit which keeps (virtual) pid numbers
> below the given limit. This is for 32-bit userspace programs that may
> not work well with pid numbers above 65536.
> 
> Link: https://lore.kernel.org/r/20241122132459.135120-1-aleksandr.mikhalitsyn@canonical.com/
> Signed-off-by: Michal Koutný <mkoutny@...e.com>
> ---
>  Documentation/admin-guide/sysctl/kernel.rst |  2 ++
>  include/linux/pid_namespace.h               |  3 +++
>  kernel/pid.c                                | 12 +++++++--
>  kernel/pid_namespace.c                      | 28 +++++++++++++++------
>  4 files changed, 36 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index a43b78b4b6464..f5e68d1c8849f 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1043,6 +1043,8 @@ The last pid allocated in the current (the one task using this sysctl
>  lives in) pid namespace. When selecting a pid for a next task on fork
>  kernel tries to allocate a number starting from this one.
>  
> +When set to -1, first-fit pid numbering is used instead of the next-fit.

I strongly disagree with this approach. This is way worse then making
pid_max per pid namespace.

I'm fine if you come up with something else that's purely based on
cgroups somehow and is uniform across 64-bit and 32-bit. Allowing to
change the pid allocation strategy just for 32-bit is not the solution
and not mergable.

> +
>  
>  powersave-nap (PPC only)
>  ========================
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index f9f9931e02d6a..10bf66ca78590 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -41,6 +41,9 @@ struct pid_namespace {
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>  	int memfd_noexec_scope;
>  #endif
> +#ifdef CONFIG_IA32_EMULATION
> +	bool pid_noncyclic;
> +#endif
>  } __randomize_layout;
>  
>  extern struct pid_namespace init_pid_ns;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index aa2a7d4da4555..e9da1662b8821 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -191,6 +191,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>  
>  	for (i = ns->level; i >= 0; i--) {
>  		int tid = 0;
> +		bool pid_noncyclic = 0;
> +#ifdef CONFIG_IA32_EMULATION
> +		pid_noncyclic = READ_ONCE(tmp->pid_noncyclic);
> +#endif
>  
>  		if (set_tid_size) {
>  			tid = set_tid[ns->level - i];
> @@ -235,8 +239,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>  			 * Store a null pointer so find_pid_ns does not find
>  			 * a partially initialized PID (see below).
>  			 */
> -			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> -					      pid_max, GFP_ATOMIC);
> +			if (likely(!pid_noncyclic))
> +				nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> +						      pid_max, GFP_ATOMIC);
> +			else
> +				nr = idr_alloc(&tmp->idr, NULL, pid_min,
> +						      pid_max, GFP_ATOMIC);
>  		}
>  		spin_unlock_irq(&pidmap_lock);
>  		idr_preload_end();
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 0f23285be4f92..ceda94a064294 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -113,6 +113,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
>  	ns->pid_allocated = PIDNS_ADDING;
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>  	ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> +#endif
> +#ifdef CONFIG_IA32_EMULATION
> +	ns->pid_noncyclic = READ_ONCE(parent_pid_ns->pid_noncyclic);
>  #endif
>  	return ns;
>  
> @@ -260,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>  	return;
>  }
>  
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
>  static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>  		void *buffer, size_t *lenp, loff_t *ppos)
>  {
> @@ -271,12 +274,23 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>  	if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
>  		return -EPERM;
>  
> -	next = idr_get_cursor(&pid_ns->idr) - 1;
> +	next = -1;
> +#ifdef CONFIG_IA32_EMULATION
> +	if (!pid_ns->pid_noncyclic)
> +#endif
> +		next += idr_get_cursor(&pid_ns->idr);
>  
>  	tmp.data = &next;
>  	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> -	if (!ret && write)
> -		idr_set_cursor(&pid_ns->idr, next + 1);
> +	if (!ret && write) {
> +		if (next > -1)
> +			idr_set_cursor(&pid_ns->idr, next + 1);
> +		else if (!IS_ENABLED(CONFIG_IA32_EMULATION))
> +			ret = -EINVAL;
> +#ifdef CONFIG_IA32_EMULATION
> +		WRITE_ONCE(pid_ns->pid_noncyclic, next == -1);
> +#endif
> +	}
>  
>  	return ret;
>  }
> @@ -288,11 +302,11 @@ static const struct ctl_table pid_ns_ctl_table[] = {
>  		.maxlen = sizeof(int),
>  		.mode = 0666, /* permissions are checked in the handler */
>  		.proc_handler = pid_ns_ctl_handler,
> -		.extra1 = SYSCTL_ZERO,
> +		.extra1 = SYSCTL_NEG_ONE,
>  		.extra2 = &pid_max,
>  	},
>  };
> -#endif	/* CONFIG_CHECKPOINT_RESTORE */
> +#endif	/* CONFIG_CHECKPOINT_RESTORE || CONFIG_IA32_EMULATION */
>  
>  int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>  {
> @@ -449,7 +463,7 @@ static __init int pid_namespaces_init(void)
>  {
>  	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
>  
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
>  	register_sysctl_init("kernel", pid_ns_ctl_table);
>  #endif
>  
> -- 
> 2.48.1
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ