lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240520033829.GB1816262@mail.hallyn.com>
Date: Sun, 19 May 2024 22:38:29 -0500
From: "Serge E. Hallyn" <serge@...lyn.com>
To: Jonathan Calmels <jcalmels@...0.net>
Cc: brauner@...nel.org, ebiederm@...ssion.com,
	Luis Chamberlain <mcgrof@...nel.org>,
	Kees Cook <keescook@...omium.org>,
	Joel Granados <j.granados@...sung.com>,
	Serge Hallyn <serge@...lyn.com>, Paul Moore <paul@...l-moore.com>,
	James Morris <jmorris@...ei.org>,
	David Howells <dhowells@...hat.com>,
	Jarkko Sakkinen <jarkko@...nel.org>, containers@...ts.linux.dev,
	linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
	linux-security-module@...r.kernel.org, keyrings@...r.kernel.org
Subject: Re: [PATCH 3/3] capabilities: add cap userns sysctl mask

On Thu, May 16, 2024 at 02:22:05AM -0700, Jonathan Calmels wrote:
> This patch adds a new system-wide userns capability mask designed to mask
> off capabilities in user namespaces.
> 
> This mask is controlled through a sysctl and can be set early in the boot
> process or on the kernel command line to exclude known capabilities from
> ever being gained in namespaces. Once set, it can be further restricted to
> exert dynamic policies on the system (e.g. ward off a potential exploit).
> 
> Changing this mask requires privileges over CAP_SYS_ADMIN and CAP_SETPCAP
> in the initial user namespace.
> 
> Example:
> 
>     # sysctl -qw kernel.cap_userns_mask=0x1fffffdffff && \
>       unshare -r grep Cap /proc/self/status
>     CapInh: 0000000000000000
>     CapPrm: 000001fffffdffff
>     CapEff: 000001fffffdffff
>     CapBnd: 000001fffffdffff
>     CapAmb: 0000000000000000
>     CapUNs: 000001fffffdffff
> 
> Signed-off-by: Jonathan Calmels <jcalmels@...0.net>

Reviewed-by: Serge Hallyn <serge@...lyn.com>

> ---
>  include/linux/user_namespace.h |  7 ++++
>  kernel/sysctl.c                | 10 ++++++
>  kernel/user_namespace.c        | 66 ++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
> 
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index 6030a8235617..e3478bd54ee5 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -2,6 +2,7 @@
>  #ifndef _LINUX_USER_NAMESPACE_H
>  #define _LINUX_USER_NAMESPACE_H
>  
> +#include <linux/capability.h>
>  #include <linux/kref.h>
>  #include <linux/nsproxy.h>
>  #include <linux/ns_common.h>
> @@ -14,6 +15,12 @@
>  #define UID_GID_MAP_MAX_BASE_EXTENTS 5
>  #define UID_GID_MAP_MAX_EXTENTS 340
>  
> +#ifdef CONFIG_SYSCTL
> +extern kernel_cap_t cap_userns_mask;
> +int proc_cap_userns_handler(struct ctl_table *table, int write,
> +			    void *buffer, size_t *lenp, loff_t *ppos);
> +#endif
> +
>  struct uid_gid_extent {
>  	u32 first;
>  	u32 lower_first;
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 81cc974913bb..1546eebd6aea 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -62,6 +62,7 @@
>  #include <linux/sched/sysctl.h>
>  #include <linux/mount.h>
>  #include <linux/userfaultfd_k.h>
> +#include <linux/user_namespace.h>
>  #include <linux/pid.h>
>  
>  #include "../lib/kstrtox.h"
> @@ -1846,6 +1847,15 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0444,
>  		.proc_handler	= proc_dointvec,
>  	},
> +#ifdef CONFIG_USER_NS
> +	{
> +		.procname	= "cap_userns_mask",
> +		.data		= &cap_userns_mask,
> +		.maxlen		= sizeof(kernel_cap_t),
> +		.mode		= 0644,
> +		.proc_handler	= proc_cap_userns_handler,
> +	},
> +#endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
>  	{
>  		.procname       = "unknown_nmi_panic",
> diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
> index 53848e2b68cd..e0cf606e9140 100644
> --- a/kernel/user_namespace.c
> +++ b/kernel/user_namespace.c
> @@ -26,6 +26,66 @@
>  static struct kmem_cache *user_ns_cachep __ro_after_init;
>  static DEFINE_MUTEX(userns_state_mutex);
>  
> +#ifdef CONFIG_SYSCTL
> +static DEFINE_SPINLOCK(cap_userns_lock);
> +kernel_cap_t cap_userns_mask = CAP_FULL_SET;
> +
> +int proc_cap_userns_handler(struct ctl_table *table, int write,
> +			    void *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	struct ctl_table t;
> +	unsigned long mask_array[2];
> +	kernel_cap_t new_mask, *mask;
> +	int err;
> +
> +	if (write && (!capable(CAP_SETPCAP) ||
> +		      !capable(CAP_SYS_ADMIN)))
> +		return -EPERM;
> +
> +	/*
> +	 * convert from the global kernel_cap_t to the ulong array to print to
> +	 * userspace if this is a read.
> +	 *
> +	 * capabilities are exposed as one 64-bit value or two 32-bit values
> +	 * depending on the architecture
> +	 */
> +	mask = table->data;
> +	spin_lock(&cap_userns_lock);
> +	mask_array[0] = (unsigned long) mask->val;
> +#if BITS_PER_LONG != 64
> +	mask_array[1] = mask->val >> BITS_PER_LONG;
> +#endif
> +	spin_unlock(&cap_userns_lock);
> +
> +	t = *table;
> +	t.data = &mask_array;
> +
> +	/*
> +	 * actually read or write an array of ulongs from userspace.  Remember
> +	 * these are least significant bits first
> +	 */
> +	err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
> +	if (err < 0)
> +		return err;
> +
> +	new_mask.val = mask_array[0];
> +#if BITS_PER_LONG != 64
> +	new_mask.val += (u64)mask_array[1] << BITS_PER_LONG;
> +#endif
> +
> +	/*
> +	 * Drop everything not in the new_mask (but don't add things)
> +	 */
> +	if (write) {
> +		spin_lock(&cap_userns_lock);
> +		*mask = cap_intersect(*mask, new_mask);
> +		spin_unlock(&cap_userns_lock);
> +	}
> +
> +	return 0;
> +}
> +#endif
> +
>  static bool new_idmap_permitted(const struct file *file,
>  				struct user_namespace *ns, int cap_setid,
>  				struct uid_gid_map *map);
> @@ -46,6 +106,12 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
>  	/* Limit userns capabilities to our parent's bounding set. */
>  	if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
>  		cred->cap_userns = cap_intersect(cred->cap_userns, cred->cap_bset);
> +#ifdef CONFIG_SYSCTL
> +	/* Mask off userns capabilities that are not permitted by the system-wide mask. */
> +	spin_lock(&cap_userns_lock);
> +	cred->cap_userns = cap_intersect(cred->cap_userns, cap_userns_mask);
> +	spin_unlock(&cap_userns_lock);
> +#endif
>  
>  	/* Start with the capabilities defined in the userns set. */
>  	cred->cap_bset = cred->cap_userns;
> -- 
> 2.45.0
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ