[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240516092213.6799-4-jcalmels@3xx0.net>
Date: Thu, 16 May 2024 02:22:05 -0700
From: Jonathan Calmels <jcalmels@...0.net>
To: brauner@...nel.org,
ebiederm@...ssion.com,
Luis Chamberlain <mcgrof@...nel.org>,
Kees Cook <keescook@...omium.org>,
Joel Granados <j.granados@...sung.com>,
Serge Hallyn <serge@...lyn.com>,
Paul Moore <paul@...l-moore.com>,
James Morris <jmorris@...ei.org>,
David Howells <dhowells@...hat.com>,
Jarkko Sakkinen <jarkko@...nel.org>
Cc: containers@...ts.linux.dev,
Jonathan Calmels <jcalmels@...0.net>,
linux-kernel@...r.kernel.org,
linux-fsdevel@...r.kernel.org,
linux-security-module@...r.kernel.org,
keyrings@...r.kernel.org
Subject: [PATCH 3/3] capabilities: add cap userns sysctl mask
This patch adds a new system-wide userns capability mask designed to mask
off capabilities in user namespaces.
This mask is controlled through a sysctl and can be set early in the boot
process or on the kernel command line to exclude known capabilities from
ever being gained in namespaces. Once set, it can be further restricted to
exert dynamic policies on the system (e.g. ward off a potential exploit).
Changing this mask requires privileges over CAP_SYS_ADMIN and CAP_SETPCAP
in the initial user namespace.
Example:
# sysctl -qw kernel.cap_userns_mask=0x1fffffdffff && \
unshare -r grep Cap /proc/self/status
CapInh: 0000000000000000
CapPrm: 000001fffffdffff
CapEff: 000001fffffdffff
CapBnd: 000001fffffdffff
CapAmb: 0000000000000000
CapUNs: 000001fffffdffff
Signed-off-by: Jonathan Calmels <jcalmels@...0.net>
---
include/linux/user_namespace.h | 7 ++++
kernel/sysctl.c | 10 ++++++
kernel/user_namespace.c | 66 ++++++++++++++++++++++++++++++++++
3 files changed, 83 insertions(+)
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 6030a8235617..e3478bd54ee5 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_USER_NAMESPACE_H
#define _LINUX_USER_NAMESPACE_H
+#include <linux/capability.h>
#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
@@ -14,6 +15,12 @@
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
#define UID_GID_MAP_MAX_EXTENTS 340
+#ifdef CONFIG_SYSCTL
+extern kernel_cap_t cap_userns_mask;
+int proc_cap_userns_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
struct uid_gid_extent {
u32 first;
u32 lower_first;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 81cc974913bb..1546eebd6aea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -62,6 +62,7 @@
#include <linux/sched/sysctl.h>
#include <linux/mount.h>
#include <linux/userfaultfd_k.h>
+#include <linux/user_namespace.h>
#include <linux/pid.h>
#include "../lib/kstrtox.h"
@@ -1846,6 +1847,15 @@ static struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_USER_NS
+ {
+ .procname = "cap_userns_mask",
+ .data = &cap_userns_mask,
+ .maxlen = sizeof(kernel_cap_t),
+ .mode = 0644,
+ .proc_handler = proc_cap_userns_handler,
+ },
+#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
.procname = "unknown_nmi_panic",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 53848e2b68cd..e0cf606e9140 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -26,6 +26,66 @@
static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
+#ifdef CONFIG_SYSCTL
+static DEFINE_SPINLOCK(cap_userns_lock);
+kernel_cap_t cap_userns_mask = CAP_FULL_SET;
+
+int proc_cap_userns_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long mask_array[2];
+ kernel_cap_t new_mask, *mask;
+ int err;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_ADMIN)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ *
+ * capabilities are exposed as one 64-bit value or two 32-bit values
+ * depending on the architecture
+ */
+ mask = table->data;
+ spin_lock(&cap_userns_lock);
+ mask_array[0] = (unsigned long) mask->val;
+#if BITS_PER_LONG != 64
+ mask_array[1] = mask->val >> BITS_PER_LONG;
+#endif
+ spin_unlock(&cap_userns_lock);
+
+ t = *table;
+ t.data = &mask_array;
+
+ /*
+ * actually read or write and array of ulongs from userspace. Remember
+ * these are least significant bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ new_mask.val = mask_array[0];
+#if BITS_PER_LONG != 64
+ new_mask.val += (u64)mask_array[1] << BITS_PER_LONG;
+#endif
+
+ /*
+ * Drop everything not in the new_mask (but don't add things)
+ */
+ if (write) {
+ spin_lock(&cap_userns_lock);
+ *mask = cap_intersect(*mask, new_mask);
+ spin_unlock(&cap_userns_lock);
+ }
+
+ return 0;
+}
+#endif
+
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
@@ -46,6 +106,12 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
/* Limit userns capabilities to our parent's bounding set. */
if (iscredsecure(cred, SECURE_USERNS_STRICT_CAPS))
cred->cap_userns = cap_intersect(cred->cap_userns, cred->cap_bset);
+#ifdef CONFIG_SYSCTL
+ /* Mask off userns capabilities that are not permitted by the system-wide mask. */
+ spin_lock(&cap_userns_lock);
+ cred->cap_userns = cap_intersect(cred->cap_userns, cap_userns_mask);
+ spin_unlock(&cap_userns_lock);
+#endif
/* Start with the capabilities defined in the userns set. */
cred->cap_bset = cred->cap_userns;
--
2.45.0
Powered by blists - more mailing lists