lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <c14518ba563d4c6bb75b9fac63b69cd4c82f9dcc.1600951211.git.yifeifz2@illinois.edu>
Date:   Thu, 24 Sep 2020 07:44:18 -0500
From:   YiFei Zhu <zhuyifei1999@...il.com>
To:     containers@...ts.linux-foundation.org
Cc:     YiFei Zhu <yifeifz2@...inois.edu>, bpf@...r.kernel.org,
        linux-kernel@...r.kernel.org, Aleksa Sarai <cyphar@...har.com>,
        Andrea Arcangeli <aarcange@...hat.com>,
        Andy Lutomirski <luto@...capital.net>,
        Dimitrios Skarlatos <dskarlat@...cmu.edu>,
        Giuseppe Scrivano <gscrivan@...hat.com>,
        Hubertus Franke <frankeh@...ibm.com>,
        Jack Chen <jianyan2@...inois.edu>,
        Jann Horn <jannh@...gle.com>,
        Josep Torrellas <torrella@...inois.edu>,
        Kees Cook <keescook@...omium.org>,
        Tianyin Xu <tyxu@...inois.edu>,
        Tobin Feldman-Fitzthum <tobin@....com>,
        Tycho Andersen <tycho@...ho.pizza>,
        Valentin Rothberg <vrothber@...hat.com>,
        Will Drewry <wad@...omium.org>
Subject: [PATCH v2 seccomp 3/6] seccomp/cache: Add "emulator" to check if filter is arg-dependent

From: YiFei Zhu <yifeifz2@...inois.edu>

SECCOMP_CACHE_NR_ONLY will only operate on syscalls that do not
access any syscall arguments or instruction pointer. To facilitate
this we need a static analyser to know whether a filter will
return allow regardless of syscall arguments for a given
architecture number / syscall number pair. This is implemented
here with a pseudo-emulator, and stored in a per-filter bitmap.

Each common BPF instruction (stolen from Kees's list [1]) are
emulated. Any weirdness or loading from a syscall argument will
cause the emulator to bail.

The emulation is also halted if it reaches a return. In that case,
if it returns an SECCOMP_RET_ALLOW, the syscall is marked as good.

Filter dependency is resolved at attach time. If a filter depends
on more filters, then we perform an and on its bitmask against its
dependee; if the dependee does not guarantee to allow the syscall,
then the depender is also marked not to guarantee to allow the
syscall.

[1] https://lore.kernel.org/lkml/20200923232923.3142503-5-keescook@chromium.org/

Signed-off-by: YiFei Zhu <yifeifz2@...inois.edu>
---
 arch/Kconfig     |  25 ++++++
 kernel/seccomp.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 218 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 6dfc5673215d..8cc3dc87f253 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -489,6 +489,31 @@ config SECCOMP_FILTER
 
 	  See Documentation/userspace-api/seccomp_filter.rst for details.
 
+choice
+	prompt "Seccomp filter cache"
+	default SECCOMP_CACHE_NONE
+	depends on SECCOMP_FILTER
+	help
+	  Seccomp filters can potentially incur large overhead for each
+	  system call. This can alleviate some of the overhead.
+
+	  If in doubt, select 'syscall numbers only'.
+
+config SECCOMP_CACHE_NONE
+	bool "None"
+	help
+	  No caching is done. Seccomp filters will be called each time
+	  a system call occurs in a seccomp-guarded task.
+
+config SECCOMP_CACHE_NR_ONLY
+	bool "Syscall number only"
+	depends on !HAVE_SPARSE_SYSCALL_NR
+	help
+	  For each syscall number, if the seccomp filter has a fixed
+	  result, store that result in a bitmap to speed up system calls.
+
+endchoice
+
 config HAVE_ARCH_STACKLEAK
 	bool
 	help
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3ee59ce0a323..20d33378a092 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,6 +143,32 @@ struct notification {
 	struct list_head notifications;
 };
 
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * struct seccomp_cache_filter_data - container for cache's per-filter data
+ *
+ * @syscall_ok: A bitmap for each architecture number, where each bit
+ *		represents whether the filter will always allow the syscall.
+ */
+struct seccomp_cache_filter_data {
+	DECLARE_BITMAP(syscall_ok[ARRAY_SIZE(syscall_arches)], NR_syscalls);
+};
+
+#define SECCOMP_EMU_MAX_PENDING_STATES 64
+#else
+struct seccomp_cache_filter_data { };
+
+static inline int seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+	return 0;
+}
+
+static inline void seccomp_cache_inherit(struct seccomp_filter *sfilter,
+					 const struct seccomp_filter *prev)
+{
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
 /**
  * struct seccomp_filter - container for seccomp BPF programs
  *
@@ -185,6 +211,7 @@ struct seccomp_filter {
 	struct notification *notif;
 	struct mutex notify_lock;
 	wait_queue_head_t wqh;
+	struct seccomp_cache_filter_data cache;
 };
 
 /* Limit any path through the tree to 256KB worth of instructions. */
@@ -530,6 +557,139 @@ static inline void seccomp_sync_threads(unsigned long flags)
 	}
 }
 
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * struct seccomp_emu_env - container for seccomp emulator environment
+ *
+ * @filter: The cBPF filter instructions.
+ * @nr: The syscall number we are emulating.
+ * @arch: The architecture number we are emulating.
+ * @syscall_ok: Emulation result, whether it is okay for seccomp to cache the
+ *		syscall.
+ */
+struct seccomp_emu_env {
+	struct sock_filter *filter;
+	int arch;
+	int nr;
+	bool syscall_ok;
+};
+
+/**
+ * struct seccomp_emu_state - container for seccomp emulator state
+ *
+ * @next: The next pending state. This structure is a linked list.
+ * @pc: The current program counter.
+ * @areg: the value of that A register.
+ */
+struct seccomp_emu_state {
+	struct seccomp_emu_state *next;
+	int pc;
+	u32 areg;
+};
+
+/**
+ * seccomp_emu_step - step one instruction in the emulator
+ * @env: The emulator environment
+ * @state: The emulator state
+ *
+ * Returns 1 to halt emulation, 0 to continue, or -errno if error occurred.
+ */
+static int seccomp_emu_step(struct seccomp_emu_env *env,
+			    struct seccomp_emu_state *state)
+{
+	struct sock_filter *ftest = &env->filter[state->pc++];
+	u16 code = ftest->code;
+	u32 k = ftest->k;
+	bool compare;
+
+	switch (code) {
+	case BPF_LD | BPF_W | BPF_ABS:
+		if (k == offsetof(struct seccomp_data, nr))
+			state->areg = env->nr;
+		else if (k == offsetof(struct seccomp_data, arch))
+			state->areg = env->arch;
+		else
+			return 1;
+
+		return 0;
+	case BPF_JMP | BPF_JA:
+		state->pc += k;
+		return 0;
+	case BPF_JMP | BPF_JEQ | BPF_K:
+	case BPF_JMP | BPF_JGE | BPF_K:
+	case BPF_JMP | BPF_JGT | BPF_K:
+	case BPF_JMP | BPF_JSET | BPF_K:
+		switch (BPF_OP(code)) {
+		case BPF_JEQ:
+			compare = state->areg == k;
+			break;
+		case BPF_JGT:
+			compare = state->areg > k;
+			break;
+		case BPF_JGE:
+			compare = state->areg >= k;
+			break;
+		case BPF_JSET:
+			compare = state->areg & k;
+			break;
+		default:
+			WARN_ON(true);
+			return -EINVAL;
+		}
+
+		state->pc += compare ? ftest->jt : ftest->jf;
+		return 0;
+	case BPF_ALU | BPF_AND | BPF_K:
+		state->areg &= k;
+		return 0;
+	case BPF_RET | BPF_K:
+		env->syscall_ok = k == SECCOMP_RET_ALLOW;
+		return 1;
+	default:
+		return 1;
+	}
+}
+
+/**
+ * seccomp_cache_prepare - emulate the filter to find cachable syscalls
+ * @sfilter: The seccomp filter
+ *
+ * Returns 0 if successful or -errno if error occurred.
+ */
+int seccomp_cache_prepare(struct seccomp_filter *sfilter)
+{
+	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
+	struct sock_filter *filter = fprog->filter;
+	int arch, nr, res = 0;
+
+	for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
+		for (nr = 0; nr < NR_syscalls; nr++) {
+			struct seccomp_emu_env env = {0};
+			struct seccomp_emu_state state = {0};
+
+			env.filter = filter;
+			env.arch = syscall_arches[arch];
+			env.nr = nr;
+
+			while (true) {
+				res = seccomp_emu_step(&env, &state);
+				if (res)
+					break;
+			}
+
+			if (res < 0)
+				goto out;
+
+			if (env.syscall_ok)
+				set_bit(nr, sfilter->cache.syscall_ok[arch]);
+		}
+	}
+
+out:
+	return res;
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
 /**
  * seccomp_prepare_filter: Prepares a seccomp filter for use.
  * @fprog: BPF program to install
@@ -540,7 +700,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 {
 	struct seccomp_filter *sfilter;
 	int ret;
-	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
+	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+			       IS_ENABLED(CONFIG_SECCOMP_CACHE_NR_ONLY);
 
 	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
 		return ERR_PTR(-EINVAL);
@@ -571,6 +732,13 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 		return ERR_PTR(ret);
 	}
 
+	ret = seccomp_cache_prepare(sfilter);
+	if (ret < 0) {
+		bpf_prog_destroy(sfilter->prog);
+		kfree(sfilter);
+		return ERR_PTR(ret);
+	}
+
 	refcount_set(&sfilter->refs, 1);
 	refcount_set(&sfilter->users, 1);
 	init_waitqueue_head(&sfilter->wqh);
@@ -606,6 +774,29 @@ seccomp_prepare_user_filter(const char __user *user_filter)
 	return filter;
 }
 
+#ifdef CONFIG_SECCOMP_CACHE_NR_ONLY
+/**
+ * seccomp_cache_inherit - mask accept bitmap against previous filter
+ * @sfilter: The seccomp filter
+ * @sfilter: The previous seccomp filter
+ */
+static void seccomp_cache_inherit(struct seccomp_filter *sfilter,
+				  const struct seccomp_filter *prev)
+{
+	int arch;
+
+	if (!prev)
+		return;
+
+	for (arch = 0; arch < ARRAY_SIZE(syscall_arches); arch++) {
+		bitmap_and(sfilter->cache.syscall_ok[arch],
+			   sfilter->cache.syscall_ok[arch],
+			   prev->cache.syscall_ok[arch],
+			   NR_syscalls);
+	}
+}
+#endif /* CONFIG_SECCOMP_CACHE_NR_ONLY */
+
 /**
  * seccomp_attach_filter: validate and attach filter
  * @flags:  flags to change filter behavior
@@ -655,6 +846,7 @@ static long seccomp_attach_filter(unsigned int flags,
 	 * task reference.
 	 */
 	filter->prev = current->seccomp.filter;
+	seccomp_cache_inherit(filter, filter->prev);
 	current->seccomp.filter = filter;
 	atomic_inc(&current->seccomp.filter_count);
 
-- 
2.28.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ