[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250903203805.1335307-1-tom.hromatka@oracle.com>
Date: Wed, 3 Sep 2025 20:38:03 +0000
From: Tom Hromatka <tom.hromatka@...cle.com>
To: kees@...nel.org, luto@...capital.net, wad@...omium.org, sargun@...gun.me,
corbet@....net, shuah@...nel.org, brauner@...nel.org,
tom.hromatka@...cle.com
Cc: linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
linux-kselftest@...r.kernel.org, bpf@...r.kernel.org
Subject: [PATCH] seccomp: Add SECCOMP_CLONE_FILTER operation
Add an operation, SECCOMP_CLONE_FILTER, that can copy the seccomp filters
from another process to the current process.
I roughly reproduced the Docker seccomp filter [1] and timed how long it
takes to build it (via libseccomp) and attach it to a process. After
1000 runs, on average it took 3,740,000 TSC ticks (or ~1440 microseconds)
on an AMD EPYC 9J14 running at 2596 MHz. The median build/load time was
3,715,000 TSC ticks.
On the same system, I preloaded the above Docker seccomp filter onto a
process. (Note that I opened a pidfd to the reference process and left
the pidfd open for the entire run.) I then cloned the filter using the
feature in this patch to 1000 new processes. On average, it took 9,300
TSC ticks (or ~3.6 microseconds) to copy the filter to the new processes.
The median clone time was 9,048 TSC ticks.
This is approximately a 400x performance improvement for those container
managers that are using the exact same seccomp filter across all of their
containers.
[1] https://raw.githubusercontent.com/moby/moby/refs/heads/master/profiles/seccomp/default.json
Signed-off-by: Tom Hromatka <tom.hromatka@...cle.com>
---
.../userspace-api/seccomp_filter.rst | 10 ++
include/uapi/linux/seccomp.h | 1 +
kernel/seccomp.c | 48 ++++++
samples/seccomp/Makefile | 2 +-
samples/seccomp/clone-filter.c | 143 ++++++++++++++++++
tools/include/uapi/linux/seccomp.h | 1 +
tools/testing/selftests/seccomp/seccomp_bpf.c | 71 +++++++++
7 files changed, 275 insertions(+), 1 deletion(-)
create mode 100644 samples/seccomp/clone-filter.c
diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index cff0fa7f3175..ef1797d093f6 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -289,6 +289,16 @@ above in this document: all arguments being read from the tracee's memory
should be read into the tracer's memory before any policy decisions are made.
This allows for an atomic decision on syscall arguments.
+Cloning an Existing Seccomp Filter
+==================================
+
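+Constructing and loading a complex seccomp filter can often take a non-trivial
+amount of time. If a user wants to use the same seccomp filter across more
+than one process, it can be cloned to new processes via the
+``SECCOMP_CLONE_FILTER`` operation. The calling process passes a pointer to a
+pidfd that refers to the source process, and the source process's filters are
+attached to the caller. The caller must have ``CAP_SYS_ADMIN``. Note that the
+clone will only succeed if the source process has at least one seccomp filter
+and the calling (destination) process does not have any seccomp filters
+already applied to it. See ``samples/seccomp/clone-filter.c`` for an example.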
+
Sysctls
=======
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index dbfc9b37fcae..b0917e333b4b 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
#define SECCOMP_SET_MODE_FILTER 1
#define SECCOMP_GET_ACTION_AVAIL 2
#define SECCOMP_GET_NOTIF_SIZES 3
+#define SECCOMP_CLONE_FILTER 4
/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 41aa761c7738..b726e0d6715d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2081,6 +2081,49 @@ static long seccomp_get_notif_sizes(void __user *usizes)
return 0;
}
+/**
+ * seccomp_clone_filter - attach another task's seccomp filters to current
+ * @upidfd: userspace pointer to an int holding a pidfd for the source task
+ *
+ * The caller needs CAP_SYS_ADMIN and must not have any seccomp filter
+ * attached yet; the source task must have at least one filter.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, -EINVAL if current
+ * already has a filter or the source task has none, -EFAULT if @upidfd
+ * cannot be read, or the error reported by pidfd_get_task().
+ */
+static long seccomp_clone_filter(void __user *upidfd)
+{
+	struct seccomp new_seccomp;
+	struct task_struct *task;
+	unsigned int flags;
+	long ret = 0;
+	int pidfd;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Cheap unlocked check; repeated under current's siglock below. */
+	if (atomic_read(&current->seccomp.filter_count) > 0)
+		return -EINVAL;
+
+	if (copy_from_user(&pidfd, upidfd, sizeof(pidfd)))
+		return -EFAULT;
+
+	task = pidfd_get_task(pidfd, &flags);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	/*
+	 * Never hold both siglocks at once.  Nesting them has no defined
+	 * order (two tasks cloning from each other could deadlock), would
+	 * self-deadlock if @task shares current's sighand (e.g. a thread of
+	 * the caller's own process), and the inner spin_unlock_irq() would
+	 * re-enable interrupts while the outer lock is still held.  Instead,
+	 * snapshot the source state under its lock, then install the snapshot
+	 * under current's lock.
+	 */
+	spin_lock_irq(&task->sighand->siglock);
+	if (atomic_read(&task->seccomp.filter_count) == 0) {
+		spin_unlock_irq(&task->sighand->siglock);
+		ret = -EINVAL;
+		goto out_put_task;
+	}
+	/* Take our own reference before the source's lock is dropped. */
+	get_seccomp_filter(task);
+	new_seccomp = task->seccomp;
+	spin_unlock_irq(&task->sighand->siglock);
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (atomic_read(&current->seccomp.filter_count) > 0) {
+		/*
+		 * current gained a filter after the unlocked check above
+		 * (presumably via another thread's TSYNC -- confirm).
+		 * Overwriting it would drop that filter, so back out.
+		 * NOTE(review): assumes __seccomp_filter_release() is the
+		 * correct inverse of get_seccomp_filter() in this file.
+		 */
+		spin_unlock_irq(&current->sighand->siglock);
+		__seccomp_filter_release(new_seccomp.filter);
+		ret = -EINVAL;
+		goto out_put_task;
+	}
+	current->seccomp = new_seccomp;
+	set_task_syscall_work(current, SECCOMP);
+	spin_unlock_irq(&current->sighand->siglock);
+
+out_put_task:
+	put_task_struct(task);
+	return ret;
+}
+
/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
void __user *uargs)
@@ -2102,6 +2145,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return -EINVAL;
return seccomp_get_notif_sizes(uargs);
+ case SECCOMP_CLONE_FILTER:
+ if (flags != 0)
+ return -EINVAL;
+
+ return seccomp_clone_filter(uargs);
default:
return -EINVAL;
}
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
index c85ae0ed8342..d38977f41b86 100644
--- a/samples/seccomp/Makefile
+++ b/samples/seccomp/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-userprogs-always-y += bpf-fancy dropper bpf-direct user-trap
+userprogs-always-y += bpf-fancy dropper bpf-direct user-trap clone-filter
bpf-fancy-objs := bpf-fancy.o bpf-helper.o
diff --git a/samples/seccomp/clone-filter.c b/samples/seccomp/clone-filter.c
new file mode 100644
index 000000000000..d26e1375b9dc
--- /dev/null
+++ b/samples/seccomp/clone-filter.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Seccomp filter example for cloning a filter
+ *
+ * Copyright (c) 2025 Oracle and/or its affiliates.
+ * Author: Tom Hromatka <tom.hromatka@...cle.com>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications that reuse the same seccomp filter
+ * across many processes.
+ */
+#include <linux/seccomp.h>
+#include <linux/filter.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+/* Invoke the raw seccomp syscall; errno is cleared before the call. */
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	long rc;
+
+	errno = 0;
+	rc = syscall(__NR_seccomp, op, flags, args);
+
+	return rc;
+}
+
+/*
+ * Attach a small filter to the calling process: getppid() fails with ESRCH
+ * and every other syscall is allowed.  Returns the result of seccomp().
+ */
+static int install_filter(void)
+{
+	struct sock_filter insns[] = {
+		/* Load the syscall number. */
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			 offsetof(struct seccomp_data, nr)),
+		/* getppid falls through to the ERRNO return, others skip it. */
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.filter = insns,
+		.len = (unsigned short)ARRAY_SIZE(insns),
+	};
+	int rc;
+
+	rc = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
+
+	return rc;
+}
+
+/*
+ * Attach the seccomp filters of process @ref_pid to the calling process.
+ *
+ * Opens a pidfd for @ref_pid and passes a pointer to it to
+ * SECCOMP_CLONE_FILTER.  Returns 0 on success.  On failure returns -1 with
+ * errno describing the step that failed, so callers can use perror().
+ */
+static int clone_filter(pid_t ref_pid)
+{
+	int ref_pidfd, ret, saved_errno;
+
+	ref_pidfd = syscall(SYS_pidfd_open, ref_pid, 0);
+	if (ref_pidfd < 0)
+		return -1;
+
+	ret = seccomp(SECCOMP_CLONE_FILTER, 0, &ref_pidfd);
+
+	/* close() may overwrite errno; keep the value from seccomp(). */
+	saved_errno = errno;
+	close(ref_pidfd);
+	errno = saved_errno;
+
+	return ret;
+}
+
+/*
+ * Body of the reference process: install the filter, then sleep forever so
+ * other processes can clone the filter from it.  Never returns; exits with
+ * status 1 if the filter cannot be installed.
+ */
+static void do_ref_filter(void)
+{
+	int ret;
+
+	ret = install_filter();
+	if (ret) {
+		/* No trailing newline: perror() appends ": <error>\n" itself. */
+		perror("Failed to install ref filter");
+		exit(1);
+	}
+
+	while (true)
+		sleep(1);
+}
+
+/*
+ * Body of the test child: obtain the filter (cloned from @ref_pid, or built
+ * from scratch if cloning fails) and verify that it is in effect, i.e. that
+ * getpid() still works while getppid() is rejected.  Never returns; exits
+ * with 0 on success and 1 on any failure.
+ */
+static void do_child_process(pid_t ref_pid)
+{
+	pid_t res;
+	int ret;
+
+	ret = clone_filter(ref_pid);
+	if (ret != 0) {
+		/* No trailing newline: perror() appends ": <error>\n" itself. */
+		perror("Failed to clone filter. Installing filter from scratch");
+
+		ret = install_filter();
+		if (ret != 0) {
+			perror("Filter install failed");
+			exit(1);
+		}
+	}
+
+	res = syscall(__NR_getpid);
+	if (res < 0) {
+		perror("getpid() unexpectedly failed");
+		exit(1);
+	}
+
+	/* The filter makes getppid() fail, so any value but -1 is a success. */
+	res = syscall(__NR_getppid);
+	if (res != -1) {
+		/* errno is meaningless after a successful call; avoid perror(). */
+		fprintf(stderr, "getppid() unexpectedly succeeded\n");
+		exit(1);
+	}
+
+	exit(0);
+}
+
+/*
+ * Fork a reference process that installs the filter and a child that clones
+ * it (the child falls back to installing the filter itself if cloning
+ * fails).  Exits with 0 if the child verified the filter, non-zero otherwise.
+ */
+int main(void)
+{
+	pid_t ref_pid = -1, child_pid = -1;
+	int ret = 1, status;
+
+	ref_pid = fork();
+	if (ref_pid < 0)
+		exit(errno);
+	else if (ref_pid == 0)
+		do_ref_filter();
+
+	child_pid = fork();
+	if (child_pid < 0) {
+		perror("fork");
+		goto out;
+	} else if (child_pid == 0) {
+		do_child_process(ref_pid);
+	}
+
+	if (waitpid(child_pid, &status, 0) < 0) {
+		perror("waitpid");
+		/* The child was not reaped, so its pid is still ours to signal. */
+		kill(child_pid, SIGKILL);
+		goto out;
+	}
+
+	/* WEXITSTATUS() is only meaningful when the child exited normally. */
+	if (!WIFEXITED(status)) {
+		fprintf(stderr, "child process terminated abnormally\n");
+		goto out;
+	}
+	if (WEXITSTATUS(status) != 0) {
+		fprintf(stderr, "child process failed\n");
+		ret = WEXITSTATUS(status);
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	/* The reference process sleeps forever; stop it and reap it. */
+	if (ref_pid > 0) {
+		kill(ref_pid, SIGKILL);
+		waitpid(ref_pid, NULL, 0);
+	}
+
+	exit(ret);
+}
diff --git a/tools/include/uapi/linux/seccomp.h b/tools/include/uapi/linux/seccomp.h
index dbfc9b37fcae..b0917e333b4b 100644
--- a/tools/include/uapi/linux/seccomp.h
+++ b/tools/include/uapi/linux/seccomp.h
@@ -16,6 +16,7 @@
#define SECCOMP_SET_MODE_FILTER 1
#define SECCOMP_GET_ACTION_AVAIL 2
#define SECCOMP_GET_NOTIF_SIZES 3
+#define SECCOMP_CLONE_FILTER 4
/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 61acbd45ffaa..df5e0f615da0 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -177,6 +177,10 @@ struct seccomp_data {
#define SECCOMP_GET_NOTIF_SIZES 3
#endif
+#ifndef SECCOMP_CLONE_FILTER
+#define SECCOMP_CLONE_FILTER 4
+#endif
+
#ifndef SECCOMP_FILTER_FLAG_TSYNC
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
#endif
@@ -5090,6 +5094,73 @@ TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
ASSERT_EQ(0, run_probed_with_filter(&prog));
}
+/*
+ * Verify SECCOMP_CLONE_FILTER: a child installs a filter that makes
+ * getppid() fail with ESRCH, and the parent clones that filter through a
+ * pidfd.  Afterwards getppid() must fail in the parent while getpid() still
+ * works.  Also checks that a non-zero flags argument is rejected with EINVAL.
+ */
+TEST(clone_filter)
+{
+	struct sock_filter deny_filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | ESRCH),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog deny_prog = {
+		.len = (unsigned short)ARRAY_SIZE(deny_filter),
+		.filter = deny_filter,
+	};
+	struct timespec ts = {
+		.tv_sec = 0,
+		.tv_nsec = 100000000,
+	};
+
+	pid_t child_pid, self_pid, res;
+	int child_pidfd, ret, status;
+	int ready[2];
+	char token = 0;
+
+	/* Only real root can copy a filter. */
+	if (geteuid()) {
+		SKIP(return, "clone_filter requires real root");
+		return;
+	}
+
+	self_pid = getpid();
+
+	/* The child signals over this pipe once its filter is installed. */
+	ASSERT_EQ(0, pipe(ready));
+
+	child_pid = fork();
+	ASSERT_LE(0, child_pid);
+
+	if (child_pid == 0) {
+		close(ready[0]);
+		ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+		ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &deny_prog));
+		ASSERT_EQ(1, write(ready[1], &token, 1));
+
+		while (true)
+			EXPECT_EQ(0, syscall(__NR_nanosleep, &ts, NULL));
+	}
+
+	/*
+	 * Wait until the child has installed its filter.  If the child dies
+	 * first, read() returns 0 (EOF) and the assertion fails instead of
+	 * the test racing against a fixed sleep.
+	 */
+	close(ready[1]);
+	ASSERT_EQ(1, read(ready[0], &token, 1));
+	close(ready[0]);
+
+	/* The pidfd is dereferenced below, so a failure here is fatal. */
+	child_pidfd = syscall(SYS_pidfd_open, child_pid, 0);
+	ASSERT_LE(0, child_pidfd);
+
+	/* Invalid flag provided */
+	ret = seccomp(SECCOMP_CLONE_FILTER, 1, &child_pidfd);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(errno, EINVAL);
+
+	errno = 0;
+	ret = seccomp(SECCOMP_CLONE_FILTER, 0, &child_pidfd);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(errno, 0);
+
+	res = syscall(__NR_getppid);
+	EXPECT_EQ(res, -1);
+	EXPECT_EQ(errno, ESRCH);
+
+	res = syscall(__NR_getpid);
+	EXPECT_EQ(res, self_pid);
+
+	close(child_pidfd);
+
+	/* Stop the child and reap it so no zombie outlives the test. */
+	EXPECT_EQ(0, kill(child_pid, SIGKILL));
+	EXPECT_EQ(child_pid, waitpid(child_pid, &status, 0));
+}
+
/*
* TODO:
* - expand NNP testing
--
2.47.3
Powered by blists - more mailing lists