linux-kernel - [PATCH] seccomp: plug syscall-dodging ptrace hole

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160526210450.GA31203@www.outflux.net>
Date:	Thu, 26 May 2016 14:04:50 -0700
From:	Kees Cook <keescook@...omium.org>
To:	Andy Lutomirski <luto@...capital.net>
Cc:	Jann Horn <jann@...jh.net>, Stephane Graber <stgraber@...ntu.com>,
	Will Drewry <wad@...omium.org>, linux-kernel@...r.kernel.org
Subject: [PATCH] seccomp: plug syscall-dodging ptrace hole

One problem with seccomp was that ptrace could be used to change a
syscall after seccomp filtering had completed. This was a well documented
limitation, and it was recommended to block ptrace when defining a filter
to avoid this problem. This can be quite a limitation for containers or
other places where ptrace is desired even under seccomp filters.

Since seccomp filtering has been split into pre-trace and trace phases
(phase1 and phase2 respectively), it's possible to re-run phase1 seccomp
after ptrace. This makes that change, and updates the test suite for
both SECCOMP_RET_TRACE and PTRACE_SYSCALL manipulation.

Signed-off-by: Kees Cook <keescook@...omium.org>
---
 include/linux/seccomp.h                       |   6 +
 include/linux/tracehook.h                     |   8 +-
 kernel/seccomp.c                              |  42 ++++++
 tools/testing/selftests/seccomp/seccomp_bpf.c | 176 ++++++++++++++++++++++++--
 4 files changed, 220 insertions(+), 12 deletions(-)

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 2296e6b2f690..e2b72394c200 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -85,6 +85,7 @@ static inline int seccomp_mode(struct seccomp *s)
 #ifdef CONFIG_SECCOMP_FILTER
 extern void put_seccomp_filter(struct task_struct *tsk);
 extern void get_seccomp_filter(struct task_struct *tsk);
+extern int seccomp_phase1_recheck(void);
 #else  /* CONFIG_SECCOMP_FILTER */
 static inline void put_seccomp_filter(struct task_struct *tsk)
 {
@@ -94,6 +95,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
 {
 	return;
 }
+
+static inline int seccomp_phase1_recheck(void)
+{
+	return 0;
+}
 #endif /* CONFIG_SECCOMP_FILTER */
 
 #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 26c152122a42..69b584d88508 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -48,6 +48,7 @@
 
 #include <linux/sched.h>
 #include <linux/ptrace.h>
+#include <linux/seccomp.h>
 #include <linux/security.h>
 #include <linux/task_work.h>
 #include <linux/memcontrol.h>
@@ -100,7 +101,12 @@ static inline int ptrace_report_syscall(struct pt_regs *regs)
 static inline __must_check int tracehook_report_syscall_entry(
 	struct pt_regs *regs)
 {
-	return ptrace_report_syscall(regs);
+	int skip;
+
+	skip = ptrace_report_syscall(regs);
+	if (skip)
+		return skip;
+	return seccomp_phase1_recheck();
 }
 
 /**
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7002796f14a4..6eaa3a1c5edb 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -665,6 +665,46 @@ u32 seccomp_phase1(struct seccomp_data *sd)
 }
 
 /**
+ * seccomp_phase1_recheck() - recheck phase1 in the context of ptrace
+ *
+ * This re-runs phase 1 seccomp checks in the case where ptrace may have
+ * just changed things out from under us.
+ *
+ * Returns 0 if the syscall should be processed or -1 to skip the syscall.
+ */
+int seccomp_phase1_recheck(void)
+{
+	u32 action;
+
+	/* If we're not under seccomp, continue normally. */
+	if (!test_thread_flag(TIF_SECCOMP))
+		return 0;
+
+	/* Pass NULL struct seccomp_data to force reload after ptrace. */
+	action = seccomp_phase1(NULL);
+	switch (action) {
+	case SECCOMP_PHASE1_OK:
+		/* Passes seccomp, continue normally. */
+		break;
+	case SECCOMP_PHASE1_SKIP:
+		/* Skip the syscall. */
+		return -1;
+	default:
+		if ((action & SECCOMP_RET_ACTION) != SECCOMP_RET_TRACE) {
+			/* Impossible return value: kill the process. */
+			do_exit(SIGSYS);
+		}
+		/*
+		 * We've hit a trace request, but ptrace already put us
+		 * into this state, so just continue.
+		 */
+		break;
+	}
+
+	return 0;
+}
+
+/**
  * seccomp_phase2() - finish slow path seccomp work for the current syscall
  * @phase1_result: The return value from seccomp_phase1()
  *
@@ -701,6 +741,8 @@ int seccomp_phase2(u32 phase1_result)
 		do_exit(SIGSYS);
 	if (syscall_get_nr(current, regs) < 0)
 		return -1;  /* Explicit request to skip. */
+	if (seccomp_phase1_recheck() < 0)
+		return -1;
 
 	return 0;
 }
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 7947e568e057..b8f6c743998f 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -1021,8 +1021,8 @@ void tracer_stop(int sig)
 typedef void tracer_func_t(struct __test_metadata *_metadata,
 			   pid_t tracee, int status, void *args);
 
-void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
-	    tracer_func_t tracer_func, void *args)
+void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
+	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
 {
 	int ret = -1;
 	struct sigaction action = {
@@ -1042,12 +1042,16 @@ void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
 	/* Wait for attach stop */
 	wait(NULL);
 
-	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, PTRACE_O_TRACESECCOMP);
+	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
+						      PTRACE_O_TRACESYSGOOD :
+						      PTRACE_O_TRACESECCOMP);
 	ASSERT_EQ(0, ret) {
 		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
 		kill(tracee, SIGKILL);
 	}
-	ptrace(PTRACE_CONT, tracee, NULL, 0);
+	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
+		     tracee, NULL, 0);
+	ASSERT_EQ(0, ret);
 
 	/* Unblock the tracee */
 	ASSERT_EQ(1, write(fd, "A", 1));
@@ -1063,12 +1067,13 @@ void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
 			/* Child is dead. Time to go. */
 			return;
 
-		/* Make sure this is a seccomp event. */
-		ASSERT_EQ(true, IS_SECCOMP_EVENT(status));
+		/* Check if this is a seccomp event. */
+		ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status));
 
 		tracer_func(_metadata, tracee, status, args);
 
-		ret = ptrace(PTRACE_CONT, tracee, NULL, NULL);
+		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
+			     tracee, NULL, 0);
 		ASSERT_EQ(0, ret);
 	}
 	/* Directly report the status of our test harness results. */
@@ -1079,7 +1084,7 @@ void tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
 void cont_handler(int num)
 { }
 pid_t setup_trace_fixture(struct __test_metadata *_metadata,
-			  tracer_func_t func, void *args)
+			  tracer_func_t func, void *args, bool ptrace_syscall)
 {
 	char sync;
 	int pipefd[2];
@@ -1095,7 +1100,8 @@ pid_t setup_trace_fixture(struct __test_metadata *_metadata,
 	signal(SIGALRM, cont_handler);
 	if (tracer_pid == 0) {
 		close(pipefd[0]);
-		tracer(_metadata, pipefd[1], tracee, func, args);
+		start_tracer(_metadata, pipefd[1], tracee, func, args,
+			     ptrace_syscall);
 		syscall(__NR_exit, 0);
 	}
 	close(pipefd[1]);
@@ -1177,7 +1183,7 @@ FIXTURE_SETUP(TRACE_poke)
 
 	/* Launch tracer. */
 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
-					   &self->tracer_args);
+					   &self->tracer_args, false);
 }
 
 FIXTURE_TEARDOWN(TRACE_poke)
@@ -1395,6 +1401,29 @@ void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee,
 
 }
 
+void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
+		   int status, void *args)
+{
+	int ret, nr;
+	unsigned long msg;
+	static bool entry;
+
+	/* Make sure we got an empty message. */
+	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, msg);
+
+	/* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */
+	entry = !entry;
+	if (!entry)
+		return;
+
+	nr = get_syscall(_metadata, tracee);
+
+	if (nr == __NR_getpid)
+		change_syscall(_metadata, tracee, __NR_getppid);
+}
+
 FIXTURE_DATA(TRACE_syscall) {
 	struct sock_fprog prog;
 	pid_t tracer, mytid, mypid, parent;
@@ -1436,7 +1465,8 @@ FIXTURE_SETUP(TRACE_syscall)
 	ASSERT_NE(self->parent, self->mypid);
 
 	/* Launch tracer. */
-	self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL);
+	self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL,
+					   false);
 }
 
 FIXTURE_TEARDOWN(TRACE_syscall)
@@ -1496,6 +1526,130 @@ TEST_F(TRACE_syscall, syscall_dropped)
 	EXPECT_NE(self->mytid, syscall(__NR_gettid));
 }
 
+TEST_F(TRACE_syscall, skip_after_RET_TRACE)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Install fixture filter. */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Install "errno on getppid" filter. */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
+	EXPECT_EQ(-1, syscall(__NR_getpid));
+	EXPECT_EQ(EPERM, errno);
+}
+
+TEST_F_SIGNAL(TRACE_syscall, kill_after_RET_TRACE, SIGSYS)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Install fixture filter. */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Install "death on getppid" filter. */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Tracer will redirect getpid to getppid, and we should die. */
+	EXPECT_NE(self->mypid, syscall(__NR_getpid));
+}
+
+TEST_F(TRACE_syscall, skip_after_ptrace)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+
+	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
+	teardown_trace_fixture(_metadata, self->tracer);
+	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
+					   true);
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Install "errno on getppid" filter. */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
+	EXPECT_EQ(-1, syscall(__NR_getpid));
+	EXPECT_EQ(EPERM, errno);
+}
+
+TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+
+	/* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */
+	teardown_trace_fixture(_metadata, self->tracer);
+	self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL,
+					   true);
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Install "death on getppid" filter. */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	/* Tracer will redirect getpid to getppid, and we should die. */
+	EXPECT_NE(self->mypid, syscall(__NR_getpid));
+}
+
 #ifndef __NR_seccomp
 # if defined(__i386__)
 #  define __NR_seccomp 354
-- 
2.6.3


-- 
Kees Cook
Chrome OS & Brillo Security