[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87tulpbp19.fsf@disp2133>
Date: Tue, 22 Jun 2021 15:52:18 -0500
From: ebiederm@...ssion.com (Eric W. Biederman)
To: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: Al Viro <viro@...iv.linux.org.uk>,
Michael Schmitz <schmitzmic@...il.com>,
linux-arch <linux-arch@...r.kernel.org>,
Jens Axboe <axboe@...nel.dk>, Oleg Nesterov <oleg@...hat.com>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
Richard Henderson <rth@...ddle.net>,
Ivan Kokshaysky <ink@...assic.park.msu.ru>,
Matt Turner <mattst88@...il.com>,
alpha <linux-alpha@...r.kernel.org>,
Geert Uytterhoeven <geert@...ux-m68k.org>,
linux-m68k <linux-m68k@...ts.linux-m68k.org>,
Arnd Bergmann <arnd@...nel.org>,
Ley Foon Tan <ley.foon.tan@...el.com>,
Tejun Heo <tj@...nel.org>, Kees Cook <keescook@...omium.org>
Subject: Re: Kernel stack read with PTRACE_EVENT_EXIT and io_uring threads
Linus Torvalds <torvalds@...ux-foundation.org> writes:
> On Mon, Jun 21, 2021 at 1:04 PM Eric W. Biederman <ebiederm@...ssion.com> wrote:
>>
>> For other ptrace_event calls I am playing with seeing if I can split
>> them in two. Like sending a signal. So that we can have perform all
>> of the work in get_signal.
>
> That sounds like the right model, but I don't think it works.
> Particularly not for exit(). The second phase will never happen.
Playing with it some more I think I have everything working
except for PTRACE_EVENT_SECCOMP (which can stay ptrace_event) and
exit_group(2).
Basically in exit sending yourself a signal and then calling do_exit
from the signal handler is not unreasonable, as exit is an ordinary
system call.
I haven't seen anything that ``knows'' that exit(2) or exit_group(2)
will never return and adds a special case in the system call table for
that case.
The complications of exit_group(2) are roughly those of moving
ptrace_event out of do_exit. They look doable and I am going to look
at that next.
This is not to say that this is the most maintainable way or that we
necessarily want to implement things this way, but I need to look and
see what it looks like.
For purposes of discussion this is my current draft implementation.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c881384517..891812d32b90 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1087,6 +1087,7 @@ struct task_struct {
struct capture_control *capture_control;
#endif
/* Ptrace state: */
+ int stop_code;
unsigned long ptrace_message;
kernel_siginfo_t *last_siginfo;
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index b5ebf6c01292..33c50119b193 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -164,18 +164,29 @@ static inline void ptrace_event(int event, unsigned long message)
}
}
+static inline bool ptrace_post_event(int event, unsigned long message)
+{
+ bool posted = false;
+ if (unlikely(ptrace_event_enabled(current, event))) {
+ current->ptrace_message = message;
+ current->stop_code = (event << 8) | SIGTRAP;
+ set_tsk_thread_flag(current, TIF_SIGPENDING);
+ posted = true;
+ } else if (event == PTRACE_EVENT_EXEC) {
+ /* legacy EXEC report via SIGTRAP */
+ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED)
+ send_sig(SIGTRAP, current, 0);
+ }
+ return posted;
+}
+
/**
- * ptrace_event_pid - possibly stop for a ptrace event notification
- * @event: %PTRACE_EVENT_* value to report
- * @pid: process identifier for %PTRACE_GETEVENTMSG to return
- *
- * Check whether @event is enabled and, if so, report @event and @pid
- * to the ptrace parent. @pid is reported as the pid_t seen from the
- * ptrace parent's pid namespace.
+ * pid_parent_nr - Return the number the parent knows this pid as
+ * @pid: The struct pid whose numerical value we want
*
* Called without locks.
*/
-static inline void ptrace_event_pid(int event, struct pid *pid)
+static inline pid_t pid_parent_nr(struct pid *pid)
{
/*
* FIXME: There's a potential race if a ptracer in a different pid
@@ -183,16 +194,15 @@ static inline void ptrace_event_pid(int event, struct pid *pid)
* when we acquire tasklist_lock in ptrace_stop(). If this happens,
* the ptracer will get a bogus pid from PTRACE_GETEVENTMSG.
*/
- unsigned long message = 0;
+ pid_t nr = 0;
struct pid_namespace *ns;
rcu_read_lock();
ns = task_active_pid_ns(rcu_dereference(current->parent));
if (ns)
- message = pid_nr_ns(pid, ns);
+ nr = pid_nr_ns(pid, ns);
rcu_read_unlock();
-
- ptrace_event(event, message);
+ return nr;
}
/**
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index e24b1fe348e3..a2eac3831369 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -97,6 +97,8 @@ extern void exit_mm_release(struct task_struct *, struct mm_struct *);
/* Remove the current tasks stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);
+extern int wait_for_vfork_done(struct task_struct *child, struct completion *vfork);
+
#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
#else
diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..bb4751d84e2d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1781,7 +1781,7 @@ static int exec_binprm(struct linux_binprm *bprm)
audit_bprm(bprm);
trace_sched_process_exec(current, old_pid, bprm);
- ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
+ ptrace_post_event(PTRACE_EVENT_EXEC, old_vpid);
proc_exec_connector(current);
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index fd1c04193e18..aeb22a8e4d24 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -889,7 +889,9 @@ EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
- do_exit((error_code&0xff)<<8);
+ long code = (error_code&0xff)<<8;
+ if (!ptrace_post_event(PTRACE_EVENT_EXIT, code))
+ do_exit((error_code&0xff)<<8);
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index dc06afd725cb..8533e056a3d6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1266,8 +1266,7 @@ static void complete_vfork_done(struct task_struct *tsk)
task_unlock(tsk);
}
-static int wait_for_vfork_done(struct task_struct *child,
- struct completion *vfork)
+int wait_for_vfork_done(struct task_struct *child, struct completion *vfork)
{
int killed;
@@ -2278,7 +2277,8 @@ static __latent_entropy struct task_struct *copy_process(
init_task_pid_links(p);
if (likely(p->pid)) {
- ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
+ ptrace_init_task(p, (clone_flags & CLONE_PTRACE) ||
+ (trace && ptrace_event_enabled(current, trace)));
init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
@@ -2462,7 +2462,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
pid_t kernel_clone(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags;
- struct completion vfork;
+ unsigned long message;
struct pid *pid;
struct task_struct *p;
int trace = 0;
@@ -2495,9 +2495,6 @@ pid_t kernel_clone(struct kernel_clone_args *args)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
-
- if (likely(!ptrace_event_enabled(current, trace)))
- trace = 0;
}
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
@@ -2512,30 +2509,27 @@ pid_t kernel_clone(struct kernel_clone_args *args)
*/
trace_sched_process_fork(current, p);
- pid = get_task_pid(p, PIDTYPE_PID);
+ pid = task_pid(p);
nr = pid_vnr(pid);
+ message = pid_parent_nr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, args->parent_tid);
- if (clone_flags & CLONE_VFORK) {
- p->vfork_done = &vfork;
+ if (!(clone_flags & CLONE_VFORK)) {
+ wake_up_new_task(p);
+ ptrace_post_event(trace, message);
+ }
+ else if (!ptrace_post_event(PTRACE_EVENT_VFORK, (unsigned long)p)) {
+ struct completion vfork;
init_completion(&vfork);
+ p->vfork_done = &vfork;
get_task_struct(p);
+ wake_up_new_task(p);
+ if (wait_for_vfork_done(p, &vfork))
+ ptrace_post_event(PTRACE_EVENT_VFORK_DONE, message);
}
- wake_up_new_task(p);
-
- /* forking complete and child started to run, tell ptracer */
- if (unlikely(trace))
- ptrace_event_pid(trace, pid);
-
- if (clone_flags & CLONE_VFORK) {
- if (!wait_for_vfork_done(p, &vfork))
- ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
- }
-
- put_pid(pid);
return nr;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index f7c6ffcbd044..8ac8c4a31d88 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -155,7 +155,8 @@ static bool recalc_sigpending_tsk(struct task_struct *t)
if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked) ||
- cgroup_task_frozen(t)) {
+ cgroup_task_frozen(t) ||
+ t->stop_code) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
return true;
}
@@ -2607,6 +2608,39 @@ bool get_signal(struct ksignal *ksig)
if (unlikely(current->task_works))
task_work_run();
+ptrace_event:
+ /* Handle a posted ptrace event */
+ if (unlikely(current->stop_code)) {
+ int stop_code = current->stop_code;
+ unsigned long message = current->ptrace_message;
+ struct completion vfork;
+ struct task_struct *p;
+
+ current->stop_code = 0;
+
+ if (stop_code == PTRACE_EVENT_VFORK) {
+ p = (struct task_struct *)message;
+ get_task_struct(p);
+ current->ptrace_message = pid_parent_nr(task_pid(p));
+ init_completion(&vfork);
+ p->vfork_done = &vfork;
+ wake_up_new_task(p);
+ }
+
+ spin_lock_irq(&sighand->siglock);
+ ptrace_do_notify(SIGTRAP, stop_code, CLD_TRAPPED);
+ spin_unlock_irq(&sighand->siglock);
+
+ if ((stop_code == PTRACE_EVENT_VFORK) &&
+ wait_for_vfork_done(p, &vfork) &&
+ ptrace_post_event(PTRACE_EVENT_VFORK_DONE, message))
+ goto ptrace_event;
+
+ if (stop_code == PTRACE_EVENT_EXIT) {
+ do_exit(message);
+ }
+ }
+
/*
* For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
* that the arch handlers don't all have to do it. If we get here
Eric
Powered by blists - more mailing lists