Message-ID: <202109241154.A915C488E2@keescook>
Date: Fri, 24 Sep 2021 11:56:00 -0700
From: Kees Cook <keescook@...omium.org>
To: "Eric W. Biederman" <ebiederm@...ssion.com>
Cc: linux-kernel@...r.kernel.org,
Linus Torvalds <torvalds@...ux-foundation.org>,
Oleg Nesterov <oleg@...hat.com>,
Al Viro <viro@...iv.linux.org.uk>, linux-api@...r.kernel.org
Subject: Re: [PATCH 6/6] coredump: Limit coredumps to a single thread group
On Thu, Sep 23, 2021 at 07:12:33PM -0500, Eric W. Biederman wrote:
>
> Today, when a signal is delivered with a handler of SIG_DFL whose
> default behavior is to generate a core dump, not only that process but
> every process that shares the mm is killed.
>
> In the case of vfork this looks like a real-world problem. Consider
> the following well-defined sequence:
>
> 	if (vfork() == 0) {
> 		execve(...);
> 		_exit(EXIT_FAILURE);
> 	}
>
> If a signal that generates a core dump is received after vfork but
> before the execve changes the mm, the process that called vfork will
> also be killed (as the mm is shared).
>
> Similarly, if the execve fails after the point of no return, the
> kernel delivers SIGSEGV, which kills both the exec'ing process and,
> because the mm is shared, the process that called vfork as well.
>
> As far as I can tell this behavior violates people's reasonable
> expectations as well as POSIX, and is unnecessarily fragile when the
> system is low on memory.
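
For the archives, a minimal reproducer for the vfork case could look
like the below. This is my own untested sketch, not something from the
series, and the setrlimit() comment reflects my reading of the ordering
in do_coredump():

/*
 * Sketch: on an unpatched kernel the inner child's coredump kills the
 * vfork() caller too, because the mm is still shared; with this series
 * only the dumping thread group should die.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* Avoid writing core files; the kill in coredump_wait() happens
	 * before the RLIMIT_CORE check, so the behavior still triggers. */
	struct rlimit rl = { 0, 0 };
	setrlimit(RLIMIT_CORE, &rl);

	pid_t pid = fork();
	if (pid == 0) {
		/* This child plays the vfork() caller from above. */
		if (vfork() == 0) {
			/* POSIX only promises _exit()/execve() here;
			 * abort() is deliberate, to dump core while the
			 * mm is still shared. */
			abort();
		}
		_exit(EXIT_SUCCESS);	/* unreached on unpatched kernels */
	}

	int status;
	waitpid(pid, &status, 0);
	printf("vfork caller %s\n", WIFEXITED(status) ?
	       "survived (new behavior)" : "was killed (old behavior)");
	return 0;
}
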
>
> Solve this by making a userspace-visible change to only kill a single
> process/thread group. This is possible because Jann Horn recently
> modified[1] the coredump code so that the mm can safely be modified
> while the coredump is happening. With LinuxThreads long gone I don't
> expect anyone to notice this behavior change in practice.
>
> To accomplish this, move the core_state pointer from mm_struct to
> signal_struct, which allows different thread groups to coredump
> simultaneously.
>
> In zap_threads, remove the work to kill anything except the current
> thread group.
>
> [1] a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot")
> Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3")
> History-tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
> Signed-off-by: "Eric W. Biederman" <ebiederm@...ssion.com>
This looks correct to me, but depends on patch 5/6 not introducing any
races. So, to that end:
Reviewed-by: Kees Cook <keescook@...omium.org>
If you have some local tools you've been using for testing this series,
can you toss them into tools/testing/selftests/ptrace/? I can help
clean them up if you want.
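
To seed that, the rough shape I would picture for such a test is below;
an untested sketch assuming the usual kselftest harness, with a made-up
test name:

/* Sketch for tools/testing/selftests/ptrace/: assert that a coredump
 * taken in the vfork() window no longer kills the vfork() caller. A
 * real version would also zero RLIMIT_CORE to avoid leaving core files.
 */
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include "../kselftest_harness.h"

TEST(vfork_coredump_spares_caller)
{
	pid_t pid = fork();
	ASSERT_GE(pid, 0);

	if (pid == 0) {
		if (vfork() == 0)
			abort();	/* dump core while the mm is shared */
		_exit(EXIT_SUCCESS);	/* only reached if we survive */
	}

	int status;
	ASSERT_EQ(waitpid(pid, &status, 0), pid);
	ASSERT_TRUE(WIFEXITED(status));
	ASSERT_EQ(WEXITSTATUS(status), EXIT_SUCCESS);
}

TEST_HARNESS_MAIN
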
-Kees
> ---
> fs/binfmt_elf.c | 4 +-
> fs/binfmt_elf_fdpic.c | 2 +-
> fs/coredump.c | 84 ++++--------------------------------
> fs/proc/array.c | 6 +--
> include/linux/mm_types.h | 13 ------
> include/linux/sched/signal.h | 13 ++++++
> kernel/exit.c | 13 ++----
> kernel/fork.c | 1 -
> 8 files changed, 32 insertions(+), 104 deletions(-)
>
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 69d900a8473d..796e5327ee7d 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -1834,7 +1834,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
> /*
> * Allocate a structure for each thread.
> */
> - for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
> + for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
> t = kzalloc(offsetof(struct elf_thread_core_info,
> notes[info->thread_notes]),
> GFP_KERNEL);
> @@ -2024,7 +2024,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
> if (!elf_note_info_init(info))
> return 0;
>
> - for (ct = current->mm->core_state->dumper.next;
> + for (ct = current->signal->core_state->dumper.next;
> ct; ct = ct->next) {
> ets = kzalloc(sizeof(*ets), GFP_KERNEL);
> if (!ets)
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index 6d8fd6030cbb..c6f588dc4a9d 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
> if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
> goto end_coredump;
>
> - for (ct = current->mm->core_state->dumper.next;
> + for (ct = current->signal->core_state->dumper.next;
> ct; ct = ct->next) {
> tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
> ct->task, &thread_status_size);
> diff --git a/fs/coredump.c b/fs/coredump.c
> index d576287fb88b..a6b3c196cdef 100644
> --- a/fs/coredump.c
> +++ b/fs/coredump.c
> @@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
> return nr;
> }
>
> -static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
> +static int zap_threads(struct task_struct *tsk,
> struct core_state *core_state, int exit_code)
> {
> - struct task_struct *g, *p;
> - unsigned long flags;
> int nr = -EAGAIN;
>
> spin_lock_irq(&tsk->sighand->siglock);
> if (!signal_group_exit(tsk->signal)) {
> - mm->core_state = core_state;
> + tsk->signal->core_state = core_state;
> tsk->signal->group_exit_task = tsk;
> nr = zap_process(tsk, exit_code, 0);
> clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
> + tsk->flags |= PF_DUMPCORE;
> + atomic_set(&core_state->nr_threads, nr);
> }
> spin_unlock_irq(&tsk->sighand->siglock);
> - if (unlikely(nr < 0))
> - return nr;
> -
> - tsk->flags |= PF_DUMPCORE;
> - if (atomic_read(&mm->mm_users) == nr + 1)
> - goto done;
> - /*
> - * We should find and kill all tasks which use this mm, and we should
> - * count them correctly into ->nr_threads. We don't take tasklist
> - * lock, but this is safe wrt:
> - *
> - * fork:
> - * None of sub-threads can fork after zap_process(leader). All
> - * processes which were created before this point should be
> - * visible to zap_threads() because copy_process() adds the new
> - * process to the tail of init_task.tasks list, and lock/unlock
> - * of ->siglock provides a memory barrier.
> - *
> - * do_exit:
> - * The caller holds mm->mmap_lock. This means that the task which
> - * uses this mm can't pass coredump_task_exit(), so it can't exit
> - * or clear its ->mm.
> - *
> - * de_thread:
> - * It does list_replace_rcu(&leader->tasks, &current->tasks),
> - * we must see either old or new leader, this does not matter.
> - * However, it can change p->sighand, so lock_task_sighand(p)
> - * must be used. Since p->mm != NULL and we hold ->mmap_lock
> - * it can't fail.
> - *
> - * Note also that "g" can be the old leader with ->mm == NULL
> - * and already unhashed and thus removed from ->thread_group.
> - * This is OK, __unhash_process()->list_del_rcu() does not
> - * clear the ->next pointer, we will find the new leader via
> - * next_thread().
> - */
> - rcu_read_lock();
> - for_each_process(g) {
> - if (g == tsk->group_leader)
> - continue;
> - if (g->flags & PF_KTHREAD)
> - continue;
> -
> - for_each_thread(g, p) {
> - if (unlikely(!p->mm))
> - continue;
> - if (unlikely(p->mm == mm)) {
> - lock_task_sighand(p, &flags);
> - nr += zap_process(p, exit_code,
> - SIGNAL_GROUP_EXIT);
> - unlock_task_sighand(p, &flags);
> - }
> - break;
> - }
> - }
> - rcu_read_unlock();
> -done:
> - atomic_set(&core_state->nr_threads, nr);
> return nr;
> }
>
> static int coredump_wait(int exit_code, struct core_state *core_state)
> {
> struct task_struct *tsk = current;
> - struct mm_struct *mm = tsk->mm;
> int core_waiters = -EBUSY;
>
> init_completion(&core_state->startup);
> core_state->dumper.task = tsk;
> core_state->dumper.next = NULL;
>
> - if (mmap_write_lock_killable(mm))
> - return -EINTR;
> -
> - if (!mm->core_state)
> - core_waiters = zap_threads(tsk, mm, core_state, exit_code);
> - mmap_write_unlock(mm);
> -
> + core_waiters = zap_threads(tsk, core_state, exit_code);
> if (core_waiters > 0) {
> struct core_thread *ptr;
>
> @@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
> return core_waiters;
> }
>
> -static void coredump_finish(struct mm_struct *mm, bool core_dumped)
> +static void coredump_finish(bool core_dumped)
> {
> struct core_thread *curr, *next;
> struct task_struct *task;
> @@ -493,9 +428,10 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
> current->signal->group_exit_code |= 0x80;
> current->signal->group_exit_task = NULL;
> current->signal->flags = SIGNAL_GROUP_EXIT;
> + next = current->signal->core_state->dumper.next;
> + current->signal->core_state = NULL;
> spin_unlock_irq(&current->sighand->siglock);
>
> - next = mm->core_state->dumper.next;
> while ((curr = next) != NULL) {
> next = curr->next;
> task = curr->task;
> @@ -507,8 +443,6 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
> curr->task = NULL;
> wake_up_process(task);
> }
> -
> - mm->core_state = NULL;
> }
>
> static bool dump_interrupted(void)
> @@ -839,7 +773,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
> fail_unlock:
> kfree(argv);
> kfree(cn.corename);
> - coredump_finish(mm, core_dumped);
> + coredump_finish(core_dumped);
> revert_creds(old_cred);
> fail_creds:
> put_cred(cred);
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 49be8c8ef555..520c51be1e57 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
> cpumask_pr_args(&task->cpus_mask));
> }
>
> -static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
> +static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
> {
> - seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
> + seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
> seq_putc(m, '\n');
> }
>
> @@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
>
> if (mm) {
> task_mem(m, mm);
> - task_core_dumping(m, mm);
> + task_core_dumping(m, task);
> task_thp_status(m, mm);
> mmput(mm);
> }
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7f8ee09c711f..1039f6ae922c 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -387,17 +387,6 @@ struct vm_area_struct {
> struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> } __randomize_layout;
>
> -struct core_thread {
> - struct task_struct *task;
> - struct core_thread *next;
> -};
> -
> -struct core_state {
> - atomic_t nr_threads;
> - struct core_thread dumper;
> - struct completion startup;
> -};
> -
> struct kioctx_table;
> struct mm_struct {
> struct {
> @@ -518,8 +507,6 @@ struct mm_struct {
>
> unsigned long flags; /* Must use atomic bitops to access */
>
> - struct core_state *core_state; /* coredumping support */
> -
> #ifdef CONFIG_AIO
> spinlock_t ioctx_lock;
> struct kioctx_table __rcu *ioctx_table;
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index e5f4ce622ee6..a8fe2a593a3a 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -72,6 +72,17 @@ struct multiprocess_signals {
> struct hlist_node node;
> };
>
> +struct core_thread {
> + struct task_struct *task;
> + struct core_thread *next;
> +};
> +
> +struct core_state {
> + atomic_t nr_threads;
> + struct core_thread dumper;
> + struct completion startup;
> +};
> +
> /*
> * NOTE! "signal_struct" does not have its own
> * locking, because a shared signal_struct always
> @@ -110,6 +121,8 @@ struct signal_struct {
> int group_stop_count;
> unsigned int flags; /* see SIGNAL_* flags below */
>
> + struct core_state *core_state; /* coredumping support */
> +
> /*
> * PR_SET_CHILD_SUBREAPER marks a process, like a service
> * manager, to re-parent orphan (double-forking) child processes
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 774e6b5061b8..2b355e926c13 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -342,23 +342,18 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
> static void coredump_task_exit(struct task_struct *tsk)
> {
> struct core_state *core_state;
> - struct mm_struct *mm;
> -
> - mm = tsk->mm;
> - if (!mm)
> - return;
>
> /*
> * Serialize with any possible pending coredump.
> - * We must hold mmap_lock around checking core_state
> + * We must hold siglock around checking core_state
> * and setting PF_POSTCOREDUMP. The core-inducing thread
> * will increment ->nr_threads for each thread in the
> * group without PF_POSTCOREDUMP set.
> */
> - mmap_read_lock(mm);
> + spin_lock_irq(&tsk->sighand->siglock);
> tsk->flags |= PF_POSTCOREDUMP;
> - core_state = mm->core_state;
> - mmap_read_unlock(mm);
> + core_state = tsk->signal->core_state;
> + spin_unlock_irq(&tsk->sighand->siglock);
> if (core_state) {
> struct core_thread self;
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 9bd9f2da9e41..c8adb76982f7 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1044,7 +1044,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> seqcount_init(&mm->write_protect_seq);
> mmap_init_lock(mm);
> INIT_LIST_HEAD(&mm->mmlist);
> - mm->core_state = NULL;
> mm_pgtables_bytes_init(mm);
> mm->map_count = 0;
> mm->locked_vm = 0;
> --
> 2.20.1
>
--
Kees Cook