linux-kernel - Re: [PATCH 1/2] exec: don't wait for zombie threads with cred_guard_mutex held

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7ce94707-7bbe-a0d5-85a0-b93e73d76e22@nextfour.com>
Date:   Mon, 13 Feb 2017 19:27:51 +0200
From:   Mika Penttilä <mika.penttila@...tfour.com>
To:     Oleg Nesterov <oleg@...hat.com>,
        Andrew Morton <akpm@...ux-foundation.org>
CC:     Aleksa Sarai <asarai@...e.com>,
        Andy Lutomirski <luto@...capital.net>,
        Attila Fazekas <afazekas@...hat.com>,
        "Eric W. Biederman" <ebiederm@...ssion.com>,
        Jann Horn <jann@...jh.net>, Kees Cook <keescook@...omium.org>,
        Michal Hocko <mhocko@...nel.org>,
        Ulrich Obergfell <uobergfe@...hat.com>,
        <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 1/2] exec: don't wait for zombie threads with
 cred_guard_mutex held


On 13.02.2017 16:15, Oleg Nesterov wrote:
> +	retval = de_thread(current);
> +	if (retval)
> +		return retval;
>  
>  	if (N_MAGIC(ex) == OMAGIC) {
>  		unsigned long text_addr, map_size;
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 4223702..79508f7 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -855,13 +855,17 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  	setup_new_exec(bprm);
>  	install_exec_creds(bprm);
>  
> +	retval = de_thread(current);
> +	if (retval)
> +		goto out_free_dentry;
> +
>  	/* Do this so that we can load the interpreter, if need be.  We will
>  	   change some of these later */
>  	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
>  				 executable_stack);
>  	if (retval < 0)
>  		goto out_free_dentry;
> -	
> +
>  	current->mm->start_stack = bprm->p;
>  
>  	/* Now we do a little grungy work by mmapping the ELF image into
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index d2e36f8..75fd6d8 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -430,6 +430,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
>  #endif
>  
>  	install_exec_creds(bprm);
> +	retval = de_thread(current);
> +	if (retval)
> +		goto error;
> +
>  	if (create_elf_fdpic_tables(bprm, current->mm,
>  				    &exec_params, &interp_params) < 0)
>  		goto error;
> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
> index 9b2917a..a0ad9a3 100644
> --- a/fs/binfmt_flat.c
> +++ b/fs/binfmt_flat.c
> @@ -953,6 +953,9 @@ static int load_flat_binary(struct linux_binprm *bprm)
>  	}
>  
>  	install_exec_creds(bprm);
> +	res = de_thread(current);
> +	if (res)
> +		return res;
>  
>  	set_binfmt(&flat_format);
>  
> diff --git a/fs/exec.c b/fs/exec.c
> index e579466..8591c56 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1036,13 +1036,62 @@ static int exec_mmap(struct mm_struct *mm)
>  	return 0;
>  }
>  
> +static int wait_for_notify_count(struct task_struct *tsk, struct signal_struct *sig)
> +{
> +	for (;;) {
> +		if (unlikely(__fatal_signal_pending(tsk)))
> +			goto killed;
> +		set_current_state(TASK_KILLABLE);
> +		if (!sig->notify_count)
> +			break;
> +		schedule();
> +	}
> +	__set_current_state(TASK_RUNNING);
> +	return 0;
> +
> +killed:
> +	/* protects against exit_notify() and __exit_signal() */
> +	read_lock(&tasklist_lock);
> +	sig->group_exit_task = NULL;
> +	sig->notify_count = 0;
> +	read_unlock(&tasklist_lock);
> +	return -EINTR;
> +}
> +
> +/*
> + * Kill all the sub-threads and wait until they all pass exit_notify().
> + */
> +static int kill_sub_threads(struct task_struct *tsk)
> +{
> +	struct signal_struct *sig = tsk->signal;
> +	int err = -EINTR;
> +
> +	if (thread_group_empty(tsk))
> +		return 0;
> +
> +	read_lock(&tasklist_lock);
> +	spin_lock_irq(&tsk->sighand->siglock);
> +	if (!signal_group_exit(sig)) {
> +		sig->group_exit_task = tsk;
> +		sig->notify_count = -zap_other_threads(tsk);
> +		err = 0;
> +	}
> +	spin_unlock_irq(&tsk->sighand->siglock);
> +	read_unlock(&tasklist_lock);
> +
> +	if (!err)
> +		err = wait_for_notify_count(tsk, sig);
> +	return err;
> +
> +}
> +
>  /*
> - * This function makes sure the current process has its own signal table,
> - * so that flush_signal_handlers can later reset the handlers without
> - * disturbing other processes.  (Other processes might share the signal
> - * table via the CLONE_SIGHAND option to clone().)
> + * This function makes sure the current process has no other threads and
> + * has a private signal table so that flush_signal_handlers() can reset
> + * the handlers without disturbing other processes which might share the
> + * signal table via the CLONE_SIGHAND option to clone().
>   */
> -static int de_thread(struct task_struct *tsk)
> +int de_thread(struct task_struct *tsk)
>  {
>  	struct signal_struct *sig = tsk->signal;
>  	struct sighand_struct *oldsighand = tsk->sighand;
> @@ -1051,60 +1100,24 @@ static int de_thread(struct task_struct *tsk)
>  	if (thread_group_empty(tsk))
>  		goto no_thread_group;
>  
> -	/*
> -	 * Kill all other threads in the thread group.
> -	 */
>  	spin_lock_irq(lock);
> -	if (signal_group_exit(sig)) {
> -		/*
> -		 * Another group action in progress, just
> -		 * return so that the signal is processed.
> -		 */
> -		spin_unlock_irq(lock);
> -		return -EAGAIN;
> -	}
> -
> -	sig->group_exit_task = tsk;
> -	sig->notify_count = zap_other_threads(tsk);
> +	sig->notify_count = sig->nr_threads;


Maybe this should be nr_threads - 1, since nr_threads includes us?

+	sig->notify_count = sig->nr_threads - 1;



>  	if (!thread_group_leader(tsk))
>  		sig->notify_count--;
> -
> -	while (sig->notify_count) {
> -		__set_current_state(TASK_KILLABLE);
> -		spin_unlock_irq(lock);
> -		schedule();
> -		if (unlikely(__fatal_signal_pending(tsk)))
> -			goto killed;
> -		spin_lock_irq(lock);
> -	}
>  	spin_unlock_irq(lock);
>  
> +	if (wait_for_notify_count(tsk, sig))
> +		return -EINTR;
> +
>  	/*
>  	 * At this point all other threads have exited, all we have to
> -	 * do is to wait for the thread group leader to become inactive,
> -	 * and to assume its PID:
> +	 * do is to reap the old leader and assume its PID.
>  	 */
>  	if (!thread_group_leader(tsk)) {
>  		struct task_struct *leader = tsk->group_leader;
>  
> -		for (;;) {
> -			threadgroup_change_begin(tsk);
> -			write_lock_irq(&tasklist_lock);
> -			/*
> -			 * Do this under tasklist_lock to ensure that
> -			 * exit_notify() can't miss ->group_exit_task
> -			 */
> -			sig->notify_count = -1;
> -			if (likely(leader->exit_state))
> -				break;
> -			__set_current_state(TASK_KILLABLE);
> -			write_unlock_irq(&tasklist_lock);
> -			threadgroup_change_end(tsk);
> -			schedule();
> -			if (unlikely(__fatal_signal_pending(tsk)))
> -				goto killed;
> -		}
> -
> +		threadgroup_change_begin(tsk);
> +		write_lock_irq(&tasklist_lock);
>  		/*
>  		 * The only record we have of the real-time age of a
>  		 * process, regardless of execs it's done, is start_time.
> @@ -1162,10 +1175,9 @@ static int de_thread(struct task_struct *tsk)
>  		release_task(leader);
>  	}
>  
> +no_thread_group:
>  	sig->group_exit_task = NULL;
>  	sig->notify_count = 0;
> -
> -no_thread_group:
>  	/* we have changed execution domain */
>  	tsk->exit_signal = SIGCHLD;
>  
> @@ -1198,15 +1210,8 @@ static int de_thread(struct task_struct *tsk)
>  	}
>  
>  	BUG_ON(!thread_group_leader(tsk));
> +	flush_signal_handlers(current, 0);
>  	return 0;
> -
> -killed:
> -	/* protects against exit_notify() and __exit_signal() */
> -	read_lock(&tasklist_lock);
> -	sig->group_exit_task = NULL;
> -	sig->notify_count = 0;
> -	read_unlock(&tasklist_lock);
> -	return -EAGAIN;
>  }
>  
>  char *get_task_comm(char *buf, struct task_struct *tsk)
> @@ -1237,11 +1242,7 @@ int flush_old_exec(struct linux_binprm * bprm)
>  {
>  	int retval;
>  
> -	/*
> -	 * Make sure we have a private signal table and that
> -	 * we are unassociated from the previous thread group.
> -	 */
> -	retval = de_thread(current);
> +	retval = kill_sub_threads(current);
>  	if (retval)
>  		goto out;
>  
> @@ -1336,7 +1337,6 @@ void setup_new_exec(struct linux_binprm * bprm)
>  	/* An exec changes our domain. We are no longer part of the thread
>  	   group */
>  	current->self_exec_id++;
> -	flush_signal_handlers(current, 0);
>  }
>  EXPORT_SYMBOL(setup_new_exec);
>  
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 1303b57..06a5a7b 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -101,6 +101,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *);
>  extern int search_binary_handler(struct linux_binprm *);
>  extern int flush_old_exec(struct linux_binprm * bprm);
>  extern void setup_new_exec(struct linux_binprm * bprm);
> +extern int de_thread(struct task_struct *tsk);
>  extern void would_dump(struct linux_binprm *, struct file *);
>  
>  extern int suid_dumpable;
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 8f14b86..169d9f2 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -699,8 +699,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
>  	if (tsk->exit_state == EXIT_DEAD)
>  		list_add(&tsk->ptrace_entry, &dead);
>  
> -	/* mt-exec, de_thread() is waiting for group leader */
> -	if (unlikely(tsk->signal->notify_count < 0))
> +	/* mt-exec, kill_sub_threads() is waiting for group exit */
> +	if (unlikely(tsk->signal->notify_count < 0) &&
> +	    !++tsk->signal->notify_count)
>  		wake_up_process(tsk->signal->group_exit_task);
>  	write_unlock_irq(&tasklist_lock);
>  
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 3603d93..b78ce63 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1200,13 +1200,12 @@ int zap_other_threads(struct task_struct *p)
>  
>  	while_each_thread(p, t) {
>  		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
> -		count++;
> -
>  		/* Don't bother with already dead threads */
>  		if (t->exit_state)
>  			continue;
>  		sigaddset(&t->pending.signal, SIGKILL);
>  		signal_wake_up(t, 1);
> +		count++;
>  	}
>  
>  	return count;