[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20181013104446.b3z26rwmiripqfks@gmail.com>
Date:   Sat, 13 Oct 2018 12:44:48 +0200
From:   Christian Brauner <christian.brauner@...onical.com>
To:     Enke Chen <enkechen@...co.com>
Cc:     Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        "H. Peter Anvin" <hpa@...or.com>, x86@...nel.org,
        Peter Zijlstra <peterz@...radead.org>,
        Arnd Bergmann <arnd@...db.de>,
        "Eric W. Biederman" <ebiederm@...ssion.com>,
        Khalid Aziz <khalid.aziz@...cle.com>,
        Kate Stewart <kstewart@...uxfoundation.org>,
        Helge Deller <deller@....de>,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        Al Viro <viro@...iv.linux.org.uk>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Christian Brauner <christian@...uner.io>,
        Catalin Marinas <catalin.marinas@....com>,
        Will Deacon <will.deacon@....com>,
        Dave Martin <Dave.Martin@....com>,
        Mauro Carvalho Chehab <mchehab+samsung@...nel.org>,
        Michal Hocko <mhocko@...nel.org>,
        Rik van Riel <riel@...riel.com>,
        "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
        Roman Gushchin <guro@...com>,
        Marcos Paulo de Souza <marcos.souza.org@...il.com>,
        Oleg Nesterov <oleg@...hat.com>,
        Dominik Brodowski <linux@...inikbrodowski.net>,
        Cyrill Gorcunov <gorcunov@...nvz.org>,
        Yang Shi <yang.shi@...ux.alibaba.com>,
        Jann Horn <jannh@...gle.com>,
        Kees Cook <keescook@...omium.org>,
        linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org,
        "Victor Kamensky (kamensky)" <kamensky@...co.com>,
        xe-linux-external@...co.com, Stefan Strogin <sstrogin@...co.com>
Subject: Re: [PATCH] kernel/signal: Signal-based pre-coredump notification
On Fri, Oct 12, 2018 at 05:33:35PM -0700, Enke Chen wrote:
> For simplicity and consistency, this patch provides an implementation
> for signal-based fault notification prior to the coredump of a child
> process. A new prctl command, PR_SET_PREDUMP_SIG, is defined that can
> be used by an application to express its interest and to specify the
> signal (SIGCHLD or SIGUSR1 or SIGUSR2) for such a notification. A new
> signal code (si_code), CLD_PREDUMP, is also defined for SIGCHLD.
> 
> Background:
> 
> As the coredump of a process may take time, in certain time-sensitive
> applications it is necessary for a parent process (e.g., a process
> manager) to be notified of a child's imminent death before the coredump
> so that the parent process can act sooner, such as re-spawning an
> application process, or initiating a control-plane fail-over.
> 
> Currently there are two ways for a parent process to be notified of a
> child process's state change. One is to use the POSIX signal, and
> another is to use the kernel connector module. The specific events and
> actions are summarized as follows:
> 
> Process Event    POSIX Signal                Connector-based
> ----------------------------------------------------------------------
> ptrace_attach()  do_notify_parent_cldstop()  proc_ptrace_connector()
>                  SIGCHLD / CLD_STOPPED
> 
> ptrace_detach()  do_notify_parent_cldstop()  proc_ptrace_connector()
>                  SIGCHLD / CLD_CONTINUED
> 
> pre_coredump/    N/A                         proc_coredump_connector()
> get_signal()
> 
> post_coredump/   do_notify_parent()          proc_exit_connector()
> do_exit()        SIGCHLD / exit_signal
> ----------------------------------------------------------------------
> 
> As shown in the table, the signal-based pre-coredump notification is not
> currently available. In some cases using a connector-based notification
> can be quite complicated (e.g., when a process manager is written in shell
> scripts and thus is subject to certain inherent limitations), and a
> signal-based notification would be simpler and better suited.
> 
> Signed-off-by: Enke Chen <enkechen@...co.com>
> ---
>  arch/x86/kernel/signal_compat.c    |  2 +-
>  include/linux/sched.h              |  4 ++
>  include/linux/signal.h             |  5 +++
>  include/uapi/asm-generic/siginfo.h |  3 +-
>  include/uapi/linux/prctl.h         |  4 ++
>  kernel/fork.c                      |  1 +
>  kernel/signal.c                    | 51 +++++++++++++++++++++++++
>  kernel/sys.c                       | 77 ++++++++++++++++++++++++++++++++++++++
>  8 files changed, 145 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
> index 9ccbf05..a3deba8 100644
> --- a/arch/x86/kernel/signal_compat.c
> +++ b/arch/x86/kernel/signal_compat.c
> @@ -30,7 +30,7 @@ static inline void signal_compat_build_tests(void)
>  	BUILD_BUG_ON(NSIGSEGV != 7);
>  	BUILD_BUG_ON(NSIGBUS  != 5);
>  	BUILD_BUG_ON(NSIGTRAP != 5);
> -	BUILD_BUG_ON(NSIGCHLD != 6);
> +	BUILD_BUG_ON(NSIGCHLD != 7);
>  	BUILD_BUG_ON(NSIGSYS  != 1);
>  
>  	/* This is part of the ABI and can never change in size: */
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 09026ea..cfb9645 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -696,6 +696,10 @@ struct task_struct {
>  	int				exit_signal;
>  	/* The signal sent when the parent dies: */
>  	int				pdeath_signal;
> +
> +	/* The signal sent prior to a child's coredump: */
> +	int				predump_signal;
> +
>  	/* JOBCTL_*, siglock protected: */
>  	unsigned long			jobctl;
>  
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 706a499..7cb976d 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -256,6 +256,11 @@ static inline int valid_signal(unsigned long sig)
>  	return sig <= _NSIG ? 1 : 0;
>  }
>  
> +static inline int valid_predump_signal(int sig)
> +{
> +	return (sig == SIGCHLD) || (sig == SIGUSR1) || (sig == SIGUSR2);
> +}
> +
>  struct timespec;
>  struct pt_regs;
>  enum pid_type;
> diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> index cb3d6c2..1a47cef 100644
> --- a/include/uapi/asm-generic/siginfo.h
> +++ b/include/uapi/asm-generic/siginfo.h
> @@ -267,7 +267,8 @@ struct {				\
>  #define CLD_TRAPPED	4	/* traced child has trapped */
>  #define CLD_STOPPED	5	/* child has stopped */
>  #define CLD_CONTINUED	6	/* stopped child has continued */
> -#define NSIGCHLD	6
> +#define CLD_PREDUMP	7	/* child is about to dump core */
> +#define NSIGCHLD	7
>  
>  /*
>   * SIGPOLL (or any other signal without signal specific si_codes) si_codes
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index c0d7ea0..79f0a8a 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -219,4 +219,8 @@ struct prctl_mm_map {
>  # define PR_SPEC_DISABLE		(1UL << 2)
>  # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
>  
> +/* Whether to receive signal prior to child's coredump */
> +#define PR_SET_PREDUMP_SIG	54
> +#define PR_GET_PREDUMP_SIG	55
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 07cddff..c296c11 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1985,6 +1985,7 @@ static __latent_entropy struct task_struct *copy_process(
>  	p->dirty_paused_when = 0;
>  
>  	p->pdeath_signal = 0;
> +	p->predump_signal = 0;
>  	INIT_LIST_HEAD(&p->thread_group);
>  	p->task_works = NULL;
>  
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 312b43e..eb4a483 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2337,6 +2337,44 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info)
>  	return signr;
>  }
>  
> +/*
> + * Let the parent, if so desired, know about the imminent death of a child
> + * prior to its coredump.
> + *
> + * Locking logic is similar to do_notify_parent_cldstop().
> + */
> +static void do_notify_parent_predump(struct task_struct *tsk)
> +{
> +	struct sighand_struct *sighand;
> +	struct task_struct *parent;
> +	struct kernel_siginfo info;
> +	unsigned long flags;
> +	int sig;
> +
> +	parent = tsk->real_parent;
> +	sig = parent->predump_signal;
> +
> +	/* Check again with "tasklist_lock" locked by the caller */
> +	if (!valid_predump_signal(sig))
> +		return;
> +
> +	clear_siginfo(&info);
> +	info.si_signo = sig;
> +	if (sig == SIGCHLD)
> +		info.si_code = CLD_PREDUMP;
> +
> +	rcu_read_lock();
> +	info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
> +	info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns),
> +				       task_uid(tsk));
> +	rcu_read_unlock();
> +
> +	sighand = parent->sighand;
> +	spin_lock_irqsave(&sighand->siglock, flags);
> +	__group_send_sig_info(sig, &info, parent);
> +	spin_unlock_irqrestore(&sighand->siglock, flags);
> +}
> +
>  bool get_signal(struct ksignal *ksig)
>  {
>  	struct sighand_struct *sighand = current->sighand;
> @@ -2497,6 +2535,19 @@ bool get_signal(struct ksignal *ksig)
>  		current->flags |= PF_SIGNALED;
>  
>  		if (sig_kernel_coredump(signr)) {
> +			/*
> +			 * Notify the parent prior to the coredump if the
> +			 * parent is interested in such a notificaiton.
> +			 */
> +			int p_sig = current->real_parent->predump_signal;
> +
> +			if (valid_predump_signal(p_sig)) {
> +				read_lock(&tasklist_lock);
> +				do_notify_parent_predump(current);
> +				read_unlock(&tasklist_lock);
> +				cond_resched();
> +			}
> +
>  			if (print_fatal_signals)
>  				print_fatal_signal(ksig->info.si_signo);
>  			proc_coredump_connector(current);
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 123bd73..43eb250d 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2258,6 +2258,76 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
>  	return -EINVAL;
>  }
>  
> +static int prctl_get_predump_signal(struct task_struct *tsk, pid_t pid,
> +				    int __user *addr)
> +{
> +	struct task_struct *p;
> +	int error;
> +
> +	/* For the current task, the common case */
> +	if (pid == 0) {
> +		put_user(tsk->predump_signal, addr);
> +		return 0;
> +	}
> +
> +	error = -ESRCH;
> +	rcu_read_lock();
> +	p = find_task_by_vpid(pid);
> +	if (p) {
> +		error = 0;
> +		put_user(p->predump_signal, addr);
> +	}
> +	rcu_read_unlock();
> +	return error;
> +}
> +
> +/*
> + * Returns true if current's euid is same as p's uid or euid,
> + * or has CAP_SYS_ADMIN.
> + *
> + * Called with rcu_read_lock, creds are safe.
> + *
> + * Adapted from set_one_prio_perm().
> + */
> +static bool set_predump_signal_perm(struct task_struct *p)
> +{
> +	const struct cred *cred = current_cred(), *pcred = __task_cred(p);
> +
> +	return uid_eq(pcred->uid, cred->euid) ||
> +	       uid_eq(pcred->euid, cred->euid) ||
> +	       capable(CAP_SYS_ADMIN);
So before proceeding I'd like to discuss at least two points:
- how does this interact with the dumpability of a process?
- do we need the capable(CAP_SYS_ADMIN) restriction to init_user_ns?
  Seems we could make this work per-user-ns just like
  PRCTL_SET_PDEATHSIG does?
> +}
> +
> +static int prctl_set_predump_signal(struct task_struct *tsk, pid_t pid, int sig)
> +{
> +	struct task_struct *p;
> +	int error;
> +
> +	/* 0 is valid for disabling the feature */
> +	if (sig && !valid_predump_signal(sig))
> +		return -EINVAL;
> +
> +	/* For the current task, the common case */
> +	if (pid == 0) {
> +		tsk->predump_signal = sig;
> +		return 0;
> +	}
> +
> +	error = -ESRCH;
> +	rcu_read_lock();
> +	p = find_task_by_vpid(pid);
> +	if (p) {
> +		if (!set_predump_signal_perm(p))
> +			error = -EPERM;
> +		else {
> +			error = 0;
> +			p->predump_signal = sig;
> +		}
> +	}
> +	rcu_read_unlock();
> +	return error;
> +}
> +
>  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  		unsigned long, arg4, unsigned long, arg5)
>  {
> @@ -2476,6 +2546,13 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
>  			return -EINVAL;
>  		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
>  		break;
> +	case PR_SET_PREDUMP_SIG:
> +		error = prctl_set_predump_signal(me, (pid_t)arg2, (int)arg3);
> +		break;
> +	case PR_GET_PREDUMP_SIG:
> +		error = prctl_get_predump_signal(me, (pid_t)arg2,
> +						 (int __user *)arg3);
> +		break;
>  	default:
>  		error = -EINVAL;
>  		break;
> -- 
> 1.8.3.1
Powered by blists - more mailing lists
 
