linux-kernel - Re: [RFC 2/2] signal: extend pidfd_send_signal() to allow expedited process killing

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190411103018.tcsinifuj7klh6rp@brauner.io>
Date:   Thu, 11 Apr 2019 12:30:19 +0200
From:   Christian Brauner <christian@...uner.io>
To:     Suren Baghdasaryan <surenb@...gle.com>
Cc:     akpm@...ux-foundation.org, mhocko@...e.com, rientjes@...gle.com,
        willy@...radead.org, yuzhoujian@...ichuxing.com,
        jrdr.linux@...il.com, guro@...com, hannes@...xchg.org,
        penguin-kernel@...ove.SAKURA.ne.jp, ebiederm@...ssion.com,
        shakeelb@...gle.com, minchan@...nel.org, timmurray@...gle.com,
        dancol@...gle.com, joel@...lfernandes.org, jannh@...gle.com,
        linux-mm@...ck.org, lsf-pc@...ts.linux-foundation.org,
        linux-kernel@...r.kernel.org, kernel-team@...roid.com
Subject: Re: [RFC 2/2] signal: extend pidfd_send_signal() to allow expedited
 process killing

On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> pidfd_send_signal() syscall to allow expedited memory reclaim of the
> victim process. The usage of this flag is currently limited to SIGKILL
> signal and only to privileged users.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
> ---
>  include/linux/sched/signal.h |  3 ++-
>  include/linux/signal.h       | 11 ++++++++++-
>  ipc/mqueue.c                 |  2 +-
>  kernel/signal.c              | 37 ++++++++++++++++++++++++++++--------
>  kernel/time/itimer.c         |  2 +-
>  5 files changed, 43 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index e412c092c1e8..8a227633a058 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
>  extern void force_sigsegv(int sig, struct task_struct *p);
>  extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
>  extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
> -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
> +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> +				bool expedite);
>  extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
>  				const struct cred *);
>  extern int kill_pgrp(struct pid *pid, int sig, int priv);
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 9702016734b1..34b7852aa4a0 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long);
>  } while (0);
>  
>  #ifdef CONFIG_PROC_FS
> +
> +/*
> + * SS_FLAGS values used in pidfd_send_signal:
> + *
> + * SS_EXPEDITE indicates desire to expedite the operation.
> + */
> +#define SS_EXPEDITE	0x00000001

Does this make sense as an SS_* flag?
How does this relate to the signal stack?
Is there any intention to ever use this flag with stack_t?

New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be
PIDFD_SIGNAL_THREAD.)
And since this is exposed to userspace, in contrast to the mm-internal
naming, it should be something more easily understandable, such as
PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or similar.

> +
>  struct seq_file;
>  extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
> -#endif
> +
> +#endif /* CONFIG_PROC_FS */
>  
>  #endif /* _LINUX_SIGNAL_H */
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index aea30530c472..27c66296e08e 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info)
>  			rcu_read_unlock();
>  
>  			kill_pid_info(info->notify.sigev_signo,
> -				      &sig_i, info->notify_owner);
> +				      &sig_i, info->notify_owner, false);
>  			break;
>  		case SIGEV_THREAD:
>  			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
> diff --git a/kernel/signal.c b/kernel/signal.c
> index f98448cf2def..02ed4332d17c 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -43,6 +43,7 @@
>  #include <linux/compiler.h>
>  #include <linux/posix-timers.h>
>  #include <linux/livepatch.h>
> +#include <linux/oom.h>
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/signal.h>
> @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
>  	return success ? 0 : retval;
>  }
>  
> -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> +				  bool expedite)
>  {
>  	int error = -ESRCH;
>  	struct task_struct *p;
> @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
>  	for (;;) {
>  		rcu_read_lock();
>  		p = pid_task(pid, PIDTYPE_PID);
> -		if (p)
> +		if (p) {
>  			error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
> +
> +			/*
> +			 * Ignore expedite_reclaim return value, it is best
> +			 * effort only.
> +			 */
> +			if (!error && expedite)
> +				expedite_reclaim(p);

SIGKILL will take the whole thread group down, so expediting the reclaim
at this point makes sense.

> +		}
> +
>  		rcu_read_unlock();
>  		if (likely(!p || error != -ESRCH))
>  			return error;
> @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
>  {
>  	int error;
>  	rcu_read_lock();
> -	error = kill_pid_info(sig, info, find_vpid(pid));
> +	error = kill_pid_info(sig, info, find_vpid(pid), false);
>  	rcu_read_unlock();
>  	return error;
>  }
> @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
>  
>  	if (pid > 0) {
>  		rcu_read_lock();
> -		ret = kill_pid_info(sig, info, find_vpid(pid));
> +		ret = kill_pid_info(sig, info, find_vpid(pid), false);
>  		rcu_read_unlock();
>  		return ret;
>  	}
> @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp);
>  
>  int kill_pid(struct pid *pid, int sig, int priv)
>  {
> -	return kill_pid_info(sig, __si_special(priv), pid);
> +	return kill_pid_info(sig, __si_special(priv), pid, false);
>  }
>  EXPORT_SYMBOL(kill_pid);
>  
> @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
>  	struct pid *pid;
>  	kernel_siginfo_t kinfo;
>  
> -	/* Enforce flags be set to 0 until we add an extension. */
> -	if (flags)
> +	/* Enforce no unknown flags. */
> +	if (flags & ~SS_EXPEDITE)
>  		return -EINVAL;
>  
> +	if (flags & SS_EXPEDITE) {
> +		/* Enforce SS_EXPEDITE to be used with SIGKILL only. */
> +		if (sig != SIGKILL)
> +			return -EINVAL;

Not super fond of this being a SIGKILL-specific flag, but I get why.

> +
> +		/* Limit expedited killing to privileged users only. */
> +		if (!capable(CAP_SYS_NICE))
> +			return -EPERM;

Do you have a specific attack vector (DoS or otherwise) in mind that
renders ns_capable() unsuitable for this CAP_SYS_NICE check?

> +	}
> +
>  	f = fdget_raw(pidfd);
>  	if (!f.file)
>  		return -EBADF;
> @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
>  		prepare_kill_siginfo(sig, &kinfo);
>  	}
>  
> -	ret = kill_pid_info(sig, &kinfo, pid);
> +	ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0);
>  
>  err:
>  	fdput(f);
> diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
> index 02068b2d5862..c926483cdb53 100644
> --- a/kernel/time/itimer.c
> +++ b/kernel/time/itimer.c
> @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
>  	struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
>  
>  	trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
> -	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
> +	kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false);
>  
>  	return HRTIMER_NORESTART;
>  }
> -- 
> 2.21.0.392.gf8f6787159e-goog
>