linux-kernel - Re: [PATCH 2/5] sched,ptrace: Fix ptrace_check_attach() vs PREEMPT

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <YlikBjA3kL3XEQP5@hirez.programming.kicks-ass.net>
Date:   Fri, 15 Apr 2022 00:45:26 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Oleg Nesterov <oleg@...hat.com>
Cc:     rjw@...ysocki.net, mingo@...nel.org, vincent.guittot@...aro.org,
        dietmar.eggemann@....com, rostedt@...dmis.org, mgorman@...e.de,
        ebiederm@...ssion.com, bigeasy@...utronix.de,
        Will Deacon <will@...nel.org>, linux-kernel@...r.kernel.org,
        tj@...nel.org, linux-pm@...r.kernel.org
Subject: Re: [PATCH 2/5] sched,ptrace: Fix ptrace_check_attach() vs PREEMPT_RT

On Thu, Apr 14, 2022 at 08:34:33PM +0200, Oleg Nesterov wrote:
> On 04/14, Oleg Nesterov wrote:
> >
> > OK, let me think about it. Thanks!
> 
> So. what do you think about the patch below?

Might just work, will have to look at it again though ;-) Comments
below.

> If it can work, then 1/5 needs some changes, I think. In particular,
> it should not introduce JOBCTL_TRACED_FROZEN until 5/5, and perhaps

That TRACED_FROZEN was to distinguish the TASK_TRACED and __TASK_TRACED
state, and isn't related to the freezer.

> we can avoid this flag altogether...
> 
> This is how ptrace_check_attach() looks with the patch applied:
> 
> 	static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
> 	{
> 		int traced;
> 		/*
> 		 * We take the read lock around doing both checks to close a
> 		 * possible race where someone else attaches or detaches our
> 		 * natural child.
> 		 */
> 		read_lock(&tasklist_lock);
> 		traced = child->ptrace && child->parent == current;
> 		read_unlock(&tasklist_lock);
> 
> 		if (!traced)
> 			return -ESRCH;

The thing being, that if it is our ptrace child, it won't be going away
since we're running this code and not ptrace_detach().  Right?

I failed to realize this earlier and I think that's part of why my
solution is somewhat over complicated.

> 		WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
> 		if (ignore_state)
> 			return 0;
> 
> 		for (;;) {
> 			if (fatal_signal_pending(current))
> 				return -EINTR;

What if signal_wake_up(.resume=true) happens here? In that case we miss
the fatal pending, and task state isn't changed yet so we'll happily go
sleep.

> 			set_current_state(TASK_KILLABLE);
> 			if (!(READ_ONCE(child->jobctl) & JOBCTL_TRACED)) {

  TRACED_XXX ?

> 				__set_current_state(TASK_RUNNING);
> 				break;
> 			}
> 			schedule();
> 		}

That is, shouldn't we write this like:

		for (;;) {
			set_current_state(TASK_KILLABLE);
			if (fatal_signal_pending(current)) {
				ret = -EINTR;
				break;
			}
			if (!(READ_ONCE(current->jobctl) & JOBCTL_TRACED_XXX)) {
				ret = 0;
				break;
			}
			schedule();
		}
		__set_current_state(TASK_RUNNING);
		if (ret)
			return ret;

> 		if (!wait_task_inactive(child, TASK_TRACED) ||
> 		    !ptrace_freeze_traced(child))
> 			return -ESRCH;
> 
> 		return 0;
> 	}
> 
> Oleg.
> ---
> 
> diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
> index ec8b312f7506..1b5a57048e13 100644
> --- a/include/linux/sched/jobctl.h
> +++ b/include/linux/sched/jobctl.h
> @@ -22,7 +22,8 @@ struct task_struct;
>  
>  #define JOBCTL_STOPPED_BIT	24
>  #define JOBCTL_TRACED_BIT	25
> -#define JOBCTL_TRACED_FROZEN_BIT 26
> +#define JOBCTL_TRACED_XXX_BIT	25
> +#define JOBCTL_TRACED_FROZEN_BIT 27
>  
>  #define JOBCTL_STOP_DEQUEUED	(1UL << JOBCTL_STOP_DEQUEUED_BIT)
>  #define JOBCTL_STOP_PENDING	(1UL << JOBCTL_STOP_PENDING_BIT)
> @@ -35,6 +36,7 @@ struct task_struct;
>  
>  #define JOBCTL_STOPPED		(1UL << JOBCTL_STOPPED_BIT)
>  #define JOBCTL_TRACED		(1UL << JOBCTL_TRACED_BIT)
> +#define JOBCTL_TRACED_XXX	(1UL << JOBCTL_TRACED_XXX_BIT)
>  #define JOBCTL_TRACED_FROZEN	(1UL << JOBCTL_TRACED_FROZEN_BIT)
>  
>  #define JOBCTL_TRAP_MASK	(JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 626f96d275c7..5a03ae5cb0c0 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -193,20 +193,22 @@ static bool looks_like_a_spurious_pid(struct task_struct *task)
>   */
>  static bool ptrace_freeze_traced(struct task_struct *task)
>  {
> +	unsigned long flags;
>  	bool ret = false;
>  
>  	/* Lockless, nobody but us can set this flag */
>  	if (task->jobctl & JOBCTL_LISTENING)
>  		return ret;
> +	if (!lock_task_sighand(task, &flags))
> +		return ret;
>  
> -	spin_lock_irq(&task->sighand->siglock);
>  	if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
>  	    !__fatal_signal_pending(task)) {
>  		task->jobctl |= JOBCTL_TRACED_FROZEN;
>  		WRITE_ONCE(task->__state, __TASK_TRACED);
>  		ret = true;
>  	}

I would feel much better if this were still a task_func_call()
validating !->on_rq && !->on_cpu.

> -	spin_unlock_irq(&task->sighand->siglock);
> +	unlock_task_sighand(task, &flags);
>  
>  	return ret;
>  }
> @@ -253,40 +255,39 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
>   */
>  static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
>  {
> -	int ret = -ESRCH;
> -
> +	int traced;
>  	/*
>  	 * We take the read lock around doing both checks to close a
> -	 * possible race where someone else was tracing our child and
> -	 * detached between these two checks.  After this locked check,
> -	 * we are sure that this is our traced child and that can only
> -	 * be changed by us so it's not changing right after this.
> +	 * possible race where someone else attaches or detaches our
> +	 * natural child.
>  	 */
>  	read_lock(&tasklist_lock);
> -	if (child->ptrace && child->parent == current) {
> -		WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
> -		/*
> -		 * child->sighand can't be NULL, release_task()
> -		 * does ptrace_unlink() before __exit_signal().
> -		 */
> -		if (ignore_state || ptrace_freeze_traced(child))
> -			ret = 0;
> -	}
> +	traced = child->ptrace && child->parent == current;
>  	read_unlock(&tasklist_lock);
>  
> -	if (!ret && !ignore_state) {
> -		if (!wait_task_inactive(child, __TASK_TRACED)) {
> -			/*
> -			 * This can only happen if may_ptrace_stop() fails and
> -			 * ptrace_stop() changes ->state back to TASK_RUNNING,
> -			 * so we should not worry about leaking __TASK_TRACED.
> -			 */
> -			WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
> -			ret = -ESRCH;
> +	if (!traced)
> +		return -ESRCH;
> +
> +	WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
> +	if (ignore_state)
> +		return 0;
> +
> +	for (;;) {
> +		if (fatal_signal_pending(current))
> +			return -EINTR;
> +		set_current_state(TASK_KILLABLE);
> +		if (!(READ_ONCE(child->jobctl) & JOBCTL_TRACED)) {
> +			__set_current_state(TASK_RUNNING);
> +			break;
>  		}
> +		schedule();
>  	}
>  
> -	return ret;
> +	if (!wait_task_inactive(child, TASK_TRACED) ||
> +	    !ptrace_freeze_traced(child))
> +		return -ESRCH;
> +
> +	return 0;
>  }
>  
>  static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 0aea3f0a8002..684f0a0e9c71 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2182,6 +2182,14 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
>  	spin_unlock_irqrestore(&sighand->siglock, flags);
>  }
>  
> +static void clear_traced_xxx(void)
> +{
> +	spin_lock_irq(&current->sighand->siglock);
> +	current->jobctl &= ~JOBCTL_TRACED_XXX;
> +	spin_unlock_irq(&current->sighand->siglock);
> +	wake_up_state(current->parent, TASK_KILLABLE);
> +}
> +
>  /*
>   * This must be called with current->sighand->siglock held.
>   *
> @@ -2220,7 +2228,7 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
>  	 * schedule() will not sleep if there is a pending signal that
>  	 * can awaken the task.
>  	 */
> -	current->jobctl |= JOBCTL_TRACED;
> +	current->jobctl |= JOBCTL_TRACED | JOBCTL_TRACED_XXX;
>  	set_special_state(TASK_TRACED);
>  
>  	/*
> @@ -2282,6 +2290,7 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
>  		if (gstop_done && ptrace_reparented(current))
>  			do_notify_parent_cldstop(current, false, why);
>  
> +		clear_traced_xxx();
>  		/*
>  		 * Don't want to allow preemption here, because
>  		 * sys_ptrace() needs this task to be inactive.
> @@ -2296,9 +2305,6 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
>  		cgroup_leave_frozen(true);
>  	} else {
>  		/*
> -		 * By the time we got the lock, our tracer went away.
> -		 * Don't drop the lock yet, another tracer may come.
> -		 *
>  		 * If @gstop_done, the ptracer went away between group stop
>  		 * completion and here.  During detach, it would have set
>  		 * JOBCTL_STOP_PENDING on us and we'll re-enter
> @@ -2307,13 +2313,14 @@ static int ptrace_stop(int exit_code, int why, int clear_code,
>  		 */
>  		if (gstop_done)
>  			do_notify_parent_cldstop(current, false, why);
> +		clear_traced_xxx();
> +		read_unlock(&tasklist_lock);
>  
> -		/* tasklist protects us from ptrace_freeze_traced() */
> +		/* JOBCTL_TRACED_XXX protects us from ptrace_freeze_traced() */

But... TRACED_XXX has just been cleared ?!

>  		__set_current_state(TASK_RUNNING);
>  		read_code = false;
>  		if (clear_code)
>  			exit_code = 0;
> -		read_unlock(&tasklist_lock);
>  	}
>  
>  	/*
>