linux-kernel - Re: [patch] sched: fix b5d9d734 blunder in task_new

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1259070683.4531.1476.camel@laptop>
Date:	Tue, 24 Nov 2009 14:51:23 +0100
From:	Peter Zijlstra <peterz@...radead.org>
To:	Mike Galbraith <efault@....de>
Cc:	Ingo Molnar <mingo@...e.hu>, LKML <linux-kernel@...r.kernel.org>
Subject: Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()

On Sun, 2009-11-22 at 13:07 +0100, Mike Galbraith wrote:
> sched: fix b5d9d734 blunder in task_new_fair()
> 
> b5d9d734 fixed the problem of a forking task's child gaining vruntime..
> IFF the child/parent shared a runqueue.  In the other case, it broke
> fairness all to pieces by setting the child's vruntime to whatever task
> happened to be current on the child's runqueue at wakeup time.  Fix this
> by adding a sched_class::task_new parameter to give the class a chance to
> prepare the child for wakeup.
> 
> At child wakeup time, call task_new() with the parent's rq locked as the
> comment in task new states, update the parent's stats (which must be done
> with the rq locked), call task_new() to prepare the child, unlock parent rq,
> select a runqueue a runqueue for the child, _then_ set_task_cpu() with the
> child's vruntime set properly and both runqueue clocks updated to get the
> current offset.  Lock child's rq and proceed with wakeup.
> 
> Also, since setting scheduling policy requires the tasklist lock, move
> sched_fork() under the tasklist lock in copy_process();

OK, so hopefully I managed to untangle this...

>  include/linux/sched.h |    2 +-
>  kernel/fork.c         |    6 +++---
>  kernel/sched.c        |   43 ++++++++++++++++++++++++++++---------------
>  kernel/sched_fair.c   |   17 +++++++++++------
>  4 files changed, 43 insertions(+), 25 deletions(-)
> 
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -1925,20 +1925,23 @@ static void task_tick_fair(struct rq *rq
>   * Share the fairness runtime between parent and child, thus the
>   * total amount of pressure for CPU stays equal - new tasks
>   * get a chance to run but frequent forkers are not allowed to
> - * monopolize the CPU. Note: the parent runqueue is locked,
> - * the child is not running yet.
> + * monopolize the CPU. Note: the parent runqueue is locked at
> + * prep time, the child is not running yet.  At wakeup time,
> + * the clild's runqueue is locked.
>   */
> -static void task_new_fair(struct rq *rq, struct task_struct *p)
> +static void task_new_fair(struct rq *rq, struct task_struct *p, int prep)
>  {
>  	struct cfs_rq *cfs_rq = task_cfs_rq(p);
>  	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
>  	int this_cpu = smp_processor_id();
>  
> -	sched_info_queued(p);
> -
>  	update_curr(cfs_rq);
> -	if (curr)
> +
> +	if (prep && curr) {
>  		se->vruntime = curr->vruntime;
> +		return;
> +	}
> +
>  	place_entity(cfs_rq, se, 1);
>  
>  	/* 'curr' will be NULL if the child belongs to a different group */
> @@ -1953,6 +1956,8 @@ static void task_new_fair(struct rq *rq,
>  	}
>  
>  	enqueue_task_fair(rq, p, 0);
> +
> +	sched_info_queued(p);
>  }

I don't think we need to call this twice, but see below.

> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -2558,7 +2558,6 @@ static void __sched_fork(struct task_str
>  void sched_fork(struct task_struct *p, int clone_flags)
>  {
>  	int cpu = get_cpu();
> -	unsigned long flags;
>  
>  	__sched_fork(p);
>  
> @@ -2566,7 +2565,7 @@ void sched_fork(struct task_struct *p, i
>  	 * Revert to default priority/policy on fork if requested.
>  	 */
>  	if (unlikely(p->sched_reset_on_fork)) {
> -		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
> +		if (task_has_rt_policy(p)) {
>  			p->policy = SCHED_NORMAL;
>  			p->normal_prio = p->static_prio;
>  		}

While a nice change, it shouldn't have been mixed in I think.

> @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
>  	 */
>  	p->prio = current->normal_prio;
>  
> -	if (!rt_prio(p->prio))
> +	if (!task_has_rt_policy(p))
>  		p->sched_class = &fair_sched_class;

And I suspect this one is actually buggy, see how rt_mutex_setprio()
only changes ->prio and ->sched_class, but leaves ->policy to the
original value?

> -#ifdef CONFIG_SMP
> -	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> -#endif
> -	local_irq_save(flags);
> -	update_rq_clock(cpu_rq(cpu));
> -	set_task_cpu(p, cpu);
> -	local_irq_restore(flags);
> +	__set_task_cpu(p, cpu);
>  
>  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
>  	if (likely(sched_info_on()))

Remove cpu selection from sched_fork(), seems the sane thing to do.

> @@ -2625,21 +2618,40 @@ void sched_fork(struct task_struct *p, i
>   */
>  void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
>  {
> +	int cpu = get_cpu();
>  	unsigned long flags;
> -	struct rq *rq;
> +	struct task_struct *parent = current;
> +	struct rq *rq, *orig_rq;
>  
> -	rq = task_rq_lock(p, &flags);
> +	smp_wmb();
> +	rq = orig_rq = task_rq_lock(parent, &flags);
>  	BUG_ON(p->state != TASK_RUNNING);
> -	update_rq_clock(rq);
> +	update_rq_clock(orig_rq);
>  
> -	if (!p->sched_class->task_new || !current->se.on_rq) {
> +	if (p->sched_class->task_new)
> +		p->sched_class->task_new(orig_rq, p, 1);
> +#ifdef CONFIG_SMP
> +	p->state = TASK_WAKING;
> +	__task_rq_unlock(orig_rq);
> +	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> +	rq = cpu_rq(cpu);
> +	if (rq != orig_rq) {
> +		update_rq_clock(rq);
> +		set_task_cpu(p, cpu);
> +	}
> +	__task_rq_lock(p);

[ should've been: rq == __task_rq_lock(p), because as we didn't hold the
rq->lock the task could have actually been migrated again after
set_task_cpu() ]

> +	WARN_ON(p->state != TASK_WAKING);
> +	p->state = TASK_RUNNING;
> +#endif
> +
> +	if (!p->sched_class->task_new || !parent->se.on_rq) {
>  		activate_task(rq, p, 0);
>  	} else {
>  		/*
>  		 * Let the scheduling class do new task startup
>  		 * management (if any):
>  		 */
> -		p->sched_class->task_new(rq, p);
> +		p->sched_class->task_new(rq, p, 0);
>  		inc_nr_running(rq);
>  	}
>  	trace_sched_wakeup_new(rq, p, 1);
> @@ -2649,6 +2661,7 @@ void wake_up_new_task(struct task_struct
>  		p->sched_class->task_wake_up(rq, p);
>  #endif
>  	task_rq_unlock(rq, &flags);
> +	put_cpu();
>  }

OK, so the general idea seems to be to call task_new(.prep=1) to update
and copy vruntime from the parent to the child _before_ we muck about
and move the child over to another cpu.

Then we muck about and move the thing to another cpu.

Then we call it again with .prep=0 to actually enqueue the thing.

So, the whole point of ->task_new() was to be able to poke at ->vruntime
before the regular enqueue, I think we folded the enqueue in in order to
avoid two class calls. But if you're going to do two calls, we might as
well use ->task_new() and ->enqueue_task() aka activate_task().

This leaves the problem that task_new() behaviour depends on knowing the
target cpu, could we solve that by relying on the fact that we're
executing on the original cpu, something like:

void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq, *orig_rq;
	int cpu = get_cpu();

	rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	update_rq_clock(rq);

#ifdef CONFIG_SMP
	p->state = TASK_WAKING;
	__task_rq_unlock(rq);
	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
	rq = cpu_rq(cpu);
	if (rq != orig_rq) {
		update_rq_clock(rq);
		set_task_cpu(p, cpu);
	}
	rq = __task_rq_lock(p);
	WARN_ON(p->state != TASK_WAKING);
	p->state = TASK_RUNNING;
#endif

	if (p->sched_class->task_new) {
		/* can detect migration through: task_cpu(p) != smp_processor_id() */
		p->sched_class->task_new(rq, p);
	}

	activate_task(rq, p, 0);

	trace_sched_wakeup_new(rq, p, 1);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
	task_rq_unlock(rq, &flags);
	put_cpu()
}

> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c
> +++ linux-2.6/kernel/fork.c
> @@ -1125,9 +1125,6 @@ static struct task_struct *copy_process(
>  
>  	p->stack_start = stack_start;
>  
> -	/* Perform scheduler related setup. Assign this task to a CPU. */
> -	sched_fork(p, clone_flags);
> -
>  	retval = perf_event_init_task(p);
>  	if (retval)
>  		goto bad_fork_cleanup_policy;
> @@ -1229,6 +1226,9 @@ static struct task_struct *copy_process(
>  	/* Need tasklist lock for parent etc handling! */
>  	write_lock_irq(&tasklist_lock);
>  
> +	/* Perform scheduler related setup. Assign this task to a CPU. */
> +	sched_fork(p, clone_flags);
> +

You just invalidated that comment ;-)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/