Message-ID: <1259070683.4531.1476.camel@laptop>
Date: Tue, 24 Nov 2009 14:51:23 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Mike Galbraith <efault@....de>
Cc: Ingo Molnar <mingo@...e.hu>, LKML <linux-kernel@...r.kernel.org>
Subject: Re: [patch] sched: fix b5d9d734 blunder in task_new_fair()
On Sun, 2009-11-22 at 13:07 +0100, Mike Galbraith wrote:
> sched: fix b5d9d734 blunder in task_new_fair()
>
> b5d9d734 fixed the problem of a forking task's child gaining vruntime..
> IFF the child/parent shared a runqueue. In the other case, it broke
> fairness all to pieces by setting the child's vruntime to whatever task
> happened to be current on the child's runqueue at wakeup time. Fix this
> by adding a sched_class::task_new parameter to give the class a chance to
> prepare the child for wakeup.
>
> At child wakeup time, with the parent's rq locked as the comment in
> task_new() states, update the parent's stats (which must be done with
> the rq locked) and call task_new() to prepare the child; then unlock the
> parent's rq, select a runqueue for the child, _then_ set_task_cpu() with
> the child's vruntime set properly and both runqueue clocks updated to get
> the current offset. Lock the child's rq and proceed with wakeup.
>
> Also, since setting the scheduling policy requires the tasklist lock, move
> sched_fork() under the tasklist lock in copy_process().
OK, so hopefully I managed to untangle this...
> include/linux/sched.h | 2 +-
> kernel/fork.c | 6 +++---
> kernel/sched.c | 43 ++++++++++++++++++++++++++++---------------
> kernel/sched_fair.c | 17 +++++++++++------
> 4 files changed, 43 insertions(+), 25 deletions(-)
>
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -1925,20 +1925,23 @@ static void task_tick_fair(struct rq *rq
> * Share the fairness runtime between parent and child, thus the
> * total amount of pressure for CPU stays equal - new tasks
> * get a chance to run but frequent forkers are not allowed to
> - * monopolize the CPU. Note: the parent runqueue is locked,
> - * the child is not running yet.
> + * monopolize the CPU. Note: the parent runqueue is locked at
> + * prep time, the child is not running yet. At wakeup time,
> + * the child's runqueue is locked.
> */
> -static void task_new_fair(struct rq *rq, struct task_struct *p)
> +static void task_new_fair(struct rq *rq, struct task_struct *p, int prep)
> {
> struct cfs_rq *cfs_rq = task_cfs_rq(p);
> struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
> int this_cpu = smp_processor_id();
>
> - sched_info_queued(p);
> -
> update_curr(cfs_rq);
> - if (curr)
> +
> + if (prep && curr) {
> se->vruntime = curr->vruntime;
> + return;
> + }
> +
> place_entity(cfs_rq, se, 1);
>
> /* 'curr' will be NULL if the child belongs to a different group */
> @@ -1953,6 +1956,8 @@ static void task_new_fair(struct rq *rq,
> }
>
> enqueue_task_fair(rq, p, 0);
> +
> + sched_info_queued(p);
> }
I don't think we need to call this twice, but see below.
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -2558,7 +2558,6 @@ static void __sched_fork(struct task_str
> void sched_fork(struct task_struct *p, int clone_flags)
> {
> int cpu = get_cpu();
> - unsigned long flags;
>
> __sched_fork(p);
>
> @@ -2566,7 +2565,7 @@ void sched_fork(struct task_struct *p, i
> * Revert to default priority/policy on fork if requested.
> */
> if (unlikely(p->sched_reset_on_fork)) {
> - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
> + if (task_has_rt_policy(p)) {
> p->policy = SCHED_NORMAL;
> p->normal_prio = p->static_prio;
> }
While a nice change, it shouldn't have been mixed in, I think.
> @@ -2589,16 +2588,10 @@ void sched_fork(struct task_struct *p, i
> */
> p->prio = current->normal_prio;
>
> - if (!rt_prio(p->prio))
> + if (!task_has_rt_policy(p))
> p->sched_class = &fair_sched_class;
And I suspect this one is actually buggy: see how rt_mutex_setprio()
only changes ->prio and ->sched_class, but leaves ->policy at its
original value?
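Toy illustration (userspace, made-up struct and values, not the real
kernel types) of how the two predicates diverge once rt_mutex_setprio()
has boosted a SCHED_NORMAL task into the RT prio range:

#include <stdio.h>

#define MAX_RT_PRIO	100
#define SCHED_NORMAL	0
#define SCHED_FIFO	1

struct toy_task {
	int prio;	/* effective prio, PI boosting lands here */
	int policy;	/* rt_mutex_setprio() leaves this alone */
};

static int rt_prio(int prio)
{
	return prio < MAX_RT_PRIO;
}

static int task_has_rt_policy(struct toy_task *p)
{
	return p->policy == SCHED_FIFO;	/* or SCHED_RR */
}

int main(void)
{
	/* SCHED_NORMAL task PI-boosted into the RT range */
	struct toy_task p = { .prio = 50, .policy = SCHED_NORMAL };

	printf("rt_prio()=%d task_has_rt_policy()=%d\n",
	       rt_prio(p.prio), task_has_rt_policy(&p));
	return 0;	/* prints 1 vs 0: the two checks disagree */
}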
> -#ifdef CONFIG_SMP
> - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> -#endif
> - local_irq_save(flags);
> - update_rq_clock(cpu_rq(cpu));
> - set_task_cpu(p, cpu);
> - local_irq_restore(flags);
> + __set_task_cpu(p, cpu);
>
> #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
> if (likely(sched_info_on()))
Removing cpu selection from sched_fork() seems the sane thing to do.
> @@ -2625,21 +2618,40 @@ void sched_fork(struct task_struct *p, i
> */
> void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
> {
> + int cpu = get_cpu();
> unsigned long flags;
> - struct rq *rq;
> + struct task_struct *parent = current;
> + struct rq *rq, *orig_rq;
>
> - rq = task_rq_lock(p, &flags);
> + smp_wmb();
> + rq = orig_rq = task_rq_lock(parent, &flags);
> BUG_ON(p->state != TASK_RUNNING);
> - update_rq_clock(rq);
> + update_rq_clock(orig_rq);
>
> - if (!p->sched_class->task_new || !current->se.on_rq) {
> + if (p->sched_class->task_new)
> + p->sched_class->task_new(orig_rq, p, 1);
> +#ifdef CONFIG_SMP
> + p->state = TASK_WAKING;
> + __task_rq_unlock(orig_rq);
> + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
> + rq = cpu_rq(cpu);
> + if (rq != orig_rq) {
> + update_rq_clock(rq);
> + set_task_cpu(p, cpu);
> + }
> + __task_rq_lock(p);
[ should've been: rq = __task_rq_lock(p), because as we didn't hold the
  rq->lock, the task could have actually been migrated again after
  set_task_cpu() ]
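Roughly this interleaving (hypothetical, just to make the window
explicit):

	CPU0: wake_up_new_task()		elsewhere
	------------------------		---------
	__task_rq_unlock(orig_rq);
	set_task_cpu(p, cpu);
						p gets migrated again,
						task_cpu(p) changes
	__task_rq_lock(p);	/* locks the rq p is on _now_ */

so the rq obtained from cpu_rq(cpu) and the rq __task_rq_lock() actually
takes need not be the same, and only the latter's return value is safe to
use.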
> + WARN_ON(p->state != TASK_WAKING);
> + p->state = TASK_RUNNING;
> +#endif
> +
> + if (!p->sched_class->task_new || !parent->se.on_rq) {
> activate_task(rq, p, 0);
> } else {
> /*
> * Let the scheduling class do new task startup
> * management (if any):
> */
> - p->sched_class->task_new(rq, p);
> + p->sched_class->task_new(rq, p, 0);
> inc_nr_running(rq);
> }
> trace_sched_wakeup_new(rq, p, 1);
> @@ -2649,6 +2661,7 @@ void wake_up_new_task(struct task_struct
> p->sched_class->task_wake_up(rq, p);
> #endif
> task_rq_unlock(rq, &flags);
> + put_cpu();
> }
OK, so the general idea seems to be to call task_new(.prep=1) to update
and copy vruntime from the parent to the child _before_ we muck about
and move the child over to another cpu.
Then we muck about and move the thing to another cpu.
Then we call it again with .prep=0 to actually enqueue the thing.
So, the whole point of ->task_new() was to be able to poke at ->vruntime
before the regular enqueue; I think we folded the enqueue in to avoid two
class calls. But if you're going to do two calls anyway, we might as well
use ->task_new() and ->enqueue_task(), aka activate_task().

This leaves the problem that task_new() behaviour depends on knowing the
target cpu. Could we solve that by relying on the fact that we're
executing on the original cpu? Something like:
void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
	unsigned long flags;
	struct rq *rq, *orig_rq;
	int cpu = get_cpu();

	rq = orig_rq = task_rq_lock(p, &flags);
	BUG_ON(p->state != TASK_RUNNING);
	update_rq_clock(rq);

#ifdef CONFIG_SMP
	p->state = TASK_WAKING;
	__task_rq_unlock(rq);

	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
	rq = cpu_rq(cpu);
	if (rq != orig_rq) {
		update_rq_clock(rq);
		set_task_cpu(p, cpu);
	}

	rq = __task_rq_lock(p);
	WARN_ON(p->state != TASK_WAKING);
	p->state = TASK_RUNNING;
#endif

	if (p->sched_class->task_new) {
		/* can detect migration through: task_cpu(p) != smp_processor_id() */
		p->sched_class->task_new(rq, p);
	}

	activate_task(rq, p, 0);
	trace_sched_wakeup_new(rq, p, 1);
	check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);
#endif
	task_rq_unlock(rq, &flags);
	put_cpu();
}
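Completely untested, but the fair class side could then shrink to
something like the below; the task_cpu(p) != smp_processor_id() test from
the comment above is used to skip the parent-relative bits when the child
was moved to another cpu (sketch only, the exact placement of that test
is an assumption on my part):

static void task_new_fair(struct rq *rq, struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);
	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
	int migrated = task_cpu(p) != smp_processor_id();

	update_curr(cfs_rq);

	/* only inherit vruntime when curr actually is the parent */
	if (curr && !migrated)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (!migrated && sysctl_sched_child_runs_first && curr &&
	    entity_before(curr, se)) {
		/* child-runs-first: make sure the child is leftmost */
		swap(curr->vruntime, se->vruntime);
		resched_task(rq->curr);
	}

	/* no enqueue here, activate_task() in the caller does that */
}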
> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c
> +++ linux-2.6/kernel/fork.c
> @@ -1125,9 +1125,6 @@ static struct task_struct *copy_process(
>
> p->stack_start = stack_start;
>
> - /* Perform scheduler related setup. Assign this task to a CPU. */
> - sched_fork(p, clone_flags);
> -
> retval = perf_event_init_task(p);
> if (retval)
> goto bad_fork_cleanup_policy;
> @@ -1229,6 +1226,9 @@ static struct task_struct *copy_process(
> /* Need tasklist lock for parent etc handling! */
> write_lock_irq(&tasklist_lock);
>
> + /* Perform scheduler related setup. Assign this task to a CPU. */
> + sched_fork(p, clone_flags);
> +
You just invalidated that comment ;-)