Message-ID: <20120809215101.GK10459@redhat.com>
Date: Thu, 9 Aug 2012 23:51:01 +0200
From: Andrea Arcangeli <aarcange@...hat.com>
To: Peter Zijlstra <a.p.zijlstra@...llo.nl>
Cc: mingo@...nel.org, riel@...hat.com, oleg@...hat.com, pjt@...gle.com,
akpm@...ux-foundation.org, torvalds@...ux-foundation.org,
tglx@...utronix.de, Lee.Schermerhorn@...com,
linux-kernel@...r.kernel.org, Christoph Lameter <cl@...ux.com>
Subject: Re: [PATCH 15/19] sched: Implement home-node awareness
On Tue, Jul 31, 2012 at 09:12:19PM +0200, Peter Zijlstra wrote:
> @@ -2699,6 +2705,29 @@ select_task_rq_fair(struct task_struct *
> }
>
> rcu_read_lock();
> + if (sched_feat_numa(NUMA_BIAS) && node != -1) {
> + int node_cpu;
> +
> + node_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpumask_of_node(node));
> + if (node_cpu >= nr_cpu_ids)
> + goto find_sd;
> +
> + /*
> + * For fork,exec find the idlest cpu in the home-node.
> + */
> + if (sd_flag & (SD_BALANCE_FORK|SD_BALANCE_EXEC)) {
> + new_cpu = cpu = node_cpu;
> + sd = per_cpu(sd_node, cpu);
> + goto pick_idlest;
> + }
> +
> + /*
> + * For wake, pretend we were running in the home-node.
> + */
> + prev_cpu = node_cpu;
> + }
> +
> +find_sd:
> for_each_domain(cpu, tmp) {
> if (!(tmp->flags & SD_LOAD_BALANCE))
> continue;
This won't work right. I allow fork/clone to go anywhere if there are
idle cpus elsewhere; I won't limit it to the idlest cpu in the home node.
tsk->task_selected_nid with AutoNUMA is a hint, never an
"enforcement", no matter whether it's a wakeup, fork or execve. Idle
cpus always get priority over NUMA affinity. Not doing so will regress
performance instead of improving it.
The only way to get good performance is what AutoNUMA does:
1) once in a while verify whether tsk has a better node to go to
2) if yes, move it to the better node and set task_selected_nid
3) in CFS, during regular load balancing and idle balancing, use
   task_selected_nid as a _not_strict_ hint until 1) runs again (see
   the sketch below)
> @@ -3092,6 +3124,23 @@ static void move_task(struct task_struct
> check_preempt_curr(env->dst_rq, p, 0);
> }
>
> +static int task_numa_hot(struct task_struct *p, int from_cpu, int to_cpu)
> +{
> + int from_dist, to_dist;
> + int node = tsk_home_node(p);
> +
> + if (!sched_feat_numa(NUMA_HOT) || node == -1)
> + return 0; /* no node preference */
> +
> + from_dist = node_distance(cpu_to_node(from_cpu), node);
> + to_dist = node_distance(cpu_to_node(to_cpu), node);
> +
> + if (to_dist < from_dist)
> + return 0; /* getting closer is ok */
Getting closer is not ok if 30% of the ram is in the current node, 70%
is in the "home node" and "to_dist" has 0% of the ram. In short you're
taking into account something almost irrelevant (distance) and
ignoring the really important data needed to decide whether you're
improving the overall NUMA convergence or not.

The objective of any CPU migration across nodes has to be only a
global overall improvement of NUMA convergence, not moving the local
30% closer to the remote 70%: that's a regression in convergence if
you end up with 15% here, 15% there and 70% over there.

You can take distance into account, but only after you know the
mm->mm_autonuma statistical data.
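A sketch of the kind of check I mean (mm_numa_pages[] is an invented
stand-in for the per-node mm_autonuma statistics, not the real field
name): the decision has to be based on where the memory of the process
actually is, distance can only be a tie-breaker after that.

static bool numa_migration_improves_convergence(struct task_struct *p,
						int from_cpu, int to_cpu)
{
	struct mm_struct *mm = p->mm;
	int from_nid = cpu_to_node(from_cpu);
	int to_nid = cpu_to_node(to_cpu);
	unsigned long from_pages, to_pages;

	if (!mm)
		return false;

	from_pages = mm->mm_numa_pages[from_nid];	/* invented field */
	to_pages = mm->mm_numa_pages[to_nid];

	/*
	 * Moving away from a node with 30% of the ram to a node with
	 * 0% of the ram is a convergence regression even if
	 * node_distance() to the home node shrinks.
	 */
	if (to_pages > from_pages)
		return true;
	if (to_pages == from_pages)
		return node_distance(to_nid, tsk_home_node(p)) <
		       node_distance(from_nid, tsk_home_node(p));
	return false;
}
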
> @@ -3177,6 +3226,7 @@ int can_migrate_task(struct task_struct
> */
>
> tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
> + tsk_cache_hot |= task_numa_hot(p, env->src_cpu, env->dst_cpu);
> if (!tsk_cache_hot ||
> env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
> #ifdef CONFIG_SCHEDSTATS
> @@ -3202,11 +3252,11 @@ int can_migrate_task(struct task_struct
> *
> * Called with both runqueues locked.
> */
> -static int move_one_task(struct lb_env *env)
> +static int __move_one_task(struct lb_env *env)
> {
> struct task_struct *p, *n;
>
> - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
> + list_for_each_entry_safe(p, n, env->tasks, se.group_node) {
> if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
> continue;
>
> @@ -3225,6 +3275,21 @@ static int move_one_task(struct lb_env *
> return 0;
> }
>
> +static int move_one_task(struct lb_env *env)
> +{
> + if (sched_feat_numa(NUMA_PULL)) {
> + env->tasks = offnode_tasks(env->src_rq);
> + if (__move_one_task(env))
> + return 1;
> + }
> +
> + env->tasks = &env->src_rq->cfs_tasks;
> + if (__move_one_task(env))
> + return 1;
> +
> + return 0;
> +}
> +
> static unsigned long task_h_load(struct task_struct *p);
>
> static const unsigned int sched_nr_migrate_break = 32;
> @@ -3238,7 +3303,6 @@ static const unsigned int sched_nr_migra
> */
> static int move_tasks(struct lb_env *env)
> {
> - struct list_head *tasks = &env->src_rq->cfs_tasks;
> struct task_struct *p;
> unsigned long load;
> int pulled = 0;
> @@ -3246,8 +3310,9 @@ static int move_tasks(struct lb_env *env
> if (env->imbalance <= 0)
> return 0;
>
> - while (!list_empty(tasks)) {
> - p = list_first_entry(tasks, struct task_struct, se.group_node);
> +again:
> + while (!list_empty(env->tasks)) {
> + p = list_first_entry(env->tasks, struct task_struct, se.group_node);
>
> env->loop++;
> /* We've more or less seen every task there is, call it quits */
> @@ -3258,7 +3323,7 @@ static int move_tasks(struct lb_env *env
> if (env->loop > env->loop_break) {
> env->loop_break += sched_nr_migrate_break;
> env->flags |= LBF_NEED_BREAK;
> - break;
> + goto out;
> }
>
> if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
> @@ -3286,7 +3351,7 @@ static int move_tasks(struct lb_env *env
> * the critical section.
> */
> if (env->idle == CPU_NEWLY_IDLE)
> - break;
> + goto out;
> #endif
>
> /*
> @@ -3294,13 +3359,20 @@ static int move_tasks(struct lb_env *env
> * weighted load.
> */
> if (env->imbalance <= 0)
> - break;
> + goto out;
>
> continue;
> next:
> - list_move_tail(&p->se.group_node, tasks);
> + list_move_tail(&p->se.group_node, env->tasks);
> }
>
> + if (env->tasks == offnode_tasks(env->src_rq)) {
> + env->tasks = &env->src_rq->cfs_tasks;
> + env->loop = 0;
> + goto again;
> + }
> +
> +out:
> /*
> * Right now, this is one of only two places move_task() is called,
> * so we can safely collect move_task() stats here rather than
> @@ -3447,6 +3519,11 @@ struct sd_lb_stats {
> unsigned int busiest_group_weight;
>
> int group_imb; /* Is there imbalance in this sd */
> +#ifdef CONFIG_NUMA
> + struct sched_group *numa_group; /* group which has offnode_tasks */
> + unsigned long numa_group_weight;
> + unsigned long numa_group_running;
> +#endif
> };
>
> /*
> @@ -3462,6 +3539,10 @@ struct sg_lb_stats {
> unsigned long group_weight;
> int group_imb; /* Is there an imbalance in the group ? */
> int group_has_capacity; /* Is there extra capacity in the group? */
> +#ifdef CONFIG_NUMA
> + unsigned long numa_weight;
> + unsigned long numa_running;
> +#endif
> };
>
> /**
> @@ -3490,6 +3571,117 @@ static inline int get_sd_load_idx(struct
> return load_idx;
> }
>
> +#ifdef CONFIG_NUMA
> +static inline void update_sg_numa_stats(struct sg_lb_stats *sgs, struct rq *rq)
> +{
> + sgs->numa_weight += rq->offnode_weight;
> + sgs->numa_running += rq->offnode_running;
> +}
> +
offnode_weight is never increased in this patch; maybe the increase of
numa_weight could be moved to the later patches that increase
offnode_weight.
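Just to show what I mean, something along the lines of the below (a
hand-written sketch, the real accounting presumably comes in the later
patches) has to run at enqueue time before rq->offnode_weight means
anything here:

static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
{
	int home_nid = tsk_home_node(p);

	/* count tasks running away from their home node */
	if (home_nid != -1 && home_nid != cpu_to_node(cpu_of(rq))) {
		rq->offnode_running++;
		rq->offnode_weight += p->se.load.weight;
	}
}
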
> +/*
> + * Since the offnode lists are indiscriminate (they contain tasks for all other
> + * nodes) it is impossible to say if there's any task on there that wants to
> + * move towards the pulling cpu. Therefore select a random offnode list to pull
> + * from such that eventually we'll try them all.
> + */
> +static inline bool pick_numa_rand(void)
> +{
> + return get_random_int() & 1;
> +}
This is what you have to do to pick tasks. I think having to resort to
the random number generator to produce your algorithm's input shows
how bad this really is.
In comparison AutoNUMA never, not a single time _anywhere_, does
anything as a function of randomness. Try to find a single place in
AutoNUMA where I take a random decision.

AutoNUMA only takes decisions it is sure will improve overall NUMA
convergence.
Because AutoNUMA does things incrementally and is not fully
synchronous (it leaves CFS alone with the hint for some period of
time), it will behave slightly differently too, but when it kicks in,
it is fully deterministic.
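To show the difference, a deterministic way to scan the offnode list
(find_offnode_task() is an invented helper, assuming the offnode list
layout of this series) would be to pull a task whose home node is the
node of the pulling cpu, instead of flipping a coin on which list to
scan:

static struct task_struct *find_offnode_task(struct rq *src_rq, int dst_nid)
{
	struct task_struct *p;

	list_for_each_entry(p, offnode_tasks(src_rq), se.group_node) {
		/* moving p to dst_nid strictly improves convergence */
		if (tsk_home_node(p) == dst_nid)
			return p;
	}
	return NULL;
}

That way a migration only happens when it brings a task closer to its
home node, no randomness needed.
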
The only way to create a converging algorithm without
mm_autonuma/task_autonuma is to enforce an overcommit_memory=2-like
model where you account for everything perfectly, decide yourself
where the memory goes and can enforce it (no matter mlocks, tmpfs
memory, slab, or whatever). That kind of truly strict model will never
happen, so you could try to approximate it, maybe by making wild
(sometimes false) assumptions like pagecache always being freeable.
But that's not what you're doing here. You're trying to solve the
problem in an incremental way like AutoNUMA, i.e. without
recalculating the whole layout of the whole system at every page fault
that allocates 4k and could require rebalancing and moving everything
by spilling something over to a new node. The problem is that you lack
enough information to solve it with the incremental algorithm, so it
will never work.