Detect 'big' processes for which the one home-node per process isn't going to work as desired. The current policy for such tasks is to ignore them entirely and put the home-node back to -1 (no preference) so they'll behave as if none of this NUMA nonsense is there. The current heuristic for determining if a task is 'big' is if it's consuming more than 1/2 a node's worth of cputime. We might want to add a term here looking at the RSS of the process and comparing this against the available memory per node. Cc: Rik van Riel Cc: Paul Turner Signed-off-by: Peter Zijlstra --- include/linux/mm_types.h | 1 include/linux/sched.h | 2 + kernel/sched/core.c | 6 ++++- kernel/sched/fair.c | 49 +++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 3 deletions(-) --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -389,6 +389,7 @@ struct mm_struct { struct cpumask cpumask_allocation; #endif #ifdef CONFIG_NUMA + unsigned int numa_big; unsigned long numa_next_scan; #endif struct uprobes_state uprobes_state; --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1525,6 +1525,8 @@ struct task_struct { int node_last; /* home node filter */ #ifdef CONFIG_SMP u64 node_stamp; /* migration stamp */ + u64 numa_runtime_stamp; + u64 numa_walltime_stamp; unsigned long numa_contrib; #endif /* CONFIG_SMP */ #endif /* CONFIG_NUMA */ --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1724,13 +1724,17 @@ static void __sched_fork(struct task_str #endif #ifdef CONFIG_NUMA - if (p->mm && atomic_read(&p->mm->mm_users) == 1) + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_big = 0; p->mm->numa_next_scan = jiffies; + } p->node = -1; p->node_last = -1; #ifdef CONFIG_SMP p->node_stamp = 0ULL; + p->numa_runtime_stamp = 0; + p->numa_walltime_stamp = local_clock(); #endif /* CONFIG_SMP */ #endif /* CONFIG_NUMA */ } --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -803,11 +803,47 @@ static void account_offnode_dequeue(stru } /* - * numa 
task sample period in ms + * numa task sample period in ms: 2.5s */ unsigned int sysctl_sched_numa_task_period = 2500; /* + * Determine if a process is 'big'. + */ +static bool task_numa_big(struct task_struct *p) +{ + struct sched_domain *sd; + struct task_struct *t; + u64 walltime = local_clock(); + u64 runtime = 0; + int weight = 0; + + rcu_read_lock(); + t = p; + do { + if (t->sched_class == &fair_sched_class) + runtime += t->se.sum_exec_runtime; + } while ((t = next_thread(t)) != p); + + sd = rcu_dereference(__get_cpu_var(sd_node)); + if (sd) + weight = sd->span_weight; + rcu_read_unlock(); + + runtime -= p->numa_runtime_stamp; + walltime -= p->numa_walltime_stamp; + + p->numa_runtime_stamp += runtime; + p->numa_walltime_stamp += walltime; + + /* + * We're 'big' when we burn more than half a node's worth + * of cputime. + */ + return runtime > walltime * max(1, weight / 2); +} + +/* * The expensive part of numa migration is done from task_work context. */ void task_numa_work(struct callback_head *work) @@ -815,6 +851,7 @@ void task_numa_work(struct callback_head unsigned long migrate, next_scan, now = jiffies; struct task_struct *t, *p = current; int node = p->node_last; + int big; WARN_ON_ONCE(p != container_of(work, struct task_struct, rcu)); @@ -835,6 +872,13 @@ void task_numa_work(struct callback_head if (cmpxchg(&p->mm->numa_next_scan, migrate, next_scan) != migrate) return; + /* + * If this task is too big, we bail on NUMA placement of the process. + */ + big = p->mm->numa_big = task_numa_big(p); + if (big) + node = -1; + rcu_read_lock(); t = p; do { @@ -858,8 +902,9 @@ void task_tick_numa(struct rq *rq, struc /* * We don't care about NUMA placement if we don't have memory. + * We also bail on placement if we're too big. 
*/ - if (!curr->mm) + if (!curr->mm || curr->mm->numa_big) return; /* -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/