/*
 * NUMA scheduler
 *
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 *
 * With input and fixes from:
 *
 *  Ingo Molnar
 *  Bharata B Rao
 *  Dan Smith
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/mempolicy.h>
#include <linux/kthread.h>

#include "sched.h"

static const int numa_balance_interval = 2 * HZ; /* 2 seconds */

struct numa_cpu_load {
	unsigned long remote; /* load of tasks running away from their home node */
	unsigned long all;    /* load of tasks that should be running on this node */
};

static struct numa_cpu_load **numa_load_array;

static struct {
	spinlock_t lock;
	unsigned long load;
} max_mem_load = {
	.lock = __SPIN_LOCK_UNLOCKED(max_mem_load.lock),
	.load = 0,
};

/*
 * Assumes symmetric NUMA -- that is, each node is of equal size.
 */
static void set_max_mem_load(unsigned long load)
{
	unsigned long old_load;

	spin_lock(&max_mem_load.lock);
	old_load = max_mem_load.load;
	if (!old_load)
		old_load = load;
	max_mem_load.load = (old_load + load) >> 1;
	spin_unlock(&max_mem_load.lock);
}

static unsigned long get_max_mem_load(void)
{
	return max_mem_load.load;
}

struct node_queue {
	struct task_struct *numad;

	unsigned long remote_cpu_load;
	unsigned long cpu_load;

	unsigned long prev_numa_foreign;
	unsigned long remote_mem_load;

	spinlock_t lock;
	struct list_head entity_list;
	int nr_processes;

	unsigned long next_schedule;
	int node;
};

static struct node_queue **nqs;

static inline struct node_queue *nq_of(int node)
{
	return nqs[node];
}

static inline struct node_queue *this_nq(void)
{
	return nq_of(numa_node_id());
}

bool account_numa_enqueue(struct task_struct *p)
{
	int home_node = tsk_home_node(p);
	int cpu = task_cpu(p);
	int node = cpu_to_node(cpu);
	struct rq *rq = cpu_rq(cpu);
	struct numa_cpu_load *nl;
	unsigned long load;

	/*
	 * not actually an auto-numa task, ignore
	 */
	if (home_node == -1)
		return false;

	load = task_h_load(p);
	nl = this_cpu_ptr(numa_load_array[home_node]);
	p->numa_remote = (node != home_node);
	p->numa_contrib = load;
	nl->all += load;
	if (p->numa_remote)
		nl->remote += load;

	/*
	 * the task is on its home-node, we're done, the rest is offnode
	 * accounting.
	 */
	if (!p->numa_remote)
		return false;

	list_add_tail(&p->se.group_node, &rq->offnode_tasks);
	rq->offnode_running++;
	rq->offnode_weight += load;

	return true;
}

void account_numa_dequeue(struct task_struct *p)
{
	int home_node = tsk_home_node(p);
	struct numa_cpu_load *nl;
	struct rq *rq;

	/*
	 * not actually an auto-numa task, ignore
	 */
	if (home_node == -1)
		return;

	nl = this_cpu_ptr(numa_load_array[home_node]);
	nl->all -= p->numa_contrib;
	if (p->numa_remote)
		nl->remote -= p->numa_contrib;

	/*
	 * the task is on its home-node, we're done, the rest is offnode
	 * accounting.
	 */
	if (!p->numa_remote)
		return;

	rq = task_rq(p);
	rq->offnode_running--;
	rq->offnode_weight -= p->numa_contrib;
}
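
/*
 * The numa_entity is embedded in the mm_struct (mm->numa), so the helpers
 * below operate on whole processes: process_cpu_migrate() walks the thread
 * group and moves every thread to the new home-node, while
 * process_mem_migrate() hands the mm to lazy_migrate_process() so its
 * pages can follow lazily.
 */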
static inline struct mm_struct *ne_mm(struct numa_entity *ne)
{
	return container_of(ne, struct mm_struct, numa);
}

static inline struct task_struct *ne_owner(struct numa_entity *ne)
{
	return rcu_dereference(ne_mm(ne)->owner);
}

static void process_cpu_migrate(struct numa_entity *ne, int node)
{
	struct task_struct *p, *t;

	rcu_read_lock();
	t = p = ne_owner(ne);
	if (p) do {
		sched_setnode(t, node);
	} while ((t = next_thread(t)) != p);
	rcu_read_unlock();
}

static void process_mem_migrate(struct numa_entity *ne, int node)
{
	lazy_migrate_process(ne_mm(ne), node);
}

static int process_tryget(struct numa_entity *ne)
{
	/*
	 * This is possible when we hold &nq_of(ne->node)->lock since then
	 * numa_exit() will block on that lock, we can't however write an
	 * assertion to check this, since if we don't hold the lock that
	 * expression isn't safe to evaluate.
	 */
	return atomic_inc_not_zero(&ne_mm(ne)->mm_users);
}

static void process_put(struct numa_entity *ne)
{
	mmput(ne_mm(ne));
}

static struct node_queue *lock_ne_nq(struct numa_entity *ne)
{
	struct node_queue *nq;
	int node;

	for (;;) {
		node = ACCESS_ONCE(ne->node);
		BUG_ON(node == -1);
		nq = nq_of(node);
		spin_lock(&nq->lock);
		if (likely(ne->node == node))
			break;
		spin_unlock(&nq->lock);
	}

	return nq;
}

static void double_lock_nq(struct node_queue *nq1, struct node_queue *nq2)
{
	if (nq1 > nq2)
		swap(nq1, nq2);

	spin_lock(&nq1->lock);
	if (nq2 != nq1)
		spin_lock_nested(&nq2->lock, SINGLE_DEPTH_NESTING);
}

static void double_unlock_nq(struct node_queue *nq1, struct node_queue *nq2)
{
	if (nq1 > nq2)
		swap(nq1, nq2);

	if (nq2 != nq1)
		spin_unlock(&nq2->lock);
	spin_unlock(&nq1->lock);
}

static void __enqueue_ne(struct node_queue *nq, struct numa_entity *ne)
{
	ne->node = nq->node;
	list_add_tail(&ne->numa_entry, &nq->entity_list);
	nq->nr_processes++;
}

static void __dequeue_ne(struct node_queue *nq, struct numa_entity *ne)
{
	list_del(&ne->numa_entry);
	nq->nr_processes--;
	BUG_ON(nq->nr_processes < 0);
}

static void enqueue_ne(struct numa_entity *ne, int node)
{
	struct node_queue *nq = nq_of(node);

	BUG_ON(ne->node != -1);

	process_cpu_migrate(ne, node);
	process_mem_migrate(ne, node);

	spin_lock(&nq->lock);
	__enqueue_ne(nq, ne);
	spin_unlock(&nq->lock);
}

static void dequeue_ne(struct numa_entity *ne)
{
	struct node_queue *nq;

	if (ne->node == -1) // XXX serialization
		return;

	nq = lock_ne_nq(ne);
	ne->node = -1;
	__dequeue_ne(nq, ne);
	spin_unlock(&nq->lock);
}

static void init_ne(struct numa_entity *ne)
{
	ne->node = -1;
}

void mm_init_numa(struct mm_struct *mm)
{
	init_ne(&mm->numa);
}

void exit_numa(struct mm_struct *mm)
{
	dequeue_ne(&mm->numa);
}

static inline unsigned long node_pages_load(int node)
{
	unsigned long pages = 0;

	pages += node_page_state(node, NR_ANON_PAGES);
	pages += node_page_state(node, NR_ACTIVE_FILE);

	return pages;
}
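
/*
 * Home-node selection: find_idlest_node() picks the node with the least
 * cpu load among nodes that have no foreign memory load, with ~10%
 * hysteresis in favour of the current node; e.g. with this_cpu_load = 105
 * and min_cpu_load = 100 we have 105 * 100 < 100 * 110, so we stay put
 * instead of scattering the workload over nodes.
 */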
static int find_idlest_node(int this_node)
{
	unsigned long mem_load, cpu_load;
	unsigned long min_cpu_load;
	unsigned long this_cpu_load;
	int min_node;
	int node, cpu;

	min_node = -1;
	this_cpu_load = min_cpu_load = ULONG_MAX;

	// XXX should be sched_domain aware
	for_each_online_node(node) {
		struct node_queue *nq = nq_of(node);

		/*
		 * Pick the node that has least cpu load provided there's no
		 * foreign memory load.
		 *
		 * XXX if all nodes were to have foreign allocations we'd OOM,
		 * however check the low-pass filter in update_node_load().
		 */
		mem_load = nq->remote_mem_load;
		if (mem_load)
			continue;

		cpu_load = 0;
		for_each_cpu_mask(cpu, *cpumask_of_node(node))
			cpu_load += cpu_rq(cpu)->load.weight;
		cpu_load += nq->remote_cpu_load;

		if (this_node == node)
			this_cpu_load = cpu_load;

		if (cpu_load < min_cpu_load) {
			min_cpu_load = cpu_load;
			min_node = node;
		}
	}

	/*
	 * If there's no choice, stick to where we are.
	 */
	if (min_node == -1)
		return this_node;

	/*
	 * Add a little hysteresis so we don't hard-interleave over nodes
	 * scattering workloads.
	 */
	if (this_cpu_load != ULONG_MAX && this_node != min_node) {
		if (this_cpu_load * 100 < min_cpu_load * 110)
			return this_node;
	}

	return min_node;
}

void select_task_node(struct task_struct *p, struct mm_struct *mm, int sd_flags)
{
	int node;

	if (!sched_feat(NUMA_SELECT)) {
		p->node = -1;
		return;
	}

	if (!mm)
		return;

	/*
	 * If there's an explicit task policy set, bail.
	 */
	if (p->flags & PF_MEMPOLICY) {
		p->node = -1;
		return;
	}

	if (sd_flags & SD_BALANCE_FORK) {
		/* For new threads, set the home-node. */
		if (mm == current->mm) {
			p->node = mm->numa.node;
			return;
		}
	}

	node = find_idlest_node(p->node);
	if (node == -1)
		node = numa_node_id();

	enqueue_ne(&mm->numa, node);
}

__init void init_sched_numa(void)
{
	int node;

	numa_load_array = kzalloc(sizeof(struct numa_cpu_load *) * nr_node_ids, GFP_KERNEL);
	BUG_ON(!numa_load_array);

	for_each_node(node) {
		numa_load_array[node] = alloc_percpu(struct numa_cpu_load);
		BUG_ON(!numa_load_array[node]);
	}
}

static void add_load(unsigned long *load, unsigned long new_load)
{
	if (sched_feat(NUMA_SLOW)) {
		*load = (*load + new_load) >> 1;
		return;
	}

	*load = new_load;
}

/*
 * Called every @numa_balance_interval to update current node state.
 */
static void update_node_load(struct node_queue *nq)
{
	unsigned long pages, delta;
	struct numa_cpu_load l;
	int cpu;

	memset(&l, 0, sizeof(l));

	/*
	 * Aggregate per-cpu cpu-load values for this node as per
	 * account_numa_{en,de}queue().
	 *
	 * XXX limit to max balance sched_domain
	 */
	for_each_online_cpu(cpu) {
		struct numa_cpu_load *nl = per_cpu_ptr(numa_load_array[nq->node], cpu);

		l.remote += nl->remote;
		l.all += nl->all;
	}

	add_load(&nq->remote_cpu_load, l.remote);
	add_load(&nq->cpu_load, l.all);

	/*
	 * Fold regular samples of NUMA_FOREIGN into a memory load measure.
	 */
	pages = node_page_state(nq->node, NUMA_FOREIGN);
	delta = pages - nq->prev_numa_foreign;
	nq->prev_numa_foreign = pages;
	add_load(&nq->remote_mem_load, delta);

	/*
	 * If there was NUMA_FOREIGN load, that means this node was at its
	 * maximum memory capacity, record that.
	 */
	set_max_mem_load(node_pages_load(nq->node));
}
enum numa_balance_type {
	NUMA_BALANCE_NONE = 0,
	NUMA_BALANCE_CPU = 1,
	NUMA_BALANCE_MEM = 2,
	NUMA_BALANCE_ALL = 3,
};

struct numa_imbalance {
	long cpu, mem;
	long mem_load;
	enum numa_balance_type type;
};

static unsigned long process_cpu_load(struct numa_entity *ne)
{
	unsigned long load = 0;
	struct task_struct *t, *p;

	rcu_read_lock();
	t = p = ne_owner(ne);
	if (p) do {
		load += t->numa_contrib;
	} while ((t = next_thread(t)) != p);
	rcu_read_unlock();

	return load;
}

static unsigned long process_mem_load(struct numa_entity *ne)
{
	return get_mm_counter(ne_mm(ne), MM_ANONPAGES);
}

static int find_busiest_node(int this_node, struct numa_imbalance *imb)
{
	unsigned long cpu_load, mem_load;
	unsigned long max_cpu_load, max_mem_load;
	unsigned long sum_cpu_load, sum_mem_load;
	unsigned long mem_cpu_load, cpu_mem_load;
	int cpu_node, mem_node;
	struct node_queue *nq;
	int node;

	sum_cpu_load = sum_mem_load = 0;
	max_cpu_load = max_mem_load = 0;
	mem_cpu_load = cpu_mem_load = 0;
	cpu_node = mem_node = -1;

	/* XXX scalability -- sched_domain */
	for_each_online_node(node) {
		nq = nq_of(node);

		cpu_load = nq->remote_cpu_load;
		mem_load = nq->remote_mem_load;

		/*
		 * If this node is overloaded on memory, we don't want more
		 * tasks, bail!
		 */
		if (node == this_node) {
			if (mem_load)
				return -1;
		}

		sum_cpu_load += cpu_load;
		if (cpu_load > max_cpu_load) {
			max_cpu_load = cpu_load;
			cpu_mem_load = mem_load;
			cpu_node = node;
		}

		sum_mem_load += mem_load;
		if (mem_load > max_mem_load) {
			max_mem_load = mem_load;
			mem_cpu_load = cpu_load;
			mem_node = node;
		}
	}

	/*
	 * Nobody had overload of any kind, cool we're done!
	 */
	if (cpu_node == -1 && mem_node == -1)
		return -1;

	if (mem_node == -1) {
set_cpu_node:
		node = cpu_node;
		cpu_load = max_cpu_load;
		mem_load = cpu_mem_load;
		goto calc_imb;
	}

	if (cpu_node == -1) {
set_mem_node:
		node = mem_node;
		cpu_load = mem_cpu_load;
		mem_load = max_mem_load;
		goto calc_imb;
	}

	/*
	 * We have both cpu and mem overload, oh my! pick whichever is most
	 * overloaded wrt the average.
	 */
	if ((u64)max_mem_load * sum_cpu_load > (u64)max_cpu_load * sum_mem_load)
		goto set_mem_node;
	goto set_cpu_node;

calc_imb:
	memset(imb, 0, sizeof(*imb));

	if (cpu_node != -1) {
		imb->type |= NUMA_BALANCE_CPU;
		imb->cpu = (long)(nq_of(node)->cpu_load -
				  nq_of(this_node)->cpu_load) / 2;
	}

	if (mem_node != -1) {
		imb->type |= NUMA_BALANCE_MEM;
		imb->mem_load = node_pages_load(this_node);
		imb->mem = (long)(node_pages_load(node) - imb->mem_load) / 2;
	}

	return node;
}

static bool can_move_ne(struct numa_entity *ne)
{
	/*
	 * XXX: consider mems_allowed, stinking cpusets have mems_allowed
	 * per task and it can actually differ over a whole process, la-la-la.
	 */
	return true;
}
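
/*
 * move_processes() pulls entities from the busiest node onto this node.
 * It walks the busiest node's entity list, skipping an entity when moving
 * it would create a larger imbalance the other way or push this node past
 * the recorded memory limit, and requeues the rest here. The node-queue
 * locks are dropped around the actual cpu/memory migration, with a
 * reference on the mm (process_tryget()) keeping it alive meanwhile.
 */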
static void move_processes(struct node_queue *busiest_nq,
			   struct node_queue *this_nq,
			   struct numa_imbalance *imb)
{
	unsigned long max_mem_load = get_max_mem_load();
	long cpu_moved = 0, mem_moved = 0;
	struct numa_entity *ne;
	long ne_mem, ne_cpu;
	int loops;

	double_lock_nq(this_nq, busiest_nq);
	loops = busiest_nq->nr_processes;
	while (!list_empty(&busiest_nq->entity_list) && loops--) {
		ne = list_first_entry(&busiest_nq->entity_list,
				      struct numa_entity, numa_entry);

		ne_cpu = process_cpu_load(ne);
		ne_mem = process_mem_load(ne);

		if (sched_feat(NUMA_BALANCE_FILTER)) {
			/*
			 * Avoid moving ne's when we create a larger imbalance
			 * on the other end.
			 */
			if ((imb->type & NUMA_BALANCE_CPU) &&
			    imb->cpu - cpu_moved < ne_cpu / 2)
				goto next;

			/*
			 * Avoid migrating ne's when we know we'll push our
			 * node over the memory limit.
			 */
			if (max_mem_load &&
			    imb->mem_load + mem_moved + ne_mem > max_mem_load)
				goto next;
		}

		if (!can_move_ne(ne))
			goto next;

		__dequeue_ne(busiest_nq, ne);
		__enqueue_ne(this_nq, ne);
		if (process_tryget(ne)) {
			double_unlock_nq(this_nq, busiest_nq);

			process_cpu_migrate(ne, this_nq->node);
			process_mem_migrate(ne, this_nq->node);

			process_put(ne);
			double_lock_nq(this_nq, busiest_nq);
		}

		cpu_moved += ne_cpu;
		mem_moved += ne_mem;

		if (imb->cpu - cpu_moved <= 0 &&
		    imb->mem - mem_moved <= 0)
			break;

		continue;

next:
		list_move_tail(&ne->numa_entry, &busiest_nq->entity_list);
	}
	double_unlock_nq(this_nq, busiest_nq);
}

static void numa_balance(struct node_queue *this_nq)
{
	struct numa_imbalance imb;
	int busiest;

	busiest = find_busiest_node(this_nq->node, &imb);
	if (busiest == -1)
		return;

	if (imb.cpu <= 0 && imb.mem <= 0)
		return;

	move_processes(nq_of(busiest), this_nq, &imb);
}

static int wait_for_next_balance(struct node_queue *nq)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		long timeout = nq->next_schedule - jiffies;

		if (timeout <= 0) {
			__set_current_state(TASK_RUNNING);
			return 1;
		}
		schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int numad_thread(void *data)
{
	struct node_queue *nq = data;
	struct task_struct *p = nq->numad;

	set_cpus_allowed_ptr(p, cpumask_of_node(nq->node));

	while (wait_for_next_balance(nq)) {
		get_online_cpus();
		update_node_load(nq);
		if (sched_feat(NUMA_BALANCE))
			numa_balance(nq);
		put_online_cpus();

		nq->next_schedule += numa_balance_interval;
	}

	return 0;
}

static __init int numa_init(void)
{
	int node;

	nqs = kzalloc(sizeof(struct node_queue*) * nr_node_ids, GFP_KERNEL);
	BUG_ON(!nqs);

	for_each_node(node) { // XXX hotplug
		struct node_queue *nq = kmalloc_node(sizeof(*nq),
				GFP_KERNEL | __GFP_ZERO, node);
		BUG_ON(!nq);

		nq->numad = kthread_create_on_node(numad_thread, nq, node,
				"numad/%d", node);
		BUG_ON(IS_ERR(nq->numad));

		spin_lock_init(&nq->lock);
		INIT_LIST_HEAD(&nq->entity_list);

		nq->next_schedule = jiffies + HZ;
		nq->node = node;
		nqs[node] = nq;

		wake_up_process(nq->numad);
	}

	return 0;
}
early_initcall(numa_init);