Implement a per-task memory placement scheme for 'big' tasks (as per the
last patch). It relies on a regular PROT_NONE 'migration' fault to scan
the memory space of the process and uses a two-stage migration scheme to
reduce the influence of unlikely usage relations.

It relies on the assumption that the compute part is tied to a particular
task and builds a task<->page relation set to model the compute<->data
relation. Probability says that the task faulting on a page after we
protect it is most likely to be the task that uses that page most.

To decrease the likelihood of acting on a false relation, we only migrate
a page when two consecutive samples are from the same task (see the sketch
below).

I'm still not entirely convinced this scheme is sound, especially for
things like virtualization and n:m threading solutions, where the
compute<->task relation is in general fundamentally untrue.

NOTES:

 - We don't actually sample the task, but the task's home-node as a
   migration target, so we're effectively building home-node<->page
   relations, not task<->page relations.

 - We migrate to the task's home-node, not the node the task is currently
   running on, since the home-node is the long-term target for the task
   to run on, irrespective of whatever node it might temporarily run on.
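For illustration, the two-sample filter reduces to something like the
following stand-alone sketch (plain C, not kernel code; struct mock_page,
should_migrate() and home_nid are made-up stand-ins, the real state is the
new page->nid_last field consulted by mpol_misplaced()):

  #include <stdbool.h>

  struct mock_page {
  	int nid;	/* node the page currently resides on */
  	int nid_last;	/* home-node sampled at the previous fault */
  };

  /*
   * Called on every 'migration' fault; returns true only when two
   * consecutive faults report the same home-node and the page does not
   * already live there.
   */
  static bool should_migrate(struct mock_page *page, int home_nid)
  {
  	if (page->nid_last != home_nid) {
  		/* first sample from this home-node: record it, don't act */
  		page->nid_last = home_nid;
  		return false;
  	}
  	/* second consecutive sample: migrate only if still misplaced */
  	return page->nid != home_nid;
  }

A single stray access from another node thus only resets the filter; it
takes two faults in a row reporting the same home-node before a page is
actually moved.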
Suggested-by: Rik van Riel
Cc: Paul Turner
Signed-off-by: Peter Zijlstra
---
 include/linux/mempolicy.h |    6 ++--
 include/linux/mm_types.h  |    6 ++++
 kernel/sched/fair.c       |   67 ++++++++++++++++++++++++++++++++++------------
 kernel/sched/features.h   |    1
 mm/huge_memory.c          |    3 +-
 mm/memory.c               |    2 -
 mm/mempolicy.c            |   39 +++++++++++++++++++++++++-
 7 files changed, 101 insertions(+), 23 deletions(-)

--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -69,6 +69,7 @@ enum mpol_rebind_step {
 #define MPOL_F_LOCAL	(1 << 1)	/* preferred local allocation */
 #define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
 #define MPOL_F_MOF	(1 << 3)	/* this policy wants migrate on fault */
+#define MPOL_F_HOME	(1 << 4)	/* this is the home-node policy */
 
 #ifdef __KERNEL__
 
@@ -263,7 +264,8 @@ static inline int vma_migratable(struct
 	return 1;
 }
 
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern int mpol_misplaced(struct page *, struct vm_area_struct *,
+			  unsigned long, int);
 
 extern void lazy_migrate_process(struct mm_struct *mm);
 
@@ -393,7 +395,7 @@ static inline int mpol_to_str(char *buff
 }
 
 static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
-				 unsigned long address)
+				 unsigned long address, int multi)
 {
 	return -1; /* no node preference */
 }

--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -160,6 +160,12 @@ struct page {
 					 */
 	void *shadow;
 #endif
+#ifdef CONFIG_NUMA
+	/*
+	 * XXX fold this into flags for 64bit or so...
+	 */
+	int nid_last;
+#endif
 }
 /*
  * The struct page can be forced to be double word aligned so that atomic ops

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -785,6 +785,16 @@ update_stats_curr_start(struct cfs_rq *c
  * Tasks start out with their home-node unset (-1) this effectively means
  * they act !NUMA until we've established the task is busy enough to bother
  * with placement.
+ *
+ * Once we start doing NUMA placement there are two modes, 'small' process-wide
+ * and 'big' per-task. For the small mode we have a process-wide home node
+ * and lazily migrate all memory only when this home-node changes.
+ *
+ * For big mode we keep a home-node per task and use periodic fault scans
+ * to try and establish a task<->page relation. This assumes the task<->page
+ * relation is a compute<->data relation; this is false for things like virt.
+ * and n:m threading solutions, but it's the best we can do given the
+ * information we have.
  */
 
 static unsigned long task_h_load(struct task_struct *p);

@@ -796,6 +806,7 @@ static void account_offnode_enqueue(stru
 	rq->offnode_weight += p->numa_contrib;
 	rq->offnode_running++;
 }
+
 static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
 {
 	rq->offnode_weight -= p->numa_contrib;

@@ -818,6 +829,9 @@ static bool task_numa_big(struct task_st
 	u64 runtime = 0;
 	int weight = 0;
 
+	if (sched_feat(NUMA_FORCE_BIG))
+		return true;
+
 	rcu_read_lock();
 	t = p;
 	do {

@@ -851,7 +865,7 @@ void task_numa_work(struct callback_head
 	unsigned long migrate, next_scan, now = jiffies;
 	struct task_struct *t, *p = current;
 	int node = p->node_last;
-	int big;
+	int big = p->mm->numa_big;
 
 	WARN_ON_ONCE(p != container_of(work, struct task_struct, rcu));

@@ -862,7 +876,7 @@ void task_numa_work(struct callback_head
 		return;
 
 	/*
-	 * Enforce maximal migration frequency..
+	 * Enforce maximal scan/migration frequency..
 	 */
 	migrate = p->mm->numa_next_scan;
 	if (time_before(now, migrate))
 		return;

@@ -872,20 +886,34 @@ void task_numa_work(struct callback_head
 	if (cmpxchg(&p->mm->numa_next_scan, migrate, next_scan) != migrate)
 		return;
 
-	/*
-	 * If this task is too big, we bail on NUMA placement of the process.
-	 */
-	big = p->mm->numa_big = task_numa_big(p);
-	if (big)
-		node = -1;
+	if (!big)
+		big = p->mm->numa_big = task_numa_big(p);
 
-	rcu_read_lock();
-	t = p;
-	do {
-		sched_setnode(t, node);
-	} while ((t = next_thread(p)) != p);
-	rcu_read_unlock();
+	if (big) {
+		/*
+		 * For 'big' processes we do per-thread home-node, combined
+		 * with periodic fault scans.
+		 */
+		if (p->node != node)
+			sched_setnode(p, node);
+	} else {
+		/*
+		 * For 'small' processes we keep the entire process on a
+		 * node and migrate all memory once.
+		 */
+		rcu_read_lock();
+		t = p;
+		do {
+			sched_setnode(t, node);
+		} while ((t = next_thread(p)) != p);
+		rcu_read_unlock();
+	}
 
+	/*
+	 * Trigger fault driven migration; small processes do direct
+	 * lazy migration, big processes do gradual task<->page relations.
+	 * See mpol_misplaced().
+	 */
 	lazy_migrate_process(p->mm);
 }

@@ -902,9 +930,8 @@ void task_tick_numa(struct rq *rq, struc
 
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
-	 * We also bail on placement if we're too big.
 	 */
-	if (!curr->mm || curr->mm->numa_big)
+	if (!curr->mm)
 		return;
 
 	/*

@@ -929,7 +956,13 @@ void task_tick_numa(struct rq *rq, struc
 	curr->node_stamp = now;
 	node = numa_node_id();
 
-	if (curr->node_last == node && curr->node != node) {
+	/*
+	 * 'small' tasks only migrate once when their process home-node
+	 * changes; 'big' tasks need continuous 'migration' faults to
+	 * keep the task<->page map accurate.
+	 */
+	if (curr->node_last == node &&
+	    (curr->node != node || curr->mm->numa_big)) {
 		/*
 		 * We can re-use curr->rcu because we checked curr->mm
 		 * != NULL so release_task()->call_rcu() was not called

--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -72,6 +72,7 @@ SCHED_FEAT(LB_MIN, false)
 
 #ifdef CONFIG_NUMA
 SCHED_FEAT(NUMA,           true)
+SCHED_FEAT(NUMA_FORCE_BIG, false)
 SCHED_FEAT(NUMA_HOT,       true)
 SCHED_FEAT(NUMA_BIAS,      true)
 SCHED_FEAT(NUMA_PULL,      true)
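To make the tick-side trigger in task_tick_numa() above easier to follow,
here is the new condition restated as a stand-alone predicate (plain C
sketch only; struct mock_task and wants_numa_scan() are made-up names,
the real code uses curr->node, curr->node_last and curr->mm->numa_big):

  #include <stdbool.h>

  struct mock_task {
  	int node;		/* home-node, -1 while unset */
  	int node_last;		/* node observed at the previous tick */
  	bool mm_numa_big;	/* process-wide 'big' flag */
  };

  /* mirrors: node_last == node && (node != node_now || numa_big) */
  static bool wants_numa_scan(const struct mock_task *t, int cur_node)
  {
  	/* require two consecutive ticks on the same node to filter noise */
  	if (t->node_last != cur_node)
  		return false;

  	/* 'small' processes only trigger while off their home-node ... */
  	if (!t->mm_numa_big)
  		return t->node != cur_node;

  	/* ... 'big' ones keep sampling to maintain the task<->page map */
  	return true;
  }

NUMA_FORCE_BIG defaults to off; with CONFIG_SCHED_DEBUG it can, like the
other SCHED_FEAT() flags, be toggled at runtime through the sched_features
debugfs file to force every process into the per-task mode for testing.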
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -780,7 +780,7 @@ void do_huge_pmd_prot_none(struct mm_str
 	 * XXX should we serialize against split_huge_page ?
 	 */
 
-	node = mpol_misplaced(page, vma, haddr);
+	node = mpol_misplaced(page, vma, haddr, mm->numa_big);
 	if (node == -1)
 		goto do_fixup;

@@ -1366,6 +1366,7 @@ static void __split_huge_page_refcount(s
 
 	page_tail->mapping = page->mapping;
 	page_tail->index = page->index + i;
+	page_tail->nid_last = page->nid_last;
 
 	BUG_ON(!PageAnon(page_tail));
 	BUG_ON(!PageUptodate(page_tail));

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3471,7 +3471,7 @@ static int do_prot_none(struct mm_struct
 	get_page(page);
 	pte_unmap_unlock(ptep, ptl);
 
-	node = mpol_misplaced(page, vma, address);
+	node = mpol_misplaced(page, vma, address, mm->numa_big);
 	if (node == -1)
 		goto do_fixup;

--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2168,6 +2168,7 @@ mpol_shared_policy_lookup(struct shared_
  * @page   - page to be checked
  * @vma    - vm area where page mapped
  * @addr   - virtual address where page mapped
+ * @multi  - use multi-stage node binding
  *
  * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.

@@ -2179,7 +2180,8 @@ mpol_shared_policy_lookup(struct shared_
  * Policy determination "mimics" alloc_page_vma().
  * Called from fault path where we know the vma and faulting address.
  */
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+		   unsigned long addr, int multi)
 {
 	struct mempolicy *pol;
 	struct zone *zone;

@@ -2230,6 +2232,39 @@ int mpol_misplaced(struct page *page, st
 	default:
 		BUG();
 	}
+
+	/*
+	 * Multi-stage node selection is used in conjunction with a periodic
+	 * migration fault to build a temporal task<->page relation. By
+	 * using a two-stage filter we remove short/unlikely relations.
+	 *
+	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can
+	 * equate a task's usage of a particular page (n_p) per total usage
+	 * of this page (n_t) (in a given time-span) to a probability.
+	 *
+	 * Our periodic faults will then sample this probability and getting
+	 * the same result twice in a row, given these samples are fully
+	 * independent, is then given by P(p)^2, provided our sample period
+	 * is sufficiently short compared to the usage pattern.
+	 *
+	 * This squaring squishes small probabilities, making it less likely
+	 * we act on an unlikely task<->page relation.
+	 *
+	 * NOTE: effectively we're using task-home-node<->page-node relations
+	 * since those are the only thing we can affect.
+	 *
+	 * NOTE: we're using task-home-node as opposed to the current node
+	 * the task might be running on, since the task-home-node is the
+	 * long-term node of this task, further reducing noise. Also see
+	 * task_tick_numa().
+	 */
+	if (multi && (pol->flags & MPOL_F_HOME)) {
+		if (page->nid_last != polnid) {
+			page->nid_last = polnid;
+			goto out;
+		}
+	}
+
 	if (curnid != polnid)
 		ret = polnid;
 out:

@@ -2421,7 +2456,7 @@ void __init numa_policy_init(void)
 		preferred_node_policy[nid] = (struct mempolicy) {
 			.refcnt = ATOMIC_INIT(1),
 			.mode = MPOL_PREFERRED,
-			.flags = MPOL_F_MOF,
+			.flags = MPOL_F_MOF | MPOL_F_HOME,
 			.v = { .preferred_node = nid, },
 		};
 	}
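A worked example (numbers purely illustrative) of the squashing effect
described in the mpol_misplaced() comment above: with P(p) = n_p / n_t and
independent samples, a task responsible for 90% of a page's accesses passes
the two-sample filter with probability 0.9^2 = 0.81, whereas a task
responsible for only 30% of them passes with probability 0.3^2 = 0.09, so
weak task<->page (really home-node<->page) relations rarely trigger a
migration.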