Subject: sched,numa: be more careful about joining numa groups

Due to the way the pid is truncated, and tasks are moved between CPUs by
the scheduler, it is possible for the current task_numa_fault to group
together tasks that do not actually share memory.

This patch adds a few easy sanity checks to task_numa_fault, joining
tasks together if they share the same tsk->mm, or if the fault was on a
page with an elevated mapcount, in a shared VMA.

Signed-off-by: Rik van Riel
---
 include/linux/sched.h |  6 ++++--
 kernel/sched/fair.c   | 23 +++++++++++++++++------
 mm/huge_memory.c      |  4 +++-
 mm/memory.c           |  8 ++++++--
 4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44a7cc7..de942a8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1450,13 +1450,15 @@ struct task_struct {
 #define TNF_NO_GROUP	0x02
 
 #ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int last_node, int node, int pages, int flags);
+extern void task_numa_fault(int last_node, int node, int pages, int flags,
+			    struct vm_area_struct *vma, int mapcount);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
-				   int flags)
+				   int flags, struct vm_area_struct *vma,
+				   int mapcount)
 {
 }
 static inline pid_t task_numa_group_id(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8b3d877..22e859f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1323,7 +1323,8 @@ static void double_lock(spinlock_t *l1, spinlock_t *l2)
 	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
 }
 
-static void task_numa_group(struct task_struct *p, int cpu, int pid)
+static void task_numa_group(struct task_struct *p, int cpu, int pid,
+			    struct vm_area_struct *vma, int mapcount)
 {
 	struct numa_group *grp, *my_grp;
 	struct task_struct *tsk;
@@ -1380,10 +1381,19 @@ static void task_numa_group(struct task_struct *p, int cpu, int pid)
 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
 		goto unlock;
 
-	if (!get_numa_group(grp))
-		goto unlock;
+	/* Always join threads in the same process. */
+	if (tsk->mm == current->mm)
+		join = true;
+
+	/*
+	 * Simple filter to avoid false positives due to PID collisions,
+	 * accesses on KSM shared pages, etc...
+	 */
+	if (mapcount > 1 && (vma->vm_flags & VM_SHARED))
+		join = true;
 
-	join = true;
+	if (join && !get_numa_group(grp))
+		join = false;
 
 unlock:
 	rcu_read_unlock();
@@ -1437,7 +1447,8 @@ void task_numa_free(struct task_struct *p)
 /*
  * Got a PROT_NONE fault for a page on @node.
 */
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int last_cpupid, int node, int pages, int flags,
+		     struct vm_area_struct *vma, int mapcount)
 {
 	struct task_struct *p = current;
 	bool migrated = flags & TNF_MIGRATED;
@@ -1478,7 +1489,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 		priv = (pid == (p->pid & LAST__PID_MASK));
 
 		if (!priv && !(flags & TNF_NO_GROUP))
-			task_numa_group(p, cpu, pid);
+			task_numa_group(p, cpu, pid, vma, mapcount);
 	}
 
 	/*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6f883df..a175191 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1298,6 +1298,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	bool page_locked;
 	bool migrated = false;
 	int flags = 0;
+	int mapcount = 0;
 
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1306,6 +1307,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = pmd_page(pmd);
 	BUG_ON(is_huge_zero_page(page));
 	page_nid = page_to_nid(page);
+	mapcount = page_mapcount(page);
 	last_cpupid = page_cpupid_last(page);
 	count_vm_numa_event(NUMA_HINT_FAULTS);
 	if (page_nid == this_nid)
@@ -1388,7 +1390,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_unlock_anon_vma_read(anon_vma);
 
 	if (page_nid != -1)
-		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags, vma, mapcount);
 
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 2e1d43b..8cef83c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3545,6 +3545,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int target_nid;
 	bool migrated = false;
 	int flags = 0;
+	int mapcount = 0;
 
 	/*
 	 * The "pte" at this point cannot be used safely without
@@ -3583,6 +3584,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	last_cpupid = page_cpupid_last(page);
 	page_nid = page_to_nid(page);
+	mapcount = page_mapcount(page);
 	target_nid = numa_migrate_prep(page, vma, addr, page_nid);
 	pte_unmap_unlock(ptep, ptl);
 	if (target_nid == -1) {
@@ -3599,7 +3601,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 out:
 	if (page_nid != -1)
-		task_numa_fault(last_cpupid, page_nid, 1, flags);
+		task_numa_fault(last_cpupid, page_nid, 1, flags, vma, mapcount);
 	return 0;
 }
 
@@ -3641,6 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		int target_nid;
 		bool migrated = false;
 		int flags = 0;
+		int mapcount;
 
 		if (!pte_present(pteval))
 			continue;
@@ -3670,6 +3673,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		last_cpupid = page_cpupid_last(page);
 		page_nid = page_to_nid(page);
+		mapcount = page_mapcount(page);
 		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
 		pte_unmap_unlock(pte, ptl);
 		if (target_nid != -1) {
@@ -3683,7 +3687,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 
 		if (page_nid != -1)
-			task_numa_fault(last_cpupid, page_nid, 1, flags);
+			task_numa_fault(last_cpupid, page_nid, 1, flags, vma, mapcount);
 
 		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
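
[Editor's note, not part of the patch: a minimal userspace sketch of the pid
truncation problem the changelog describes. It assumes the 8-bit
LAST__PID_SHIFT / LAST__PID_MASK encoding used by the cpupid bits at the
time; the pid values are hypothetical. Two unrelated tasks whose pids differ
by a multiple of 1 << LAST__PID_SHIFT truncate to the same value, which is
why task_numa_group() above needs the extra same-mm / shared-VMA checks
before joining groups.]

/* Illustrative only; macro values mirror the kernel's cpupid encoding. */
#include <stdio.h>

#define LAST__PID_SHIFT	8
#define LAST__PID_MASK	((1 << LAST__PID_SHIFT) - 1)

int main(void)
{
	int pid_a = 1234;		/* hypothetical task A */
	int pid_b = 1234 + 256;		/* unrelated task B, pid differs by 1 << LAST__PID_SHIFT */

	/*
	 * Both pids truncate to the same 8-bit value, so a fault recorded
	 * with task B's cpupid can be misattributed to task A's numa group.
	 */
	printf("A: %d -> %d\n", pid_a, pid_a & LAST__PID_MASK);
	printf("B: %d -> %d\n", pid_b, pid_b & LAST__PID_MASK);
	return 0;
}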