Subject: sched,numa: be more careful about joining numa groups

Due to the way the pid is truncated, and tasks are moved between CPUs by
the scheduler, it is possible for the current task_numa_fault to group
together tasks that do not actually share memory.

This patch adds a few easy sanity checks to task_numa_fault, joining
tasks together if they share the same tsk->mm, or if the fault was on a
page with an elevated mapcount, in a shared VMA.

Signed-off-by: Rik van Riel
---
 include/linux/sched.h |  6 ++++--
 kernel/sched/fair.c   | 23 +++++++++++++++++------
 mm/huge_memory.c      |  4 +++-
 mm/memory.c           |  8 ++++++--
 4 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44a7cc7..de942a8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1450,13 +1450,15 @@ struct task_struct {
 #define TNF_NO_GROUP	0x02
 
 #ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int last_node, int node, int pages, int flags);
+extern void task_numa_fault(int last_node, int node, int pages, int flags,
+			    struct vm_area_struct *vma, int mapcount);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
-				   int flags)
+				   int flags, struct vm_area_struct *vma,
+				   int mapcount)
 {
 }
 static inline pid_t task_numa_group_id(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8b3d877..22e859f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1323,7 +1323,8 @@ static void double_lock(spinlock_t *l1, spinlock_t *l2)
 	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
 }
 
-static void task_numa_group(struct task_struct *p, int cpu, int pid)
+static void task_numa_group(struct task_struct *p, int cpu, int pid,
+			    struct vm_area_struct *vma, int mapcount)
 {
 	struct numa_group *grp, *my_grp;
 	struct task_struct *tsk;
@@ -1380,10 +1381,19 @@ static void task_numa_group(struct task_struct *p, int cpu, int pid)
 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
 		goto unlock;
 
-	if (!get_numa_group(grp))
-		goto unlock;
+	/* Always join threads in the same process. */
+	if (tsk->mm == current->mm)
+		join = true;
+
+	/*
+	 * Simple filter to avoid false positives due to PID collisions,
+	 * accesses on KSM shared pages, etc...
+	 */
+	if (mapcount > 1 && (vma->vm_flags & VM_SHARED))
+		join = true;
 
-	join = true;
+	if (join && !get_numa_group(grp))
+		join = false;
 
 unlock:
 	rcu_read_unlock();
@@ -1437,7 +1447,8 @@ void task_numa_free(struct task_struct *p)
 /*
  * Got a PROT_NONE fault for a page on @node.
 */
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int last_cpupid, int node, int pages, int flags,
+		     struct vm_area_struct *vma, int mapcount)
 {
 	struct task_struct *p = current;
 	bool migrated = flags & TNF_MIGRATED;
@@ -1478,7 +1489,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
 		priv = (pid == (p->pid & LAST__PID_MASK));
 
 		if (!priv && !(flags & TNF_NO_GROUP))
-			task_numa_group(p, cpu, pid);
+			task_numa_group(p, cpu, pid, vma, mapcount);
 	}
 
 	/*
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6f883df..a175191 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1298,6 +1298,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	bool page_locked;
 	bool migrated = false;
 	int flags = 0;
+	int mapcount = 0;
 
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1306,6 +1307,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = pmd_page(pmd);
 	BUG_ON(is_huge_zero_page(page));
 	page_nid = page_to_nid(page);
+	mapcount = page_mapcount(page);
 	last_cpupid = page_cpupid_last(page);
 	count_vm_numa_event(NUMA_HINT_FAULTS);
 	if (page_nid == this_nid)
@@ -1388,7 +1390,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_unlock_anon_vma_read(anon_vma);
 
 	if (page_nid != -1)
-		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+		task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags, vma, mapcount);
 
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 2e1d43b..8cef83c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3545,6 +3545,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int target_nid;
 	bool migrated = false;
 	int flags = 0;
+	int mapcount = 0;
 
 	/*
 	 * The "pte" at this point cannot be used safely without
@@ -3583,6 +3584,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	last_cpupid = page_cpupid_last(page);
 	page_nid = page_to_nid(page);
+	mapcount = page_mapcount(page);
 	target_nid = numa_migrate_prep(page, vma, addr, page_nid);
 	pte_unmap_unlock(ptep, ptl);
 	if (target_nid == -1) {
@@ -3599,7 +3601,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 out:
 	if (page_nid != -1)
-		task_numa_fault(last_cpupid, page_nid, 1, flags);
+		task_numa_fault(last_cpupid, page_nid, 1, flags, vma, mapcount);
 	return 0;
 }
 
@@ -3641,6 +3643,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		int target_nid;
 		bool migrated = false;
 		int flags = 0;
+		int mapcount;
 
 		if (!pte_present(pteval))
 			continue;
@@ -3670,6 +3673,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		last_cpupid = page_cpupid_last(page);
 		page_nid = page_to_nid(page);
+		mapcount = page_mapcount(page);
 		target_nid = numa_migrate_prep(page, vma, addr, page_nid);
 		pte_unmap_unlock(pte, ptl);
 		if (target_nid != -1) {
@@ -3683,7 +3687,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 
 		if (page_nid != -1)
-			task_numa_fault(last_cpupid, page_nid, 1, flags);
+			task_numa_fault(last_cpupid, page_nid, 1, flags, vma, mapcount);
 
 		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}
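
[Editor's note, not part of the patch: a minimal userspace sketch of the pid
truncation problem the changelog describes. It assumes the 8-bit
LAST__PID_SHIFT / LAST__PID_MASK encoding used by the cpupid bits at the
time; the pid values are hypothetical. Two unrelated tasks whose pids differ
by a multiple of 1 << LAST__PID_SHIFT truncate to the same value, which is
why task_numa_group() above needs the extra same-mm / shared-VMA checks
before joining groups.]

/* Illustrative only; macro values mirror the kernel's cpupid encoding. */
#include <stdio.h>

#define LAST__PID_SHIFT	8
#define LAST__PID_MASK	((1 << LAST__PID_SHIFT) - 1)

int main(void)
{
	int pid_a = 1234;		/* hypothetical task A */
	int pid_b = 1234 + 256;		/* unrelated task B, pid differs by 1 << LAST__PID_SHIFT */

	/*
	 * Both pids truncate to the same 8-bit value, so a fault recorded
	 * with task B's cpupid can be misattributed to task A's numa group.
	 */
	printf("A: %d -> %d\n", pid_a, pid_a & LAST__PID_MASK);
	printf("B: %d -> %d\n", pid_b, pid_b & LAST__PID_MASK);
	return 0;
}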