Commit af2c1401e6f9 ("mm: numa: guarantee that tlb_flush_pending
updates are visible before page table updates") added
smp_mb__before_spinlock() to set_tlb_flush_pending(). I think we can
solve the same problem without this barrier.

If instead we mandate that mm_tlb_flush_pending() is used while
holding the PTL, we're guaranteed to observe prior
set_tlb_flush_pending() instances.

For this to work we need to rework migrate_misplaced_transhuge_page()
a little and move the test up into do_huge_pmd_numa_page().

Cc: Mel Gorman
Cc: Rik van Riel
Signed-off-by: Peter Zijlstra (Intel)
---
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -527,18 +527,16 @@ static inline cpumask_t *mm_cpumask(stru
  */
 static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
 {
-	barrier();
+	/*
+	 * Must be called with PTL held; such that our PTL acquire will have
+	 * observed the store from set_tlb_flush_pending().
+	 */
 	return mm->tlb_flush_pending;
 }
 static inline void set_tlb_flush_pending(struct mm_struct *mm)
 {
 	mm->tlb_flush_pending = true;
-
-	/*
-	 * Guarantee that the tlb_flush_pending store does not leak into the
-	 * critical section updating the page tables
-	 */
-	smp_mb__before_spinlock();
+	barrier();
 }
 /* Clearing is done after a TLB flush, which also provides a barrier. */
 static inline void clear_tlb_flush_pending(struct mm_struct *mm)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1410,6 +1410,7 @@ int do_huge_pmd_numa_page(struct vm_faul
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 	int page_nid = -1, this_nid = numa_node_id();
 	int target_nid, last_cpupid = -1;
+	bool need_flush = false;
 	bool page_locked;
 	bool migrated = false;
 	bool was_writable;
@@ -1490,10 +1491,29 @@ int do_huge_pmd_numa_page(struct vm_faul
 	}
 
 	/*
+	 * Since we took the NUMA fault, we must have observed the !accessible
+	 * bit. Make sure all other CPUs agree with that, to avoid them
+	 * modifying the page we're about to migrate.
+	 *
+	 * Must be done under PTL such that we'll observe the relevant
+	 * set_tlb_flush_pending().
+	 */
+	if (mm_tlb_flush_pending(mm))
+		need_flush = true;
+
+	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and access rights restored.
 	 */
 	spin_unlock(vmf->ptl);
+
+	/*
+	 * We are not sure a pending tlb flush here is for a huge page
+	 * mapping or not. Hence use the tlb range variant
+	 */
+	if (need_flush)
+		flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
 	migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
 				vmf->pmd, pmd, vmf->address, page, target_nid);
 	if (migrated) {
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1935,12 +1935,6 @@ int migrate_misplaced_transhuge_page(str
 		put_page(new_page);
 		goto out_fail;
 	}
-	/*
-	 * We are not sure a pending tlb flush here is for a huge page
-	 * mapping or not. Hence use the tlb range variant
-	 */
-	if (mm_tlb_flush_pending(mm))
-		flush_tlb_range(vma, mmun_start, mmun_end);
 
 	/* Prepare a page as a migration target */
 	__SetPageLocked(new_page);