In order to facilitate a lazy -- fault driven -- migration of pages, create
a special transient PROT_NONE variant; we can then use the resulting
'spurious' protection faults to drive our migrations.

Pages that already had an effective PROT_NONE mapping cannot be detected
through these 'spurious' faults, for the simple reason that we cannot
distinguish them by their protection bits; see pte_numa().

This isn't a problem since PROT_NONE (and possibly PROT_WRITE with dirty
tracking) mappings are either unused or rare enough that we do not care
about their placement.

Suggested-by: Rik van Riel
Signed-off-by: Peter Zijlstra
Reviewed-by: Rik van Riel
Cc: Paul Turner
Cc: Linus Torvalds
Cc: Andrew Morton
Cc: Andrea Arcangeli
[ fixed various cross-arch and THP/!THP details ]
Signed-off-by: Ingo Molnar
---
 include/linux/huge_mm.h |   19 ++++++++++++
 include/linux/mm.h      |   18 +++++++++++
 mm/huge_memory.c        |   32 ++++++++++++++++++++
 mm/memory.c             |   75 +++++++++++++++++++++++++++++++++++++++++++-----
 mm/mprotect.c           |   24 ++++++++++-----
 5 files changed, 154 insertions(+), 14 deletions(-)
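A minimal sketch of how this is intended to be driven; the numa_scan_vma()
caller below is hypothetical and not part of this patch, it merely
illustrates applying the transient protection with the change_prot_none()
helper added to include/linux/mm.h, so that the next user access raises the
'spurious' fault which handle_pte_fault() routes to do_numa_page() /
do_huge_pmd_numa_page():

/*
 * Hypothetical caller, not part of this patch: strip READ|WRITE|EXEC
 * from a VMA so the next user access takes a protection fault that
 * pte_numa()/pmd_numa() recognise as the transient PROT_NONE variant.
 */
static void numa_scan_vma(struct vm_area_struct *vma)
{
	/* caller must hold mmap_sem to keep the VMA stable */
	change_prot_none(vma, vma->vm_start, vma->vm_end);
}
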
Index: tip/include/linux/huge_mm.h
===================================================================
--- tip.orig/include/linux/huge_mm.h
+++ tip/include/linux/huge_mm.h
@@ -159,6 +159,13 @@ static inline struct page *compound_tran
 	}
 	return page;
 }
+
+extern bool pmd_numa(struct vm_area_struct *vma, pmd_t pmd);
+
+extern void do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+				  unsigned long address, pmd_t *pmd,
+				  unsigned int flags, pmd_t orig_pmd);
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -195,6 +202,18 @@ static inline int pmd_trans_huge_lock(pm
 {
 	return 0;
 }
+
+static inline bool pmd_numa(struct vm_area_struct *vma, pmd_t pmd)
+{
+	return false;
+}
+
+static inline void do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+					 unsigned long address, pmd_t *pmd,
+					 unsigned int flags, pmd_t orig_pmd)
+{
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
Index: tip/include/linux/mm.h
===================================================================
--- tip.orig/include/linux/mm.h
+++ tip/include/linux/mm.h
@@ -1091,6 +1091,9 @@ extern unsigned long move_page_tables(st
 extern unsigned long do_mremap(unsigned long addr,
			       unsigned long old_len, unsigned long new_len,
			       unsigned long flags, unsigned long new_addr);
+extern void change_protection(struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end, pgprot_t newprot,
+			      int dirty_accountable);
 extern int mprotect_fixup(struct vm_area_struct *vma,
			  struct vm_area_struct **pprev, unsigned long start,
			  unsigned long end, unsigned long newflags);
@@ -1561,6 +1564,21 @@ static inline pgprot_t vm_get_page_prot(
 }
 #endif
 
+static inline pgprot_t vma_prot_none(struct vm_area_struct *vma)
+{
+	/*
+	 * obtain PROT_NONE by removing READ|WRITE|EXEC privs
+	 */
+	vm_flags_t vmflags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
+	return pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vmflags));
+}
+
+static inline void
+change_prot_none(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	change_protection(vma, start, end, vma_prot_none(vma), 0);
+}
+
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
			unsigned long pfn, unsigned long size, pgprot_t);
Index: tip/mm/huge_memory.c
===================================================================
--- tip.orig/mm/huge_memory.c
+++ tip/mm/huge_memory.c
@@ -725,6 +725,38 @@ out:
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
 
+bool pmd_numa(struct vm_area_struct *vma, pmd_t pmd)
+{
+	/*
+	 * See pte_numa().
+	 */
+	if (pmd_same(pmd, pmd_modify(pmd, vma->vm_page_prot)))
+		return false;
+
+	return pmd_same(pmd, pmd_modify(pmd, vma_prot_none(vma)));
+}
+
+void do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmd,
+			   unsigned int flags, pmd_t entry)
+{
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry)))
+		goto out_unlock;
+
+	/* do fancy stuff */
+
+	/* change back to regular protection */
+	entry = pmd_modify(entry, vma->vm_page_prot);
+	if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+		update_mmu_cache_pmd(vma, address, entry);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
Index: tip/mm/memory.c
===================================================================
--- tip.orig/mm/memory.c
+++ tip/mm/memory.c
@@ -1464,6 +1464,25 @@ int zap_vma_ptes(struct vm_area_struct *
 }
 EXPORT_SYMBOL_GPL(zap_vma_ptes);
 
+static bool pte_numa(struct vm_area_struct *vma, pte_t pte)
+{
+	/*
+	 * If we have the normal vma->vm_page_prot protections we're not a
+	 * 'special' PROT_NONE page.
+	 *
+	 * This means we cannot get 'special' PROT_NONE faults from genuine
+	 * PROT_NONE maps, nor from PROT_WRITE file maps that do dirty
+	 * tracking.
+	 *
+	 * Neither case is really interesting for our current use though so we
+	 * don't care.
+	 */
+	if (pte_same(pte, pte_modify(pte, vma->vm_page_prot)))
+		return false;
+
+	return pte_same(pte, pte_modify(pte, vma_prot_none(vma)));
+}
+
 /**
  * follow_page - look up a page descriptor from a user-virtual address
  * @vma: vm_area_struct mapping @address
@@ -3433,6 +3452,41 @@ static int do_nonlinear_fault(struct mm_
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pmd_t *pmd,
+			unsigned int flags, pte_t entry)
+{
+	spinlock_t *ptl;
+	int ret = 0;
+
+	if (!pte_unmap_same(mm, pmd, ptep, entry))
+		goto out;
+
+	/*
+	 * Do fancy stuff...
+	 */
+
+	/*
+	 * OK, nothing to do,.. change the protection back to what it
+	 * ought to be.
+	 */
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*ptep, entry)))
+		goto unlock;
+
+	flush_cache_page(vma, address, pte_pfn(entry));
+
+	ptep_modify_prot_start(mm, address, ptep);
+	entry = pte_modify(entry, vma->vm_page_prot);
+	ptep_modify_prot_commit(mm, address, ptep, entry);
+
+	update_mmu_cache(vma, address, ptep);
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return ret;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3471,6 +3525,9 @@ int handle_pte_fault(struct mm_struct *m
					pte, pmd, flags, entry);
 	}
 
+	if (pte_numa(vma, entry))
+		return do_numa_page(mm, vma, address, pte, pmd, flags, entry);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3535,13 +3592,16 @@ retry:
						  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
-		int ret;
+		int ret = 0;
 
 		barrier();
-		if (pmd_trans_huge(orig_pmd)) {
-			if (flags & FAULT_FLAG_WRITE &&
-			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
+		if (pmd_trans_huge(orig_pmd) && !pmd_trans_splitting(orig_pmd)) {
+			if (pmd_numa(vma, orig_pmd)) {
+				do_huge_pmd_numa_page(mm, vma, address, pmd,
+						      flags, orig_pmd);
+			}
+
+			if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
							  orig_pmd);
 				/*
@@ -3551,12 +3611,13 @@ retry:
				 */
				if (unlikely(ret & VM_FAULT_OOM))
					goto retry;
-				return ret;
 			}
-			return 0;
+
+			return ret;
 		}
 	}
 
+
 	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if an huge pmd could
Index: tip/mm/mprotect.c
===================================================================
--- tip.orig/mm/mprotect.c
+++ tip/mm/mprotect.c
@@ -112,7 +112,7 @@ static inline void change_pud_range(stru
 	} while (pud++, addr = next, addr != end);
 }
 
-static void change_protection(struct vm_area_struct *vma,
+static void change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable)
 {
@@ -134,6 +134,20 @@ static void change_protection(struct vm_
 	flush_tlb_range(vma, start, end);
 }
 
+void change_protection(struct vm_area_struct *vma, unsigned long start,
+		       unsigned long end, pgprot_t newprot,
+		       int dirty_accountable)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	mmu_notifier_invalidate_range_start(mm, start, end);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_change_protection(vma, start, end, newprot);
+	else
+		change_protection_range(vma, start, end, newprot, dirty_accountable);
+	mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
 int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
 {
@@ -206,12 +220,8 @@ success:
		dirty_accountable = 1;
	}
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
-	if (is_vm_hugetlb_page(vma))
-		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
-	else
-		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
-	mmu_notifier_invalidate_range_end(mm, start, end);
+	change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+
	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
	perf_event_mmap(vma);
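
The '/* do fancy stuff */' placeholders in do_huge_pmd_numa_page() and
do_numa_page() are intentionally left empty; the actual placement policy and
the page migration itself are expected to come from follow-up patches. As a
rough sketch of the kind of check that could slot in there once the pte has
been re-validated under the page table lock -- the numa_misplaced() helper
below is hypothetical and not introduced by this patch:

/*
 * Hypothetical placement check, not part of this patch: compare the node
 * of the page behind the faulting pte with the node the task is currently
 * running on.  The migration machinery is left to later patches.
 */
static bool numa_misplaced(struct vm_area_struct *vma,
			   unsigned long address, pte_t entry)
{
	struct page *page = vm_normal_page(vma, address, entry);

	if (!page)
		return false;

	return page_to_nid(page) != numa_node_id();
}

Until such a check and the corresponding migration exist, the fault handlers
simply restore vma->vm_page_prot as shown above.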