In order to facilitate a lazy -- fault driven -- migration of pages, create
a special transient PROT_NONE variant; we can then use the 'spurious'
protection faults it generates to drive our migrations.

Pages that already had an effective PROT_NONE mapping will not generate
these 'spurious' faults, for the simple reason that we cannot distinguish
them by their protection bits; see pte_prot_none(). This isn't a problem,
since PROT_NONE (and possibly PROT_WRITE with dirty tracking) mappings are
either unused or rare enough that we do not care about their placement.

Suggested-by: Rik van Riel
Cc: Paul Turner
Signed-off-by: Peter Zijlstra
---
 include/linux/huge_mm.h   |    3 +
 include/linux/mempolicy.h |    4 +-
 include/linux/mm.h        |   12 ++++++
 mm/huge_memory.c          |   21 +++++++++++
 mm/memory.c               |   86 ++++++++++++++++++++++++++++++++++++++++++----
 mm/mempolicy.c            |   24 ++++++++++++
 mm/mprotect.c             |   24 +++++++++---
 7 files changed, 159 insertions(+), 15 deletions(-)

--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,6 +11,9 @@ extern int copy_huge_pmd(struct mm_struc
 extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       pmd_t orig_pmd);
+extern void do_huge_pmd_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+				  unsigned long address, pmd_t *pmd,
+				  unsigned int flags, pmd_t orig_pmd);
 extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
 extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
					  unsigned long addr,
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -254,7 +254,9 @@ static inline int vma_migratable(struct
 	return 1;
 }
 
-#else
+extern void lazy_migrate_process(struct mm_struct *mm);
+
+#else /* CONFIG_NUMA */
 
 struct mempolicy {};
 
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1046,6 +1046,9 @@ extern unsigned long move_page_tables(st
 extern unsigned long do_mremap(unsigned long addr,
			       unsigned long old_len, unsigned long new_len,
			       unsigned long flags, unsigned long new_addr);
+extern void change_protection(struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end, pgprot_t newprot,
+			      int dirty_accountable);
 extern int mprotect_fixup(struct vm_area_struct *vma,
			  struct vm_area_struct **pprev, unsigned long start,
			  unsigned long end, unsigned long newflags);
@@ -1495,6 +1498,15 @@ static inline pgprot_t vm_get_page_prot(
 }
 #endif
 
+static inline pgprot_t vma_prot_none(struct vm_area_struct *vma)
+{
+	/*
+	 * obtain PROT_NONE by removing READ|WRITE|EXEC privs
+	 */
+	vm_flags_t vmflags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
+	return pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vmflags));
+}
+
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
			unsigned long pfn, unsigned long size, pgprot_t);
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -750,6 +750,27 @@ int do_huge_pmd_anonymous_page(struct mm
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
 
+void do_huge_pmd_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmd,
+			   unsigned int flags, pmd_t entry)
+{
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry)))
+		goto out_unlock;
+
+	/* do fancy stuff */
+
+	/* change back to regular protection */
+	entry = pmd_modify(entry, vma->vm_page_prot);
+	if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+		update_mmu_cache(vma, address, entry);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *vma)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3415,6 +3415,71 @@ static int do_nonlinear_fault(struct mm_
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+static bool pte_prot_none(struct vm_area_struct *vma, pte_t pte)
+{
+	/*
+	 * If we have the normal vma->vm_page_prot protections we're not a
+	 * 'special' PROT_NONE page.
+	 *
+	 * This means we cannot get 'special' PROT_NONE faults from genuine
+	 * PROT_NONE maps, nor from PROT_WRITE file maps that do dirty
+	 * tracking.
+	 *
+	 * Neither case is really interesting for our current use though so we
+	 * don't care.
+	 */
+	if (pte_same(pte, pte_modify(pte, vma->vm_page_prot)))
+		return false;
+
+	return pte_same(pte, pte_modify(pte, vma_prot_none(vma)));
+}
+
+static bool pmd_prot_none(struct vm_area_struct *vma, pmd_t pmd)
+{
+	/*
+	 * See pte_prot_none().
+	 */
+	if (pmd_same(pmd, pmd_modify(pmd, vma->vm_page_prot)))
+		return false;
+
+	return pmd_same(pmd, pmd_modify(pmd, vma_prot_none(vma)));
+}
+
+static int do_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pmd_t *pmd,
+			unsigned int flags, pte_t entry)
+{
+	spinlock_t *ptl;
+	int ret = 0;
+
+	if (!pte_unmap_same(mm, pmd, ptep, entry))
+		goto out;
+
+	/*
+	 * Do fancy stuff...
+	 */
+
+	/*
+	 * OK, nothing to do,.. change the protection back to what it
+	 * ought to be.
+	 */
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*ptep, entry)))
+		goto unlock;
+
+	flush_cache_page(vma, address, pte_pfn(entry));
+
+	ptep_modify_prot_start(mm, address, ptep);
+	entry = pte_modify(entry, vma->vm_page_prot);
+	ptep_modify_prot_commit(mm, address, ptep, entry);
+
+	update_mmu_cache(vma, address, ptep);
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return ret;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3453,6 +3518,9 @@ int handle_pte_fault(struct mm_struct *m
					pte, pmd, flags, entry);
 	}
 
+	if (pte_prot_none(vma, entry))
+		return do_prot_none(mm, vma, address, pte, pmd, flags, entry);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3517,13 +3585,16 @@ int handle_mm_fault(struct mm_struct *mm
							  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
-		int ret;
+		int ret = 0;
 
 		barrier();
-		if (pmd_trans_huge(orig_pmd)) {
-			if (flags & FAULT_FLAG_WRITE &&
-			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
+		if (pmd_trans_huge(orig_pmd) && !pmd_trans_splitting(orig_pmd)) {
+			if (pmd_prot_none(vma, orig_pmd)) {
+				do_huge_pmd_prot_none(mm, vma, address, pmd,
+						      flags, orig_pmd);
+			}
+
+			if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
							  orig_pmd);
				/*
@@ -3533,12 +3604,13 @@ int handle_mm_fault(struct mm_struct *mm
				 */
				if (unlikely(ret & VM_FAULT_OOM))
					goto retry;
-				return ret;
			}
-			return 0;
+
+			return ret;
		}
	}
 
+
	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if an huge pmd could
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -565,6 +565,12 @@ static inline int check_pgd_range(struct
 	return 0;
 }
 
+static void
+change_prot_none(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	change_protection(vma, start, end, vma_prot_none(vma), 0);
+}
+
 /*
  * Check if all pages in a range are on a set of nodes.
  * If pagelist != NULL then isolate pages from the LRU and
@@ -1197,6 +1203,24 @@ static long do_mbind(unsigned long start
 	return err;
 }
 
+static void lazy_migrate_vma(struct vm_area_struct *vma)
+{
+	if (!vma_migratable(vma))
+		return;
+
+	change_prot_none(vma, vma->vm_start, vma->vm_end);
+}
+
+void lazy_migrate_process(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		lazy_migrate_vma(vma);
+	up_read(&mm->mmap_sem);
+}
+
 /*
  * User space interface with variable sized bitmaps for nodelists.
  */
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -119,7 +119,7 @@ static inline void change_pud_range(stru
 	} while (pud++, addr = next, addr != end);
 }
 
-static void change_protection(struct vm_area_struct *vma,
+static void change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable)
 {
@@ -141,6 +141,20 @@ static void change_protection(struct vm_
 	flush_tlb_range(vma, start, end);
 }
 
+void change_protection(struct vm_area_struct *vma, unsigned long start,
+		       unsigned long end, pgprot_t newprot,
+		       int dirty_accountable)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	mmu_notifier_invalidate_range_start(mm, start, end);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_change_protection(vma, start, end, newprot);
+	else
+		change_protection_range(vma, start, end, newprot, dirty_accountable);
+	mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
@@ -213,12 +227,8 @@ mprotect_fixup(struct vm_area_struct *vm
		dirty_accountable = 1;
	}
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
-	if (is_vm_hugetlb_page(vma))
-		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
-	else
-		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
-	mmu_notifier_invalidate_range_end(mm, start, end);
+	change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+
	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
	perf_event_mmap(vma);
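
For context, a minimal sketch of how this infrastructure is meant to be
driven (illustrative only; numa_placement_tick() is a hypothetical caller
and not part of this patch -- only lazy_migrate_process() is introduced
here):

	#include <linux/mempolicy.h>
	#include <linux/sched.h>

	/*
	 * Hypothetical periodic hook: mark the task's whole address space
	 * with the transient PROT_NONE protection.  Every subsequent user
	 * access then takes a 'spurious' protection fault, giving
	 * do_prot_none() / do_huge_pmd_prot_none() a place to make a
	 * placement decision before restoring vma->vm_page_prot.
	 */
	static void numa_placement_tick(struct task_struct *p)
	{
		struct mm_struct *mm = p->mm;

		if (!mm)	/* kernel threads have no user address space */
			return;

		lazy_migrate_process(mm);	/* introduced by this patch */
	}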