lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20161115045714.GB8738@hori1.linux.bs1.fc.nec.co.jp>
Date:   Tue, 15 Nov 2016 04:57:15 +0000
From:   Naoya Horiguchi <n-horiguchi@...jp.nec.com>
To:     "Kirill A. Shutemov" <kirill@...temov.name>
CC:     "linux-mm@...ck.org" <linux-mm@...ck.org>,
        "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
        Hugh Dickins <hughd@...gle.com>,
        "Andrew Morton" <akpm@...ux-foundation.org>,
        Dave Hansen <dave.hansen@...el.com>,
        Andrea Arcangeli <aarcange@...hat.com>,
        Mel Gorman <mgorman@...hsingularity.net>,
        Michal Hocko <mhocko@...nel.org>,
        "Vlastimil Babka" <vbabka@...e.cz>,
        Pavel Emelyanov <xemul@...allels.com>,
        Zi Yan <zi.yan@...rutgers.edu>,
        Balbir Singh <bsingharora@...il.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "Naoya Horiguchi" <nao.horiguchi@...il.com>
Subject: Re: [PATCH v2 05/12] mm: thp: add core routines for thp/pmd
 migration

On Mon, Nov 14, 2016 at 02:45:03PM +0300, Kirill A. Shutemov wrote:
> On Tue, Nov 08, 2016 at 08:31:50AM +0900, Naoya Horiguchi wrote:
> > This patch prepares thp migration's core code. This code will be enabled when
> > unmap_and_move() stops unconditionally splitting thp and get_new_page() starts
> > to allocate destination thps.
> > 
> > Signed-off-by: Naoya Horiguchi <n-horiguchi@...jp.nec.com>
> > ---
> > ChangeLog v1 -> v2:
> > - support pte-mapped thp, doubly-mapped thp
> > ---
> >  arch/x86/include/asm/pgtable_64.h |   2 +
> >  include/linux/swapops.h           |  61 +++++++++++++++
> >  mm/huge_memory.c                  | 154 ++++++++++++++++++++++++++++++++++++++
> >  mm/migrate.c                      |  44 ++++++++++-
> >  mm/pgtable-generic.c              |   3 +-
> >  5 files changed, 262 insertions(+), 2 deletions(-)
> > 
> > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/include/asm/pgtable_64.h v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/include/asm/pgtable_64.h
> > index 1cc82ec..3a1b48e 100644
> > --- v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/include/asm/pgtable_64.h
> > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/include/asm/pgtable_64.h
> > @@ -167,7 +167,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
> >  					 ((type) << (SWP_TYPE_FIRST_BIT)) \
> >  					 | ((offset) << SWP_OFFSET_FIRST_BIT) })
> >  #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
> > +#define __pmd_to_swp_entry(pte)		((swp_entry_t) { pmd_val((pmd)) })
> >  #define __swp_entry_to_pte(x)		((pte_t) { .pte = (x).val })
> > +#define __swp_entry_to_pmd(x)		((pmd_t) { .pmd = (x).val })
> >  
> >  extern int kern_addr_valid(unsigned long addr);
> >  extern void cleanup_highmap(void);
> > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/swapops.h v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/swapops.h
> > index 5c3a5f3..b6b22a2 100644
> > --- v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/swapops.h
> > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/swapops.h
> > @@ -163,6 +163,67 @@ static inline int is_write_migration_entry(swp_entry_t entry)
> >  
> >  #endif
> >  
> > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > +extern void set_pmd_migration_entry(struct page *page,
> > +		struct vm_area_struct *vma, unsigned long address);
> > +
> > +extern int remove_migration_pmd(struct page *new, pmd_t *pmd,
> > +		struct vm_area_struct *vma, unsigned long addr, void *old);
> > +
> > +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
> > +
> > +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> > +{
> > +	swp_entry_t arch_entry;
> > +
> > +	arch_entry = __pmd_to_swp_entry(pmd);
> > +	return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
> > +}
> > +
> > +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> > +{
> > +	swp_entry_t arch_entry;
> > +
> > +	arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
> > +	return __swp_entry_to_pmd(arch_entry);
> > +}
> > +
> > +static inline int is_pmd_migration_entry(pmd_t pmd)
> > +{
> > +	return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
> > +}
> > +#else
> > +static inline void set_pmd_migration_entry(struct page *page,
> > +			struct vm_area_struct *vma, unsigned long address)
> > +{
> 
> VM_BUG()? Or BUILD_BUG()?

These should be compiled out, so BUILD_BUG() seems better to me.
The 3 routines below will be handled in the same manner.

> > +}
> > +
> > +static inline int remove_migration_pmd(struct page *new, pmd_t *pmd,
> > +		struct vm_area_struct *vma, unsigned long addr, void *old)
> > +{
> > +	return 0;
> 
> Ditto.
> 
> > +}
> > +
> > +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }
> > +
> > +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> > +{
> > +	return swp_entry(0, 0);
> 
> Ditto.
> 
> > +}
> > +
> > +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> > +{
> > +	pmd_t pmd = {};
> 
> Ditto.
> 
> > +	return pmd;
> > +}
> > +
> > +static inline int is_pmd_migration_entry(pmd_t pmd)
> > +{
> > +	return 0;
> > +}
> > +#endif
> > +
> >  #ifdef CONFIG_MEMORY_FAILURE
> >  
> >  extern atomic_long_t num_poisoned_pages __read_mostly;
> > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c
> > index 0509d17..b3022b3 100644
> > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c
> > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c
> > @@ -2310,3 +2310,157 @@ static int __init split_huge_pages_debugfs(void)
> >  }
> >  late_initcall(split_huge_pages_debugfs);
> >  #endif
> > +
> > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > +void set_pmd_migration_entry(struct page *page, struct vm_area_struct *vma,
> > +				unsigned long addr)
> > +{
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	pgd_t *pgd;
> > +	pud_t *pud;
> > +	pmd_t *pmd;
> > +	pmd_t pmdval;
> > +	swp_entry_t entry;
> > +	spinlock_t *ptl;
> > +
> > +	pgd = pgd_offset(mm, addr);
> > +	if (!pgd_present(*pgd))
> > +		return;
> > +	pud = pud_offset(pgd, addr);
> > +	if (!pud_present(*pud))
> > +		return;
> > +	pmd = pmd_offset(pud, addr);
> > +	pmdval = *pmd;
> > +	barrier();
> > +	if (!pmd_present(pmdval))
> > +		return;
> > +
> > +	mmu_notifier_invalidate_range_start(mm, addr, addr + HPAGE_PMD_SIZE);
> > +	if (pmd_trans_huge(pmdval)) {
> > +		pmd_t pmdswp;
> > +
> > +		ptl = pmd_lock(mm, pmd);
> > +		if (!pmd_present(*pmd))
> > +			goto unlock_pmd;
> > +		if (unlikely(!pmd_trans_huge(*pmd)))
> > +			goto unlock_pmd;
> 
> Just check *pmd == pmdval?

OK.

> 
> > +		if (pmd_page(*pmd) != page)
> > +			goto unlock_pmd;
> > +
> > +		pmdval = pmdp_huge_get_and_clear(mm, addr, pmd);
> > +		if (pmd_dirty(pmdval))
> > +			set_page_dirty(page);
> > +		entry = make_migration_entry(page, pmd_write(pmdval));
> > +		pmdswp = swp_entry_to_pmd(entry);
> > +		pmdswp = pmd_mkhuge(pmdswp);
> > +		set_pmd_at(mm, addr, pmd, pmdswp);
> > +		page_remove_rmap(page, true);
> > +		put_page(page);
> > +unlock_pmd:
> > +		spin_unlock(ptl);
> > +	} else { /* pte-mapped thp */
> > +		pte_t *pte;
> > +		pte_t pteval;
> > +		struct page *tmp = compound_head(page);
> > +		unsigned long address = addr & HPAGE_PMD_MASK;
> > +		pte_t swp_pte;
> > +		int i;
> > +
> > +		pte = pte_offset_map(pmd, address);
> > +		ptl = pte_lockptr(mm, pmd);
> > +		spin_lock(ptl);
> 
> pte_offset_map_lock() ?

Right.

> > +		for (i = 0; i < HPAGE_PMD_NR; i++, pte++, tmp++) {
> > +			if (!(pte_present(*pte) &&
> > +			      page_to_pfn(tmp) == pte_pfn(*pte)))
> 
> 			if (!pte_present(*pte) || pte_page(*pte) != tmp) ?

Yes, this is shorter/simpler.

> 
> > +				continue;
> > +			pteval = ptep_clear_flush(vma, address, pte);
> > +			if (pte_dirty(pteval))
> > +				set_page_dirty(tmp);
> > +			entry = make_migration_entry(tmp, pte_write(pteval));
> > +			swp_pte = swp_entry_to_pte(entry);
> > +			set_pte_at(mm, address, pte, swp_pte);
> > +			page_remove_rmap(tmp, false);
> > +			put_page(tmp);
> > +		}
> > +		pte_unmap_unlock(pte, ptl);
> > +	}
> > +	mmu_notifier_invalidate_range_end(mm, addr, addr + HPAGE_PMD_SIZE);
> > +	return;
> > +}
> > +
> > +int remove_migration_pmd(struct page *new, pmd_t *pmd,
> > +		struct vm_area_struct *vma, unsigned long addr, void *old)
> > +{
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	spinlock_t *ptl;
> > +	pmd_t pmde;
> > +	swp_entry_t entry;
> > +
> > +	pmde = *pmd;
> > +	barrier();
> > +
> > +	if (!pmd_present(pmde)) {
> > +		if (is_migration_entry(pmd_to_swp_entry(pmde))) {
> 
> 		if (!is_migration_entry(pmd_to_swp_entry(pmde)))
> 			return SWAP_AGAIN;
> 
> And one level less indentation below.

OK.

> > +			unsigned long mmun_start = addr & HPAGE_PMD_MASK;
> > +			unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
> > +
> > +			ptl = pmd_lock(mm, pmd);
> > +			entry = pmd_to_swp_entry(*pmd);
> > +			if (migration_entry_to_page(entry) != old)
> > +				goto unlock_ptl;
> > +			get_page(new);
> > +			pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
> > +			if (is_write_migration_entry(entry))
> > +				pmde = maybe_pmd_mkwrite(pmde, vma);
> > +			flush_cache_range(vma, mmun_start, mmun_end);
> > +			page_add_anon_rmap(new, vma, mmun_start, true);
> > +			pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
> > +			set_pmd_at(mm, mmun_start, pmd, pmde);
> > +			flush_tlb_range(vma, mmun_start, mmun_end);
> > +			if (vma->vm_flags & VM_LOCKED)
> > +				mlock_vma_page(new);
> > +			update_mmu_cache_pmd(vma, addr, pmd);
> > +unlock_ptl:
> > +			spin_unlock(ptl);
> 
> 			return SWAP_AGAIN;
> 
> And one level less indentation below.
> 
> > +		}
> > +	} else { /* pte-mapped thp */
> > +		pte_t *ptep;
> > +		pte_t pte;
> > +		int i;
> > +		struct page *tmpnew = compound_head(new);
> > +		struct page *tmpold = compound_head((struct page *)old);
> > +		unsigned long address = addr & HPAGE_PMD_MASK;
> > +
> > +		ptep = pte_offset_map(pmd, addr);
> > +		ptl = pte_lockptr(mm, pmd);
> > +		spin_lock(ptl);
> 
> pte_offset_map_lock() ?
> 
> > +
> > +		for (i = 0; i < HPAGE_PMD_NR;
> > +		     i++, ptep++, tmpnew++, tmpold++, address += PAGE_SIZE) {
> > +			pte = *ptep;
> > +			if (!is_swap_pte(pte))
> > +				continue;
> > +			entry = pte_to_swp_entry(pte);
> > +			if (!is_migration_entry(entry) ||
> > +			    migration_entry_to_page(entry) != tmpold)
> > +				continue;
> > +			get_page(tmpnew);
> > +			pte = pte_mkold(mk_pte(tmpnew,
> > +					       READ_ONCE(vma->vm_page_prot)));
> 
> READ_ONCE()? Do we get here under mmap_sem, right?

Some callers of page migration (mbind, move_pages, migrate_pages, cpuset)
do take mmap_sem, but others (memory hotremove, soft offline) don't.
For this part, I borrowed some code from remove_migration_pte(), which was
updated by the following commit:

  commit 6d2329f8872f23e46a19d240930571510ce525eb
  Author: Andrea Arcangeli <aarcange@...hat.com>
  Date:   Fri Oct 7 17:01:22 2016 -0700
  
      mm: vm_page_prot: update with WRITE_ONCE/READ_ONCE


Thank you for reviewing in detail!

Naoya Horiguchi

> > +			if (pte_swp_soft_dirty(*ptep))
> > +				pte = pte_mksoft_dirty(pte);
> > +			if (is_write_migration_entry(entry))
> > +				pte = maybe_mkwrite(pte, vma);
> > +			flush_dcache_page(tmpnew);
> > +			set_pte_at(mm, address, ptep, pte);
> > +			if (PageAnon(new))
> > +				page_add_anon_rmap(tmpnew, vma, address, false);
> > +			else
> > +				page_add_file_rmap(tmpnew, false);
> > +			update_mmu_cache(vma, address, ptep);
> > +		}
> > +		pte_unmap_unlock(ptep, ptl);
> > +	}
> > +	return SWAP_AGAIN;
> > +}
> > +#endif
> > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/migrate.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/migrate.c
> > index 66ce6b4..54f2eb6 100644
> > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/migrate.c
> > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/migrate.c
> > @@ -198,6 +198,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
> >  {
> >  	struct mm_struct *mm = vma->vm_mm;
> >  	swp_entry_t entry;
> > +	pgd_t *pgd;
> > +	pud_t *pud;
> >   	pmd_t *pmd;
> >  	pte_t *ptep, pte;
> >   	spinlock_t *ptl;
> > @@ -208,10 +210,29 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
> >  			goto out;
> >  		ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
> >  	} else {
> > -		pmd = mm_find_pmd(mm, addr);
> > +		pmd_t pmde;
> > +
> > +		pgd = pgd_offset(mm, addr);
> > +		if (!pgd_present(*pgd))
> > +			goto out;
> > +		pud = pud_offset(pgd, addr);
> > +		if (!pud_present(*pud))
> > +			goto out;
> > +		pmd = pmd_offset(pud, addr);
> >  		if (!pmd)
> >  			goto out;
> >  
> > +		if (PageTransCompound(new)) {
> > +			remove_migration_pmd(new, pmd, vma, addr, old);
> > +			goto out;
> > +		}
> > +
> > +		pmde = *pmd;
> > +		barrier();
> > +
> > +		if (!pmd_present(pmde) || pmd_trans_huge(pmde))
> > +			goto out;
> > +
> >  		ptep = pte_offset_map(pmd, addr);
> >  
> >  		/*
> > @@ -344,6 +365,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
> >  	__migration_entry_wait(mm, pte, ptl);
> >  }
> >  
> > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
> > +{
> > +	spinlock_t *ptl;
> > +	struct page *page;
> > +
> > +	ptl = pmd_lock(mm, pmd);
> > +	if (!is_pmd_migration_entry(*pmd))
> > +		goto unlock;
> > +	page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
> > +	if (!get_page_unless_zero(page))
> > +		goto unlock;
> > +	spin_unlock(ptl);
> > +	wait_on_page_locked(page);
> > +	put_page(page);
> > +	return;
> > +unlock:
> > +	spin_unlock(ptl);
> > +}
> > +#endif
> > +
> >  #ifdef CONFIG_BLOCK
> >  /* Returns true if all buffers are successfully locked */
> >  static bool buffer_migrate_lock_buffers(struct buffer_head *head,
> > diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/pgtable-generic.c v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/pgtable-generic.c
> > index 71c5f91..6012343 100644
> > --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/pgtable-generic.c
> > +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/pgtable-generic.c
> > @@ -118,7 +118,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
> >  {
> >  	pmd_t pmd;
> >  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> > -	VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> > +	VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
> > +		  !pmd_devmap(*pmdp));
> >  	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
> >  	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> >  	return pmd;
> > -- 
> > 2.7.0
> > 
> 
> -- 
>  Kirill A. Shutemov
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ