Message-ID: <20170209091616.GA15890@hori1.linux.bs1.fc.nec.co.jp>
Date:   Thu, 9 Feb 2017 09:16:16 +0000
From:   Naoya Horiguchi <n-horiguchi@...jp.nec.com>
To:     Zi Yan <zi.yan@...t.com>
CC:     "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "linux-mm@...ck.org" <linux-mm@...ck.org>,
        "kirill.shutemov@...ux.intel.com" <kirill.shutemov@...ux.intel.com>,
        "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
        "minchan@...nel.org" <minchan@...nel.org>,
        "vbabka@...e.cz" <vbabka@...e.cz>,
        "mgorman@...hsingularity.net" <mgorman@...hsingularity.net>,
        "khandual@...ux.vnet.ibm.com" <khandual@...ux.vnet.ibm.com>,
        "zi.yan@...rutgers.edu" <zi.yan@...rutgers.edu>
Subject: Re: [PATCH v3 09/14] mm: thp: check pmd migration entry in common
 path

On Sun, Feb 05, 2017 at 11:12:47AM -0500, Zi Yan wrote:
> From: Naoya Horiguchi <n-horiguchi@...jp.nec.com>
> 
> If any of the callers of page migration starts to handle thp,
> the memory management code will start to see pmd migration entries,
> so we need to prepare for that before enabling it. This patch changes
> the various code points that check the status of a given pmd, in order
> to prevent races between thp migration and other pmd-related work.
> 
> ChangeLog v1 -> v2:
> - introduce pmd_related() (I know the naming is not good, but can't
>   think of a better name. Any suggestion is welcome.)
> 
> Signed-off-by: Naoya Horiguchi <n-horiguchi@...jp.nec.com>
> 
> ChangeLog v2 -> v3:
> - add is_swap_pmd()
> - a pmd entry should be is_swap_pmd(), pmd_trans_huge(), pmd_devmap(),
>   or pmd_none()

(nitpick) ... or a normal pmd pointing to pte pages?
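
To spell out what I mean (just an illustration of my understanding, not a
suggested change): with this series a pmd seen by a walker can be in one of
four states, roughly

	if (pmd_none(pmd))
		;	/* nothing mapped */
	else if (is_swap_pmd(pmd))
		;	/* non-present, e.g. a pmd migration entry */
	else if (pmd_trans_huge(pmd) || pmd_devmap(pmd))
		;	/* huge (or devmap) mapping */
	else
		;	/* normal pmd pointing to a pte page */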

> - use pmdp_huge_clear_flush() instead of pmdp_huge_get_and_clear()
> - flush_cache_range() while set_pmd_migration_entry()
> - pmd_none_or_trans_huge_or_clear_bad() and pmd_trans_unstable() return
>   true on pmd_migration_entry, so that migration entries are not
>   treated as pmd page table entries.
> 
> Signed-off-by: Zi Yan <zi.yan@...rutgers.edu>
> ---
>  arch/x86/mm/gup.c             |  4 +--
>  fs/proc/task_mmu.c            | 22 ++++++++-----
>  include/asm-generic/pgtable.h | 71 ----------------------------------------
>  include/linux/huge_mm.h       | 21 ++++++++++--
>  include/linux/swapops.h       | 74 +++++++++++++++++++++++++++++++++++++++++
>  mm/gup.c                      | 20 ++++++++++--
>  mm/huge_memory.c              | 76 ++++++++++++++++++++++++++++++++++++-------
>  mm/madvise.c                  |  2 ++
>  mm/memcontrol.c               |  2 ++
>  mm/memory.c                   |  9 +++--
>  mm/memory_hotplug.c           | 13 +++++++-
>  mm/mempolicy.c                |  1 +
>  mm/mprotect.c                 |  6 ++--
>  mm/mremap.c                   |  2 +-
>  mm/pagewalk.c                 |  2 ++
>  15 files changed, 221 insertions(+), 104 deletions(-)
> 
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index 0d4fb3ebbbac..78a153d90064 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -222,9 +222,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  		pmd_t pmd = *pmdp;
>  
>  		next = pmd_addr_end(addr, end);
> -		if (pmd_none(pmd))
> +		if (!pmd_present(pmd))
>  			return 0;
> -		if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
> +		if (unlikely(pmd_large(pmd))) {
>  			/*
>  			 * NUMA hinting faults need to be handled in the GUP
>  			 * slowpath for accounting purposes and so that they
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 6c07c7813b26..1e64d6898c68 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -596,7 +596,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  
>  	ptl = pmd_trans_huge_lock(pmd, vma);
>  	if (ptl) {
> -		smaps_pmd_entry(pmd, addr, walk);
> +		if (pmd_present(*pmd))
> +			smaps_pmd_entry(pmd, addr, walk);
>  		spin_unlock(ptl);
>  		return 0;
>  	}
> @@ -929,6 +930,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
>  			goto out;
>  		}
>  
> +		if (!pmd_present(*pmd))
> +			goto out;
> +
>  		page = pmd_page(*pmd);
>  
>  		/* Clear accessed and referenced bits. */
> @@ -1208,19 +1212,19 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
>  	if (ptl) {
>  		u64 flags = 0, frame = 0;
>  		pmd_t pmd = *pmdp;
> +		struct page *page;
>  
>  		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
>  			flags |= PM_SOFT_DIRTY;
>  
> -		/*
> -		 * Currently pmd for thp is always present because thp
> -		 * can not be swapped-out, migrated, or HWPOISONed
> -		 * (split in such cases instead.)
> -		 * This if-check is just to prepare for future implementation.
> -		 */
> -		if (pmd_present(pmd)) {
> -			struct page *page = pmd_page(pmd);
> +		if (is_pmd_migration_entry(pmd)) {
> +			swp_entry_t entry = pmd_to_swp_entry(pmd);
>  
> +			frame = swp_type(entry) |
> +				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
> +			page = migration_entry_to_page(entry);
> +		} else if (pmd_present(pmd)) {
> +			page = pmd_page(pmd);
>  			if (page_mapcount(page) == 1)
>  				flags |= PM_MMAP_EXCLUSIVE;
>  
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index b71a431ed649..6cf9e9b5a7be 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -726,77 +726,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
>  #ifndef arch_needs_pgtable_deposit
>  #define arch_needs_pgtable_deposit() (false)
>  #endif
> -/*
> - * This function is meant to be used by sites walking pagetables with
> - * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
> - * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
> - * into a null pmd and the transhuge page fault can convert a null pmd
> - * into an hugepmd or into a regular pmd (if the hugepage allocation
> - * fails). While holding the mmap_sem in read mode the pmd becomes
> - * stable and stops changing under us only if it's not null and not a
> - * transhuge pmd. When those races occurs and this function makes a
> - * difference vs the standard pmd_none_or_clear_bad, the result is
> - * undefined so behaving like if the pmd was none is safe (because it
> - * can return none anyway). The compiler level barrier() is critically
> - * important to compute the two checks atomically on the same pmdval.
> - *
> - * For 32bit kernels with a 64bit large pmd_t this automatically takes
> - * care of reading the pmd atomically to avoid SMP race conditions
> - * against pmd_populate() when the mmap_sem is hold for reading by the
> - * caller (a special atomic read not done by "gcc" as in the generic
> - * version above, is also needed when THP is disabled because the page
> - * fault can populate the pmd from under us).
> - */
> -static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
> -{
> -	pmd_t pmdval = pmd_read_atomic(pmd);
> -	/*
> -	 * The barrier will stabilize the pmdval in a register or on
> -	 * the stack so that it will stop changing under the code.
> -	 *
> -	 * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
> -	 * pmd_read_atomic is allowed to return a not atomic pmdval
> -	 * (for example pointing to an hugepage that has never been
> -	 * mapped in the pmd). The below checks will only care about
> -	 * the low part of the pmd with 32bit PAE x86 anyway, with the
> -	 * exception of pmd_none(). So the important thing is that if
> -	 * the low part of the pmd is found null, the high part will
> -	 * be also null or the pmd_none() check below would be
> -	 * confused.
> -	 */
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	barrier();
> -#endif
> -	if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
> -		return 1;
> -	if (unlikely(pmd_bad(pmdval))) {
> -		pmd_clear_bad(pmd);
> -		return 1;
> -	}
> -	return 0;
> -}
> -
> -/*
> - * This is a noop if Transparent Hugepage Support is not built into
> - * the kernel. Otherwise it is equivalent to
> - * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
> - * places that already verified the pmd is not none and they want to
> - * walk ptes while holding the mmap sem in read mode (write mode don't
> - * need this). If THP is not enabled, the pmd can't go away under the
> - * code even if MADV_DONTNEED runs, but if THP is enabled we need to
> - * run a pmd_trans_unstable before walking the ptes after
> - * split_huge_page_pmd returns (because it may have run when the pmd
> - * become null, but then a page fault can map in a THP and not a
> - * regular page).
> - */
> -static inline int pmd_trans_unstable(pmd_t *pmd)
> -{
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	return pmd_none_or_trans_huge_or_clear_bad(pmd);
> -#else
> -	return 0;
> -#endif
> -}
>  
>  #ifndef CONFIG_NUMA_BALANCING
>  /*
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 83a8d42f9d55..c2e5a4eab84a 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -131,7 +131,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  #define split_huge_pmd(__vma, __pmd, __address)				\
>  	do {								\
>  		pmd_t *____pmd = (__pmd);				\
> -		if (pmd_trans_huge(*____pmd)				\
> +		if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)	\
>  					|| pmd_devmap(*____pmd))	\
>  			__split_huge_pmd(__vma, __pmd, __address,	\
>  						false, NULL);		\
> @@ -162,12 +162,18 @@ extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd,
>  		struct vm_area_struct *vma);
>  extern spinlock_t *__pud_trans_huge_lock(pud_t *pud,
>  		struct vm_area_struct *vma);
> +
> +static inline int is_swap_pmd(pmd_t pmd)
> +{
> +	return !pmd_none(pmd) && !pmd_present(pmd);
> +}
> +
>  /* mmap_sem must be held on entry */
>  static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
>  		struct vm_area_struct *vma)
>  {
>  	VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma);
> -	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
> +	if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
>  		return __pmd_trans_huge_lock(pmd, vma);
>  	else
>  		return NULL;
> @@ -192,6 +198,12 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
>  		pmd_t *pmd, int flags);
>  struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
>  		pud_t *pud, int flags);
> +static inline int hpage_order(struct page *page)
> +{
> +	if (unlikely(PageTransHuge(page)))
> +		return HPAGE_PMD_ORDER;
> +	return 0;
> +}
>  
>  extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
>  
> @@ -232,6 +244,7 @@ static inline bool thp_migration_supported(void)
>  #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
>  
>  #define hpage_nr_pages(x) 1
> +#define hpage_order(x) 0
>  
>  #define transparent_hugepage_enabled(__vma) 0
>  
> @@ -274,6 +287,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
>  					 long adjust_next)
>  {
>  }
> +static inline int is_swap_pmd(pmd_t pmd)
> +{
> +	return 0;
> +}
>  static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
>  		struct vm_area_struct *vma)
>  {
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 6625bea13869..50e4aa7e7ff9 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -229,6 +229,80 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>  }
>  #endif
>  
> +/*
> + * This function is meant to be used by sites walking pagetables with
> + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
> + * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd
> + * into a null pmd and the transhuge page fault can convert a null pmd
> + * into an hugepmd or into a regular pmd (if the hugepage allocation
> + * fails). While holding the mmap_sem in read mode the pmd becomes
> + * stable and stops changing under us only if it's not null and not a
> + * transhuge pmd. When those races occurs and this function makes a
> + * difference vs the standard pmd_none_or_clear_bad, the result is
> + * undefined so behaving like if the pmd was none is safe (because it
> + * can return none anyway). The compiler level barrier() is critically
> + * important to compute the two checks atomically on the same pmdval.
> + *
> + * For 32bit kernels with a 64bit large pmd_t this automatically takes
> + * care of reading the pmd atomically to avoid SMP race conditions
> + * against pmd_populate() when the mmap_sem is hold for reading by the
> + * caller (a special atomic read not done by "gcc" as in the generic
> + * version above, is also needed when THP is disabled because the page
> + * fault can populate the pmd from under us).
> + */
> +static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
> +{
> +	pmd_t pmdval = pmd_read_atomic(pmd);
> +	/*
> +	 * The barrier will stabilize the pmdval in a register or on
> +	 * the stack so that it will stop changing under the code.
> +	 *
> +	 * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
> +	 * pmd_read_atomic is allowed to return a not atomic pmdval
> +	 * (for example pointing to an hugepage that has never been
> +	 * mapped in the pmd). The below checks will only care about
> +	 * the low part of the pmd with 32bit PAE x86 anyway, with the
> +	 * exception of pmd_none(). So the important thing is that if
> +	 * the low part of the pmd is found null, the high part will
> +	 * be also null or the pmd_none() check below would be
> +	 * confused.
> +	 */
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	barrier();
> +#endif
> +	if (pmd_none(pmdval) || pmd_trans_huge(pmdval)
> +			|| is_pmd_migration_entry(pmdval))
> +		return 1;
> +	if (unlikely(pmd_bad(pmdval))) {
> +		pmd_clear_bad(pmd);
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * This is a noop if Transparent Hugepage Support is not built into
> + * the kernel. Otherwise it is equivalent to
> + * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in
> + * places that already verified the pmd is not none and they want to
> + * walk ptes while holding the mmap sem in read mode (write mode don't
> + * need this). If THP is not enabled, the pmd can't go away under the
> + * code even if MADV_DONTNEED runs, but if THP is enabled we need to
> + * run a pmd_trans_unstable before walking the ptes after
> + * split_huge_page_pmd returns (because it may have run when the pmd
> + * become null, but then a page fault can map in a THP and not a
> + * regular page).
> + */
> +static inline int pmd_trans_unstable(pmd_t *pmd)
> +{
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	return pmd_none_or_trans_huge_or_clear_bad(pmd);
> +#else
> +	return 0;
> +#endif
> +}
> +
> +

These functions are a page table / thp matter, so putting them in swapops.h
looks weird to me. Maybe you can avoid moving this code by using !pmd_present
instead of is_pmd_migration_entry?
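
Something like the below (completely untested, just to show what I have in
mind) would let the helper stay in asm-generic/pgtable.h without pulling in
swapops.h:

static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
{
	pmd_t pmdval = pmd_read_atomic(pmd);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	barrier();
#endif
	/*
	 * !pmd_present() also covers pmd migration entries, so there is
	 * no need for is_pmd_migration_entry() (and swapops.h) here.
	 */
	if (pmd_none(pmdval) || pmd_trans_huge(pmdval) ||
			!pmd_present(pmdval))
		return 1;
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		return 1;
	}
	return 0;
}
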
And we should consider renaming pmd_none_or_trans_huge_or_clear_bad();
I like a simple name like __pmd_trans_unstable(), but if you have a better
idea, that's great.

>  #ifdef CONFIG_MEMORY_FAILURE
>  
>  extern atomic_long_t num_poisoned_pages __read_mostly;
> diff --git a/mm/gup.c b/mm/gup.c
> index 1e67461b2733..82e0304e5d29 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -274,6 +274,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
>  	}
>  	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
>  		return no_page_table(vma, flags);
> +	if (!pmd_present(*pmd)) {
> +retry:
> +		if (likely(!(flags & FOLL_MIGRATION)))
> +			return no_page_table(vma, flags);
> +		pmd_migration_entry_wait(mm, pmd);
> +		goto retry;
> +	}
>  	if (pmd_devmap(*pmd)) {
>  		ptl = pmd_lock(mm, pmd);
>  		page = follow_devmap_pmd(vma, address, pmd, flags);
> @@ -285,6 +292,15 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
>  		return follow_page_pte(vma, address, pmd, flags);
>  
>  	ptl = pmd_lock(mm, pmd);
> +	if (unlikely(!pmd_present(*pmd))) {
> +retry_locked:
> +		if (likely(!(flags & FOLL_MIGRATION))) {
> +			spin_unlock(ptl);
> +			return no_page_table(vma, flags);
> +		}
> +		pmd_migration_entry_wait(mm, pmd);
> +		goto retry_locked;
> +	}
>  	if (unlikely(!pmd_trans_huge(*pmd))) {
>  		spin_unlock(ptl);
>  		return follow_page_pte(vma, address, pmd, flags);
> @@ -340,7 +356,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
>  	pud = pud_offset(pgd, address);
>  	BUG_ON(pud_none(*pud));
>  	pmd = pmd_offset(pud, address);
> -	if (pmd_none(*pmd))
> +	if (!pmd_present(*pmd))
>  		return -EFAULT;
>  	VM_BUG_ON(pmd_trans_huge(*pmd));
>  	pte = pte_offset_map(pmd, address);
> @@ -1368,7 +1384,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  		pmd_t pmd = READ_ONCE(*pmdp);
>  
>  		next = pmd_addr_end(addr, end);
> -		if (pmd_none(pmd))
> +		if (!pmd_present(pmd))
>  			return 0;
>  
>  		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index fd54bbdc16cf..4ac923539372 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -897,6 +897,21 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  
>  	ret = -EAGAIN;
>  	pmd = *src_pmd;
> +
> +	if (unlikely(is_pmd_migration_entry(pmd))) {
> +		swp_entry_t entry = pmd_to_swp_entry(pmd);
> +
> +		if (is_write_migration_entry(entry)) {
> +			make_migration_entry_read(&entry);
> +			pmd = swp_entry_to_pmd(entry);
> +			set_pmd_at(src_mm, addr, src_pmd, pmd);
> +		}
> +		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
> +		ret = 0;
> +		goto out_unlock;
> +	}
> +	WARN_ONCE(!pmd_present(pmd), "Unknown non-present format on pmd.\n");
> +
>  	if (unlikely(!pmd_trans_huge(pmd))) {
>  		pte_free(dst_mm, pgtable);
>  		goto out_unlock;
> @@ -1203,6 +1218,9 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
>  	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
>  		goto out_unlock;
>  
> +	if (unlikely(!pmd_present(orig_pmd)))
> +		goto out_unlock;
> +
>  	page = pmd_page(orig_pmd);
>  	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
>  	/*
> @@ -1337,7 +1355,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>  	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
>  		goto out;
>  
> -	page = pmd_page(*pmd);
> +	if (is_pmd_migration_entry(*pmd)) {
> +		swp_entry_t entry;
> +
> +		entry = pmd_to_swp_entry(*pmd);
> +		page = pfn_to_page(swp_offset(entry));
> +		if (!is_migration_entry(entry))
> +			goto out;
> +	} else
> +		page = pmd_page(*pmd);
>  	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
>  	if (flags & FOLL_TOUCH)
>  		touch_pmd(vma, addr, pmd);
> @@ -1533,6 +1559,9 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  	if (is_huge_zero_pmd(orig_pmd))
>  		goto out;
>  
> +	if (unlikely(!pmd_present(orig_pmd)))
> +		goto out;
> +
>  	page = pmd_page(orig_pmd);
>  	/*
>  	 * If other processes are mapping this page, we couldn't discard
> @@ -1659,7 +1688,8 @@ int __zap_huge_pmd_locked(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  			free_swap_and_cache(entry); /* waring in failure? */
>  			migration = 1;
>  		}
> -		tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
> +		if (!migration)
> +			tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
>  	}
>  
>  	return 1;
> @@ -1775,10 +1805,22 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  		 * data is likely to be read-cached on the local CPU and
>  		 * local/remote hits to the zero page are not interesting.
>  		 */
> -		if (prot_numa && is_huge_zero_pmd(*pmd)) {
> -			spin_unlock(ptl);
> -			return ret;
> -		}
> +		if (prot_numa && is_huge_zero_pmd(*pmd))
> +			goto unlock;
> +
> +		if (is_pmd_migration_entry(*pmd)) {
> +			swp_entry_t entry = pmd_to_swp_entry(*pmd);
> +
> +			if (is_write_migration_entry(entry)) {
> +				pmd_t newpmd;
> +
> +				make_migration_entry_read(&entry);
> +				newpmd = swp_entry_to_pmd(entry);
> +				set_pmd_at(mm, addr, pmd, newpmd);
> +			}
> +			goto unlock;
> +		} else if (!pmd_present(*pmd))
> +			WARN_ONCE(1, "Unknown non-present format on pmd.\n");
>  
>  		if (!prot_numa || !pmd_protnone(*pmd)) {
>  			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
> @@ -1790,6 +1832,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  			BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
>  					pmd_write(entry));
>  		}
> +unlock:
>  		spin_unlock(ptl);
>  	}
>  
> @@ -1806,7 +1849,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
>  {
>  	spinlock_t *ptl;
>  	ptl = pmd_lock(vma->vm_mm, pmd);
> -	if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
> +	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
> +			pmd_devmap(*pmd)))
>  		return ptl;
>  	spin_unlock(ptl);
>  	return NULL;
> @@ -1924,7 +1968,7 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	struct page *page;
>  	pgtable_t pgtable;
>  	pmd_t _pmd;
> -	bool young, write, dirty, soft_dirty;
> +	bool young, write, dirty, soft_dirty, pmd_migration;
>  	unsigned long addr;
>  	int i;
>  	unsigned long haddr = address & HPAGE_PMD_MASK;
> @@ -1932,7 +1976,8 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> -	VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
> +	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
> +				&& !pmd_devmap(*pmd));
>  
>  	count_vm_event(THP_SPLIT_PMD);
>  
> @@ -1960,7 +2005,14 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		goto out;
>  	}
>  
> -	page = pmd_page(*pmd);
> +	pmd_migration = is_pmd_migration_entry(*pmd);
> +	if (pmd_migration) {
> +		swp_entry_t entry;
> +
> +		entry = pmd_to_swp_entry(*pmd);
> +		page = pfn_to_page(swp_offset(entry));
> +	} else
> +		page = pmd_page(*pmd);
>  	VM_BUG_ON_PAGE(!page_count(page), page);
>  	page_ref_add(page, HPAGE_PMD_NR - 1);
>  	write = pmd_write(*pmd);
> @@ -1979,7 +2031,7 @@ void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		 * transferred to avoid any possibility of altering
>  		 * permissions across VMAs.
>  		 */
> -		if (freeze) {
> +		if (freeze || pmd_migration) {
>  			swp_entry_t swp_entry;
>  			swp_entry = make_migration_entry(page + i, write);
>  			entry = swp_entry_to_pte(swp_entry);
> @@ -2077,7 +2129,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  		page = pmd_page(*pmd);
>  		if (PageMlocked(page))
>  			clear_page_mlock(page);
> -	} else if (!pmd_devmap(*pmd))
> +	} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
>  		goto out;
>  	__split_huge_pmd_locked(vma, pmd, address, freeze);
>  out:
> diff --git a/mm/madvise.c b/mm/madvise.c
> index e424a06e9f2b..0497a502351f 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -310,6 +310,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  	unsigned long next;
>  
>  	next = pmd_addr_end(addr, end);
> +	if (!pmd_present(*pmd))
> +		return 0;
>  	if (pmd_trans_huge(*pmd))
>  		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
>  			goto next;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 44fb1e80701a..09bce3f0d622 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4633,6 +4633,8 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
>  	struct page *page = NULL;
>  	enum mc_target_type ret = MC_TARGET_NONE;
>  
> +	if (unlikely(!pmd_present(pmd)))
> +		return ret;
>  	page = pmd_page(pmd);
>  	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
>  	if (!(mc.flags & MOVE_ANON))
> diff --git a/mm/memory.c b/mm/memory.c
> index 7cfdd5208ef5..bf10b19e02d3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -999,7 +999,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
>  	src_pmd = pmd_offset(src_pud, addr);
>  	do {
>  		next = pmd_addr_end(addr, end);
> -		if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
> +		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
> +			|| pmd_devmap(*src_pmd)) {
>  			int err;
>  			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
>  			err = copy_huge_pmd(dst_mm, src_mm,
> @@ -1240,7 +1241,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
>  	ptl = pmd_lock(vma->vm_mm, pmd);
>  	do {
>  		next = pmd_addr_end(addr, end);
> -		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
> +		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
>  			if (next - addr != HPAGE_PMD_SIZE) {
>  				VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
>  				    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
> @@ -3697,6 +3698,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
>  		pmd_t orig_pmd = *vmf.pmd;
>  
>  		barrier();
> +		if (unlikely(is_pmd_migration_entry(orig_pmd))) {
> +			pmd_migration_entry_wait(mm, vmf.pmd);
> +			return 0;
> +		}
>  		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
>  			vmf.flags |= FAULT_FLAG_SIZE_PMD;
>  			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 19b460acb5e1..9cb4c83151a8 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c

Shouldn't the changes to mm/memory_hotplug.c be in patch 14/14?
# If that's right, the definition of hpage_order() should also go into 14/14.

Thanks,
Naoya Horiguchi

> @@ -1570,6 +1570,7 @@ static struct page *new_node_page(struct page *page, unsigned long private,
>  	int nid = page_to_nid(page);
>  	nodemask_t nmask = node_states[N_MEMORY];
>  	struct page *new_page = NULL;
> +	unsigned int order = 0;
>  
>  	/*
>  	 * TODO: allocate a destination hugepage from a nearest neighbor node,
> @@ -1580,6 +1581,11 @@ static struct page *new_node_page(struct page *page, unsigned long private,
>  		return alloc_huge_page_node(page_hstate(compound_head(page)),
>  					next_node_in(nid, nmask));
>  
> +	if (thp_migration_supported() && PageTransHuge(page)) {
> +		order = hpage_order(page);
> +		gfp_mask |= GFP_TRANSHUGE;
> +	}
> +
>  	node_clear(nid, nmask);
>  
>  	if (PageHighMem(page)
> @@ -1593,6 +1599,9 @@ static struct page *new_node_page(struct page *page, unsigned long private,
>  		new_page = __alloc_pages(gfp_mask, 0,
>  					node_zonelist(nid, gfp_mask));
>  
> +	if (new_page && order == hpage_order(page))
> +		prep_transhuge_page(new_page);
> +
>  	return new_page;
>  }
>  
> @@ -1622,7 +1631,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
>  			if (isolate_huge_page(page, &source))
>  				move_pages -= 1 << compound_order(head);
>  			continue;
> -		}
> +		} else if (thp_migration_supported() && PageTransHuge(page))
> +			pfn = page_to_pfn(compound_head(page))
> +				+ hpage_nr_pages(page) - 1;
>  
>  		if (!get_page_unless_zero(page))
>  			continue;
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 5cc6a99918ab..021ff13b9a7a 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -94,6 +94,7 @@
>  #include <linux/mm_inline.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/printk.h>
> +#include <linux/swapops.h>
>  
>  #include <asm/tlbflush.h>
>  #include <asm/uaccess.h>
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 98acf7d5cef2..bfbe66798a7a 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -150,7 +150,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
>  		unsigned long this_pages;
>  
>  		next = pmd_addr_end(addr, end);
> -		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
> +		if (!pmd_present(*pmd))
> +			continue;
> +		if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
>  				&& pmd_none_or_clear_bad(pmd))
>  			continue;
>  
> @@ -160,7 +162,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
>  			mmu_notifier_invalidate_range_start(mm, mni_start, end);
>  		}
>  
> -		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
> +		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
>  			if (next - addr != HPAGE_PMD_SIZE) {
>  				__split_huge_pmd(vma, pmd, addr, false, NULL);
>  				if (pmd_trans_unstable(pmd))
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 8233b0105c82..5d537ce12adc 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -213,7 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
>  		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
>  		if (!new_pmd)
>  			break;
> -		if (pmd_trans_huge(*old_pmd)) {
> +		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
>  			if (extent == HPAGE_PMD_SIZE) {
>  				bool moved;
>  				/* See comment in move_ptes() */
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index 03761577ae86..114fc2b5a370 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -2,6 +2,8 @@
>  #include <linux/highmem.h>
>  #include <linux/sched.h>
>  #include <linux/hugetlb.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
>  
>  static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  			  struct mm_walk *walk)
> -- 
> 2.11.0
> 
