linux-kernel - Re: [PATCHv5 25/28] thp: reintroduce split_huge

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <555B3000.6040805@suse.cz>
Date:	Tue, 19 May 2015 14:43:44 +0200
From:	Vlastimil Babka <vbabka@...e.cz>
To:	"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Andrea Arcangeli <aarcange@...hat.com>,
	Hugh Dickins <hughd@...gle.com>
CC:	Dave Hansen <dave.hansen@...el.com>, Mel Gorman <mgorman@...e.de>,
	Rik van Riel <riel@...hat.com>,
	Christoph Lameter <cl@...two.org>,
	Naoya Horiguchi <n-horiguchi@...jp.nec.com>,
	Steve Capper <steve.capper@...aro.org>,
	"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Michal Hocko <mhocko@...e.cz>,
	Jerome Marchand <jmarchan@...hat.com>,
	Sasha Levin <sasha.levin@...cle.com>,
	linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCHv5 25/28] thp: reintroduce split_huge_page()

On 04/23/2015 11:04 PM, Kirill A. Shutemov wrote:
> This patch adds implementation of split_huge_page() for new
> refcountings.
>
> Unlike the previous implementation, the new split_huge_page() can fail
> if somebody holds a GUP pin on the page. It also means that a pin on
> the page prevents it from being split under you. This makes the
> situation in many places much cleaner.
>
> The basic scheme of split_huge_page():
>
>    - Check that the sum of mapcounts of all subpages is equal to
>      page_count() minus one (the caller's pin). Otherwise bail out with
>      -EBUSY. This way we can avoid useless PMD-splits.
>
>    - Freeze the page counters by splitting all PMD and setup migration
>      PTEs.
>
>    - Re-check sum of mapcounts against page_count(). Page's counts are
>      stable now. -EBUSY if page is pinned.
>
>    - Split compound page.
>
>    - Unfreeze the page by removing migration entries.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
> Tested-by: Sasha Levin <sasha.levin@...cle.com>
> ---
>   include/linux/huge_mm.h |   7 +-
>   include/linux/pagemap.h |   9 +-
>   mm/huge_memory.c        | 322 ++++++++++++++++++++++++++++++++++++++++++++++++
>   mm/internal.h           |  26 +++-
>   mm/rmap.c               |  21 ----
>   5 files changed, 357 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index b7844c73b7db..3c0a50ed3eb8 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -92,8 +92,11 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
>
>   extern unsigned long transparent_hugepage_flags;
>
> -#define split_huge_page_to_list(page, list) BUILD_BUG()
> -#define split_huge_page(page) BUILD_BUG()
> +int split_huge_page_to_list(struct page *page, struct list_head *list);
> +static inline int split_huge_page(struct page *page)
> +{
> +	return split_huge_page_to_list(page, NULL);
> +}
>
>   void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>   		unsigned long address);
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 7c3790764795..ffbb23dbebba 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -387,10 +387,17 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
>    */
>   static inline pgoff_t page_to_pgoff(struct page *page)
>   {
> +	pgoff_t pgoff;
> +
>   	if (unlikely(PageHeadHuge(page)))
>   		return page->index << compound_order(page);
> -	else
> +
> +	if (likely(!PageTransTail(page)))
>   		return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> +
> +	pgoff = page->first_page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> +	pgoff += page - page->first_page;
> +	return pgoff;

This could use a comment, or maybe a separate preparatory patch?

>   }
>
>   /*
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2f9e2e882bab..7ad338ab2ac8 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2704,3 +2704,325 @@ void __vma_adjust_trans_huge(struct vm_area_struct *vma,
>   			split_huge_pmd_address(next, nstart);
>   	}
>   }
> +
> +static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
> +		unsigned long address)
> +{
> +	spinlock_t *ptl;
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +	int i;
> +
> +	pgd = pgd_offset(vma->vm_mm, address);
> +	if (!pgd_present(*pgd))
> +		return;
> +	pud = pud_offset(pgd, address);
> +	if (!pud_present(*pud))
> +		return;
> +	pmd = pmd_offset(pud, address);
> +	ptl = pmd_lock(vma->vm_mm, pmd);
> +	if (!pmd_present(*pmd)) {
> +		spin_unlock(ptl);
> +		return;
> +	}
> +	if (pmd_trans_huge(*pmd)) {
> +		if (page == pmd_page(*pmd))
> +			__split_huge_pmd_locked(vma, pmd, address, true);
> +		spin_unlock(ptl);
> +		return;
> +	}
> +	spin_unlock(ptl);
> +
> +	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
> +	for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
> +		pte_t entry, swp_pte;
> +		swp_entry_t swp_entry;
> +
> +		if (!pte_present(pte[i]))
> +			continue;
> +		if (page_to_pfn(page) != pte_pfn(pte[i]))
> +			continue;
> +		flush_cache_page(vma, address, page_to_pfn(page));
> +		entry = ptep_clear_flush(vma, address, pte + i);
> +		swp_entry = make_migration_entry(page, pte_write(entry));
> +		swp_pte = swp_entry_to_pte(swp_entry);
> +		if (pte_soft_dirty(entry))
> +			swp_pte = pte_swp_mksoft_dirty(swp_pte);
> +		set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
> +	}
> +	pte_unmap_unlock(pte, ptl);
> +}
> +
> +static void freeze_page(struct anon_vma *anon_vma, struct page *page)
> +{
> +	struct anon_vma_chain *avc;
> +	pgoff_t pgoff = page_to_pgoff(page);
> +
> +	VM_BUG_ON_PAGE(!PageHead(page), page);
> +
> +	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
> +			pgoff + HPAGE_PMD_NR - 1) {
> +		unsigned long haddr;
> +
> +		haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
> +		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
> +				haddr, haddr + HPAGE_PMD_SIZE);
> +		freeze_page_vma(avc->vma, page, haddr);
> +		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
> +				haddr, haddr + HPAGE_PMD_SIZE);
> +	}
> +}
> +
> +static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
> +		unsigned long address)
> +{
> +	spinlock_t *ptl;
> +	pmd_t *pmd;
> +	pte_t *pte, entry;
> +	swp_entry_t swp_entry;
> +
> +	pmd = mm_find_pmd(vma->vm_mm, address);
> +	if (!pmd)
> +		return;
> +	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
> +
> +	if (!is_swap_pte(*pte))
> +		goto unlock;
> +
> +	swp_entry = pte_to_swp_entry(*pte);
> +	if (!is_migration_entry(swp_entry) ||
> +			migration_entry_to_page(swp_entry) != page)
> +		goto unlock;
> +
> +	entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
> +	if (is_write_migration_entry(swp_entry))
> +		entry = maybe_mkwrite(entry, vma);
> +
> +	flush_dcache_page(page);
> +	set_pte_at(vma->vm_mm, address, pte, entry);
> +
> +	/* No need to invalidate - it was non-present before */
> +	update_mmu_cache(vma, address, pte);
> +unlock:
> +	pte_unmap_unlock(pte, ptl);
> +}
> +
> +static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
> +{
> +	struct anon_vma_chain *avc;
> +	pgoff_t pgoff = page_to_pgoff(page);
> +	int i;
> +
> +	for (i = 0; i < HPAGE_PMD_NR; i++, pgoff++, page++) {

In freeze_page() the per-subpage loop is the inner one (inside
freeze_page_vma()), so it can batch the work under a single ptl lock.
Why not here?

> +		if (!page_mapcount(page))
> +			continue;
> +
> +		anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
> +				pgoff, pgoff) {
> +			unsigned long address = vma_address(page, avc->vma);
> +
> +			mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
> +					address, address + PAGE_SIZE);
> +			unfreeze_page_vma(avc->vma, page, address);
> +			mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
> +					address, address + PAGE_SIZE);
> +		}
> +	}
> +}
> +
> +static int total_mapcount(struct page *page)
> +{
> +	int i, ret;
> +
> +	ret = compound_mapcount(page);
> +	for (i = 0; i < HPAGE_PMD_NR; i++)
> +		ret += atomic_read(&page[i]._mapcount) + 1;
> +
> +	/*
> +	 * Positive compound_mapcount() offsets ->_mapcount in every subpage by
> +	 * one. Let's substract it here.
> +	 */
> +	if (compound_mapcount(page))
> +		ret -= HPAGE_PMD_NR;
> +
> +	return ret;
> +}
> +
> +static int __split_huge_page_tail(struct page *head, int tail,
> +		struct lruvec *lruvec, struct list_head *list)
> +{
> +	int mapcount;
> +	struct page *page_tail = head + tail;
> +
> +	mapcount = page_mapcount(page_tail);
> +	BUG_ON(atomic_read(&page_tail->_count) != 0);

VM_BUG_ON?

> +
> +	/*
> +	 * tail_page->_count is zero and not changing from under us. But
> +	 * get_page_unless_zero() may be running from under us on the
> +	 * tail_page. If we used atomic_set() below instead of atomic_add(), we
> +	 * would then run atomic_set() concurrently with
> +	 * get_page_unless_zero(), and atomic_set() is implemented in C not
> +	 * using locked ops. spin_unlock on x86 sometime uses locked ops
> +	 * because of PPro errata 66, 92, so unless somebody can guarantee
> +	 * atomic_set() here would be safe on all archs (and not only on x86),
> +	 * it's safer to use atomic_add().
> +	 */
> +	atomic_add(page_mapcount(page_tail) + 1, &page_tail->_count);
> +
> +	/* after clearing PageTail the gup refcount can be released */
> +	smp_mb__after_atomic();
> +
> +	/*
> +	 * retain hwpoison flag of the poisoned tail page:
> +	 *   fix for the unsuitable process killed on Guest Machine(KVM)
> +	 *   by the memory-failure.
> +	 */
> +	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
> +	page_tail->flags |= (head->flags &
> +			((1L << PG_referenced) |
> +			 (1L << PG_swapbacked) |
> +			 (1L << PG_mlocked) |
> +			 (1L << PG_uptodate) |
> +			 (1L << PG_active) |
> +			 (1L << PG_locked) |
> +			 (1L << PG_unevictable)));
> +	page_tail->flags |= (1L << PG_dirty);
> +
> +	/* clear PageTail before overwriting first_page */
> +	smp_wmb();
> +
> +	/* ->mapping in first tail page is compound_mapcount */
> +	BUG_ON(tail != 1 && page_tail->mapping != TAIL_MAPPING);

VM_BUG_ON?

> +	page_tail->mapping = head->mapping;
> +
> +	page_tail->index = head->index + tail;
> +	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
> +	lru_add_page_tail(head, page_tail, lruvec, list);
> +
> +	return mapcount;
> +}
> +
> +static void __split_huge_page(struct page *page, struct list_head *list)
> +{
> +	struct page *head = compound_head(page);
> +	struct zone *zone = page_zone(head);
> +	struct lruvec *lruvec;
> +	int i, tail_mapcount;
> +
> +	/* prevent PageLRU to go away from under us, and freeze lru stats */
> +	spin_lock_irq(&zone->lru_lock);
> +	lruvec = mem_cgroup_page_lruvec(head, zone);
> +
> +	/* complete memcg works before add pages to LRU */
> +	mem_cgroup_split_huge_fixup(head);
> +
> +	tail_mapcount = 0;
> +	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
> +		tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
> +	atomic_sub(tail_mapcount, &head->_count);
> +
> +	ClearPageCompound(head);
> +	spin_unlock_irq(&zone->lru_lock);
> +
> +	unfreeze_page(page_anon_vma(head), head);
> +
> +	for (i = 0; i < HPAGE_PMD_NR; i++) {
> +		struct page *subpage = head + i;
> +		if (subpage == page)
> +			continue;
> +		unlock_page(subpage);
> +
> +		/*
> +		 * Subpages may be freed if there wasn't any mapping
> +		 * like if add_to_swap() is running on a lru page that
> +		 * had its mapping zapped. And freeing these pages
> +		 * requires taking the lru_lock so we do the put_page
> +		 * of the tail pages after the split is complete.
> +		 */
> +		put_page(subpage);
> +	}
> +}
> +
> +/*
> + * This function splits huge page into normal pages. @page can point to any
> + * subpage of huge page to split. Split doesn't change the position of @page.
> + *
> + * Only the caller may hold a pin on @page, otherwise split fails with -EBUSY.
> + * The huge page must be locked.
> + *
> + * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
> + *
> + * Both head page and tail pages will inherit mapping, flags, and so on from
> + * the hugepage.
> + *
> + * GUP pin and PG_locked are transferred to @page. The remaining subpages
> + * can be freed if they are not mapped.
> + *
> + * Returns 0 if the hugepage is split successfully.
> + * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
> + * us.
> + */
> +int split_huge_page_to_list(struct page *page, struct list_head *list)
> +{
> +	struct page *head = compound_head(page);
> +	struct anon_vma *anon_vma;
> +	int mapcount, ret;
> +
> +	BUG_ON(is_huge_zero_page(page));
> +	BUG_ON(!PageAnon(page));
> +	BUG_ON(!PageLocked(page));
> +	BUG_ON(!PageSwapBacked(page));
> +	BUG_ON(!PageCompound(page));

VM_BUG_ONs?

> +
> +	/*
> +	 * The caller does not necessarily hold an mmap_sem that would prevent
> +	 * the anon_vma disappearing so we first we take a reference to it
> +	 * and then lock the anon_vma for write. This is similar to
> +	 * page_lock_anon_vma_read except the write lock is taken to serialise
> +	 * against parallel split or collapse operations.
> +	 */
> +	anon_vma = page_get_anon_vma(head);
> +	if (!anon_vma) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	anon_vma_lock_write(anon_vma);
> +
> +	/*
> +	 * Racy check if we can split the page, before freeze_page() will
> +	 * split PMDs
> +	 */
> +	if (total_mapcount(head) != page_count(head) - 1) {
> +		ret = -EBUSY;
> +		goto out_unlock;
> +	}
> +
> +	freeze_page(anon_vma, head);
> +	VM_BUG_ON_PAGE(compound_mapcount(head), head);
> +
> +	mapcount = total_mapcount(head);
> +	if (mapcount == page_count(head) - 1) {
> +		__split_huge_page(page, list);
> +		ret = 0;
> +	} else if (mapcount > page_count(page) - 1) {

It's confusing to use page_count(head) in one test and page_count(page)
in the other, although I know they should be the same. Also, what if you
read a different value because something broke?

> +		pr_alert("total_mapcount: %u, page_count(): %u\n",
> +				mapcount, page_count(page));

Here you read page_count(page) again, although it could have changed in
the meantime (we are already on a path where something went wrong), so
you may print a different value than the one that was tested.


> +		if (PageTail(page))
> +			dump_page(head, NULL);
> +		dump_page(page, "tail_mapcount > page_count(page) - 1");

Here you say "tail_mapcount", which means something else in other places.
Also, isn't the whole "else if" test DEBUG_VM material as well?

> +		BUG();
> +	} else {
> +		unfreeze_page(anon_vma, head);
> +		ret = -EBUSY;
> +	}
> +
> +out_unlock:
> +	anon_vma_unlock_write(anon_vma);
> +	put_anon_vma(anon_vma);
> +out:
> +	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> +	return ret;
> +}
> diff --git a/mm/internal.h b/mm/internal.h
> index 98bce4d12a16..aee0f2566fdd 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -13,6 +13,7 @@
>
>   #include <linux/fs.h>
>   #include <linux/mm.h>
> +#include <linux/pagemap.h>
>
>   void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
>   		unsigned long floor, unsigned long ceiling);
> @@ -244,10 +245,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
>
>   extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -extern unsigned long vma_address(struct page *page,
> -				 struct vm_area_struct *vma);
> -#endif
> +/*
> + * At what user virtual address is page expected in @vma?
> + */
> +static inline unsigned long
> +__vma_address(struct page *page, struct vm_area_struct *vma)
> +{
> +	pgoff_t pgoff = page_to_pgoff(page);
> +	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> +}
> +
> +static inline unsigned long
> +vma_address(struct page *page, struct vm_area_struct *vma)
> +{
> +	unsigned long address = __vma_address(page, vma);
> +
> +	/* page should be within @vma mapping range */
> +	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
> +
> +	return address;
> +}
> +
>   #else /* !CONFIG_MMU */
>   static inline void clear_page_mlock(struct page *page) { }
>   static inline void mlock_vma_page(struct page *page) { }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 047953145710..723af5bbeb02 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -561,27 +561,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
>   }
>
>   /*
> - * At what user virtual address is page expected in @vma?
> - */
> -static inline unsigned long
> -__vma_address(struct page *page, struct vm_area_struct *vma)
> -{
> -	pgoff_t pgoff = page_to_pgoff(page);
> -	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> -}
> -
> -inline unsigned long
> -vma_address(struct page *page, struct vm_area_struct *vma)
> -{
> -	unsigned long address = __vma_address(page, vma);
> -
> -	/* page should be within @vma mapping range */
> -	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
> -
> -	return address;
> -}
> -
> -/*
>    * At what user virtual address is page expected in vma?
>    * Caller should check the page is actually part of the vma.
>    */
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/