[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <555B3000.6040805@suse.cz>
Date: Tue, 19 May 2015 14:43:44 +0200
From: Vlastimil Babka <vbabka@...e.cz>
To: "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Andrea Arcangeli <aarcange@...hat.com>,
Hugh Dickins <hughd@...gle.com>
CC: Dave Hansen <dave.hansen@...el.com>, Mel Gorman <mgorman@...e.de>,
Rik van Riel <riel@...hat.com>,
Christoph Lameter <cl@...two.org>,
Naoya Horiguchi <n-horiguchi@...jp.nec.com>,
Steve Capper <steve.capper@...aro.org>,
"Aneesh Kumar K.V" <aneesh.kumar@...ux.vnet.ibm.com>,
Johannes Weiner <hannes@...xchg.org>,
Michal Hocko <mhocko@...e.cz>,
Jerome Marchand <jmarchan@...hat.com>,
Sasha Levin <sasha.levin@...cle.com>,
linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCHv5 25/28] thp: reintroduce split_huge_page()
On 04/23/2015 11:04 PM, Kirill A. Shutemov wrote:
> This patch adds implementation of split_huge_page() for new
> refcountings.
>
> Unlike previous implementation, new split_huge_page() can fail if
> somebody holds GUP pin on the page. It also means that pin on page
> would prevent it from bening split under you. It makes situation in
> many places much cleaner.
>
> The basic scheme of split_huge_page():
>
> - Check that sum of mapcounts of all subpage is equal to page_count()
> plus one (caller pin). Foll off with -EBUSY. This way we can avoid
> useless PMD-splits.
>
> - Freeze the page counters by splitting all PMD and setup migration
> PTEs.
>
> - Re-check sum of mapcounts against page_count(). Page's counts are
> stable now. -EBUSY if page is pinned.
>
> - Split compound page.
>
> - Unfreeze the page by removing migration entries.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
> Tested-by: Sasha Levin <sasha.levin@...cle.com>
> ---
> include/linux/huge_mm.h | 7 +-
> include/linux/pagemap.h | 9 +-
> mm/huge_memory.c | 322 ++++++++++++++++++++++++++++++++++++++++++++++++
> mm/internal.h | 26 +++-
> mm/rmap.c | 21 ----
> 5 files changed, 357 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index b7844c73b7db..3c0a50ed3eb8 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -92,8 +92,11 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
>
> extern unsigned long transparent_hugepage_flags;
>
> -#define split_huge_page_to_list(page, list) BUILD_BUG()
> -#define split_huge_page(page) BUILD_BUG()
> +int split_huge_page_to_list(struct page *page, struct list_head *list);
> +static inline int split_huge_page(struct page *page)
> +{
> + return split_huge_page_to_list(page, NULL);
> +}
>
> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> unsigned long address);
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 7c3790764795..ffbb23dbebba 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -387,10 +387,17 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
> */
> static inline pgoff_t page_to_pgoff(struct page *page)
> {
> + pgoff_t pgoff;
> +
> if (unlikely(PageHeadHuge(page)))
> return page->index << compound_order(page);
> - else
> +
> + if (likely(!PageTransTail(page)))
> return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> +
> + pgoff = page->first_page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> + pgoff += page - page->first_page;
> + return pgoff;
This could use some comment or maybe separate preparatory patch?
> }
>
> /*
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2f9e2e882bab..7ad338ab2ac8 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2704,3 +2704,325 @@ void __vma_adjust_trans_huge(struct vm_area_struct *vma,
> split_huge_pmd_address(next, nstart);
> }
> }
> +
> +static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
> + unsigned long address)
> +{
> + spinlock_t *ptl;
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> + int i;
> +
> + pgd = pgd_offset(vma->vm_mm, address);
> + if (!pgd_present(*pgd))
> + return;
> + pud = pud_offset(pgd, address);
> + if (!pud_present(*pud))
> + return;
> + pmd = pmd_offset(pud, address);
> + ptl = pmd_lock(vma->vm_mm, pmd);
> + if (!pmd_present(*pmd)) {
> + spin_unlock(ptl);
> + return;
> + }
> + if (pmd_trans_huge(*pmd)) {
> + if (page == pmd_page(*pmd))
> + __split_huge_pmd_locked(vma, pmd, address, true);
> + spin_unlock(ptl);
> + return;
> + }
> + spin_unlock(ptl);
> +
> + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
> + for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++) {
> + pte_t entry, swp_pte;
> + swp_entry_t swp_entry;
> +
> + if (!pte_present(pte[i]))
> + continue;
> + if (page_to_pfn(page) != pte_pfn(pte[i]))
> + continue;
> + flush_cache_page(vma, address, page_to_pfn(page));
> + entry = ptep_clear_flush(vma, address, pte + i);
> + swp_entry = make_migration_entry(page, pte_write(entry));
> + swp_pte = swp_entry_to_pte(swp_entry);
> + if (pte_soft_dirty(entry))
> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> + set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
> + }
> + pte_unmap_unlock(pte, ptl);
> +}
> +
> +static void freeze_page(struct anon_vma *anon_vma, struct page *page)
> +{
> + struct anon_vma_chain *avc;
> + pgoff_t pgoff = page_to_pgoff(page);
> +
> + VM_BUG_ON_PAGE(!PageHead(page), page);
> +
> + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
> + pgoff + HPAGE_PMD_NR - 1) {
> + unsigned long haddr;
> +
> + haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
> + mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
> + haddr, haddr + HPAGE_PMD_SIZE);
> + freeze_page_vma(avc->vma, page, haddr);
> + mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
> + haddr, haddr + HPAGE_PMD_SIZE);
> + }
> +}
> +
> +static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
> + unsigned long address)
> +{
> + spinlock_t *ptl;
> + pmd_t *pmd;
> + pte_t *pte, entry;
> + swp_entry_t swp_entry;
> +
> + pmd = mm_find_pmd(vma->vm_mm, address);
> + if (!pmd)
> + return;
> + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
> +
> + if (!is_swap_pte(*pte))
> + goto unlock;
> +
> + swp_entry = pte_to_swp_entry(*pte);
> + if (!is_migration_entry(swp_entry) ||
> + migration_entry_to_page(swp_entry) != page)
> + goto unlock;
> +
> + entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
> + if (is_write_migration_entry(swp_entry))
> + entry = maybe_mkwrite(entry, vma);
> +
> + flush_dcache_page(page);
> + set_pte_at(vma->vm_mm, address, pte, entry);
> +
> + /* No need to invalidate - it was non-present before */
> + update_mmu_cache(vma, address, pte);
> +unlock:
> + pte_unmap_unlock(pte, ptl);
> +}
> +
> +static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
> +{
> + struct anon_vma_chain *avc;
> + pgoff_t pgoff = page_to_pgoff(page);
> + int i;
> +
> + for (i = 0; i < HPAGE_PMD_NR; i++, pgoff++, page++) {
In case of freeze_page() this cycle is the inner one and it can batch
ptl lock. Why not here?
> + if (!page_mapcount(page))
> + continue;
> +
> + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
> + pgoff, pgoff) {
> + unsigned long address = vma_address(page, avc->vma);
> +
> + mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
> + address, address + PAGE_SIZE);
> + unfreeze_page_vma(avc->vma, page, address);
> + mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
> + address, address + PAGE_SIZE);
> + }
> + }
> +}
> +
> +static int total_mapcount(struct page *page)
> +{
> + int i, ret;
> +
> + ret = compound_mapcount(page);
> + for (i = 0; i < HPAGE_PMD_NR; i++)
> + ret += atomic_read(&page[i]._mapcount) + 1;
> +
> + /*
> + * Positive compound_mapcount() offsets ->_mapcount in every subpage by
> + * one. Let's substract it here.
> + */
> + if (compound_mapcount(page))
> + ret -= HPAGE_PMD_NR;
> +
> + return ret;
> +}
> +
> +static int __split_huge_page_tail(struct page *head, int tail,
> + struct lruvec *lruvec, struct list_head *list)
> +{
> + int mapcount;
> + struct page *page_tail = head + tail;
> +
> + mapcount = page_mapcount(page_tail);
> + BUG_ON(atomic_read(&page_tail->_count) != 0);
VM_BUG_ON?
> +
> + /*
> + * tail_page->_count is zero and not changing from under us. But
> + * get_page_unless_zero() may be running from under us on the
> + * tail_page. If we used atomic_set() below instead of atomic_add(), we
> + * would then run atomic_set() concurrently with
> + * get_page_unless_zero(), and atomic_set() is implemented in C not
> + * using locked ops. spin_unlock on x86 sometime uses locked ops
> + * because of PPro errata 66, 92, so unless somebody can guarantee
> + * atomic_set() here would be safe on all archs (and not only on x86),
> + * it's safer to use atomic_add().
> + */
> + atomic_add(page_mapcount(page_tail) + 1, &page_tail->_count);
> +
> + /* after clearing PageTail the gup refcount can be released */
> + smp_mb__after_atomic();
> +
> + /*
> + * retain hwpoison flag of the poisoned tail page:
> + * fix for the unsuitable process killed on Guest Machine(KVM)
> + * by the memory-failure.
> + */
> + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
> + page_tail->flags |= (head->flags &
> + ((1L << PG_referenced) |
> + (1L << PG_swapbacked) |
> + (1L << PG_mlocked) |
> + (1L << PG_uptodate) |
> + (1L << PG_active) |
> + (1L << PG_locked) |
> + (1L << PG_unevictable)));
> + page_tail->flags |= (1L << PG_dirty);
> +
> + /* clear PageTail before overwriting first_page */
> + smp_wmb();
> +
> + /* ->mapping in first tail page is compound_mapcount */
> + BUG_ON(tail != 1 && page_tail->mapping != TAIL_MAPPING);
VM_BUG_ON?
> + page_tail->mapping = head->mapping;
> +
> + page_tail->index = head->index + tail;
> + page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
> + lru_add_page_tail(head, page_tail, lruvec, list);
> +
> + return mapcount;
> +}
> +
> +static void __split_huge_page(struct page *page, struct list_head *list)
> +{
> + struct page *head = compound_head(page);
> + struct zone *zone = page_zone(head);
> + struct lruvec *lruvec;
> + int i, tail_mapcount;
> +
> + /* prevent PageLRU to go away from under us, and freeze lru stats */
> + spin_lock_irq(&zone->lru_lock);
> + lruvec = mem_cgroup_page_lruvec(head, zone);
> +
> + /* complete memcg works before add pages to LRU */
> + mem_cgroup_split_huge_fixup(head);
> +
> + tail_mapcount = 0;
> + for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
> + tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
> + atomic_sub(tail_mapcount, &head->_count);
> +
> + ClearPageCompound(head);
> + spin_unlock_irq(&zone->lru_lock);
> +
> + unfreeze_page(page_anon_vma(head), head);
> +
> + for (i = 0; i < HPAGE_PMD_NR; i++) {
> + struct page *subpage = head + i;
> + if (subpage == page)
> + continue;
> + unlock_page(subpage);
> +
> + /*
> + * Subpages may be freed if there wasn't any mapping
> + * like if add_to_swap() is running on a lru page that
> + * had its mapping zapped. And freeing these pages
> + * requires taking the lru_lock so we do the put_page
> + * of the tail pages after the split is complete.
> + */
> + put_page(subpage);
> + }
> +}
> +
> +/*
> + * This function splits huge page into normal pages. @page can point to any
> + * subpage of huge page to split. Split doesn't change the position of @page.
> + *
> + * Only caller must hold pin on the @page, otherwise split fails with -EBUSY.
> + * The huge page must be locked.
> + *
> + * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
> + *
> + * Both head page and tail pages will inherit mapping, flags, and so on from
> + * the hugepage.
> + *
> + * GUP pin and PG_locked transfered to @page. Rest subpages can be freed if
> + * they are not mapped.
> + *
> + * Returns 0 if the hugepage is split successfully.
> + * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
> + * us.
> + */
> +int split_huge_page_to_list(struct page *page, struct list_head *list)
> +{
> + struct page *head = compound_head(page);
> + struct anon_vma *anon_vma;
> + int mapcount, ret;
> +
> + BUG_ON(is_huge_zero_page(page));
> + BUG_ON(!PageAnon(page));
> + BUG_ON(!PageLocked(page));
> + BUG_ON(!PageSwapBacked(page));
> + BUG_ON(!PageCompound(page));
VM_BUG_ONs?
> +
> + /*
> + * The caller does not necessarily hold an mmap_sem that would prevent
> + * the anon_vma disappearing so we first we take a reference to it
> + * and then lock the anon_vma for write. This is similar to
> + * page_lock_anon_vma_read except the write lock is taken to serialise
> + * against parallel split or collapse operations.
> + */
> + anon_vma = page_get_anon_vma(head);
> + if (!anon_vma) {
> + ret = -EBUSY;
> + goto out;
> + }
> + anon_vma_lock_write(anon_vma);
> +
> + /*
> + * Racy check if we can split the page, before freeze_page() will
> + * split PMDs
> + */
> + if (total_mapcount(head) != page_count(head) - 1) {
> + ret = -EBUSY;
> + goto out_unlock;
> + }
> +
> + freeze_page(anon_vma, head);
> + VM_BUG_ON_PAGE(compound_mapcount(head), head);
> +
> + mapcount = total_mapcount(head);
> + if (mapcount == page_count(head) - 1) {
> + __split_huge_page(page, list);
> + ret = 0;
> + } else if (mapcount > page_count(page) - 1) {
It's confusing to use page_count(head) in one test and page_count(page)
in other, although I know it should be same. Also what if you read a
different value because something broke?
> + pr_alert("total_mapcount: %u, page_count(): %u\n",
> + mapcount, page_count(page));
Here you determine page_count(page) again although it could have
possibly changed (we are in path where something went wrong already) so
you potentially print different value than the one that was tested.
> + if (PageTail(page))
> + dump_page(head, NULL);
> + dump_page(page, "tail_mapcount > page_count(page) - 1");
Here you say "tail_mapcount" which means something else in different places.
Also isn't the whole "else if" test a DEBUG_VM material as well?
> + BUG();
> + } else {
> + unfreeze_page(anon_vma, head);
> + ret = -EBUSY;
> + }
> +
> +out_unlock:
> + anon_vma_unlock_write(anon_vma);
> + put_anon_vma(anon_vma);
> +out:
> + count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
> + return ret;
> +}
> diff --git a/mm/internal.h b/mm/internal.h
> index 98bce4d12a16..aee0f2566fdd 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -13,6 +13,7 @@
>
> #include <linux/fs.h>
> #include <linux/mm.h>
> +#include <linux/pagemap.h>
>
> void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
> unsigned long floor, unsigned long ceiling);
> @@ -244,10 +245,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
>
> extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
>
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -extern unsigned long vma_address(struct page *page,
> - struct vm_area_struct *vma);
> -#endif
> +/*
> + * At what user virtual address is page expected in @vma?
> + */
> +static inline unsigned long
> +__vma_address(struct page *page, struct vm_area_struct *vma)
> +{
> + pgoff_t pgoff = page_to_pgoff(page);
> + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> +}
> +
> +static inline unsigned long
> +vma_address(struct page *page, struct vm_area_struct *vma)
> +{
> + unsigned long address = __vma_address(page, vma);
> +
> + /* page should be within @vma mapping range */
> + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
> +
> + return address;
> +}
> +
> #else /* !CONFIG_MMU */
> static inline void clear_page_mlock(struct page *page) { }
> static inline void mlock_vma_page(struct page *page) { }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 047953145710..723af5bbeb02 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -561,27 +561,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
> }
>
> /*
> - * At what user virtual address is page expected in @vma?
> - */
> -static inline unsigned long
> -__vma_address(struct page *page, struct vm_area_struct *vma)
> -{
> - pgoff_t pgoff = page_to_pgoff(page);
> - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
> -}
> -
> -inline unsigned long
> -vma_address(struct page *page, struct vm_area_struct *vma)
> -{
> - unsigned long address = __vma_address(page, vma);
> -
> - /* page should be within @vma mapping range */
> - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
> -
> - return address;
> -}
> -
> -/*
> * At what user virtual address is page expected in vma?
> * Caller should check the page is actually part of the vma.
> */
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists