[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190730145922.m5omqqf7rmilp6yy@box>
Date: Tue, 30 Jul 2019 17:59:22 +0300
From: "Kirill A. Shutemov" <kirill@...temov.name>
To: Song Liu <songliubraving@...com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
akpm@...ux-foundation.org, matthew.wilcox@...cle.com,
kirill.shutemov@...ux.intel.com, oleg@...hat.com,
kernel-team@...com, william.kucharski@...cle.com,
srikar@...ux.vnet.ibm.com
Subject: Re: [PATCH 1/2] khugepaged: enable collapse pmd for pte-mapped THP
On Sun, Jul 28, 2019 at 10:43:34PM -0700, Song Liu wrote:
> khugepaged needs exclusive mmap_sem to access page table. When it fails
> to lock mmap_sem, the page will fault in as pte-mapped THP. As the page
> is already a THP, khugepaged will not handle this pmd again.
>
> This patch enables the khugepaged to retry collapse the page table.
>
> struct mm_slot (in khugepaged.c) is extended with an array, containing
> addresses of pte-mapped THPs. We use array here for simplicity. We can
> easily replace it with more advanced data structures when needed. This
> array is protected by khugepaged_mm_lock.
>
> In khugepaged_scan_mm_slot(), if the mm contains pte-mapped THP, we try
> to collapse the page table.
>
> Since collapse may happen at an later time, some pages may already fault
> in. collapse_pte_mapped_thp() is added to properly handle these pages.
> collapse_pte_mapped_thp() also double checks whether all ptes in this pmd
> are mapping to the same THP. This is necessary because some subpage of
> the THP may be replaced, for example by uprobe. In such cases, it is not
> possible to collapse the pmd.
>
> Signed-off-by: Song Liu <songliubraving@...com>
> ---
> include/linux/khugepaged.h | 15 ++++
> mm/khugepaged.c | 136 +++++++++++++++++++++++++++++++++++++
> 2 files changed, 151 insertions(+)
>
> diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
> index 082d1d2a5216..2d700830fe0e 100644
> --- a/include/linux/khugepaged.h
> +++ b/include/linux/khugepaged.h
> @@ -15,6 +15,16 @@ extern int __khugepaged_enter(struct mm_struct *mm);
> extern void __khugepaged_exit(struct mm_struct *mm);
> extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
> unsigned long vm_flags);
> +#ifdef CONFIG_SHMEM
> +extern int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
> + unsigned long addr);
> +#else
> +static inline int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
> + unsigned long addr)
> +{
> + return 0;
> +}
> +#endif
>
> #define khugepaged_enabled() \
> (transparent_hugepage_flags & \
> @@ -73,6 +83,11 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
> {
> return 0;
> }
> +static inline int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
> + unsigned long addr)
> +{
> + return 0;
> +}
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>
> #endif /* _LINUX_KHUGEPAGED_H */
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index eaaa21b23215..247c25aeb096 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -76,6 +76,7 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
>
> static struct kmem_cache *mm_slot_cache __read_mostly;
>
> +#define MAX_PTE_MAPPED_THP 8
Is MAX_PTE_MAPPED_THP value random or do you have any justification for
it?
Please add empty line after it.
> /**
> * struct mm_slot - hash lookup from mm to mm_slot
> * @hash: hash collision list
> @@ -86,6 +87,10 @@ struct mm_slot {
> struct hlist_node hash;
> struct list_head mm_node;
> struct mm_struct *mm;
> +
> + /* pte-mapped THP in this mm */
> + int nr_pte_mapped_thp;
> + unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
> };
>
> /**
> @@ -1281,11 +1286,141 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> up_write(&vma->vm_mm->mmap_sem);
> mm_dec_nr_ptes(vma->vm_mm);
> pte_free(vma->vm_mm, pmd_pgtable(_pmd));
> + } else if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
> + /* need down_read for khugepaged_test_exit() */
> + khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
> + up_read(&vma->vm_mm->mmap_sem);
> }
> }
> i_mmap_unlock_write(mapping);
> }
>
> +/*
> + * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
> + * khugepaged should try to collapse the page table.
> + */
> +int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
What is contract about addr alignment? Do we expect it PAGE_SIZE aligned
or PMD_SIZE aligned? Do we want to enforce it?
> +{
> + struct mm_slot *mm_slot;
> + int ret = 0;
> +
> + /* hold mmap_sem for khugepaged_test_exit() */
> + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
> +
> + if (unlikely(khugepaged_test_exit(mm)))
> + return 0;
> +
> + if (!test_bit(MMF_VM_HUGEPAGE, &mm->flags) &&
> + !test_bit(MMF_DISABLE_THP, &mm->flags)) {
> + ret = __khugepaged_enter(mm);
> + if (ret)
> + return ret;
> + }
Any reason not to call khugepaged_enter() here?
> +
> + spin_lock(&khugepaged_mm_lock);
> + mm_slot = get_mm_slot(mm);
> + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
> + mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
It's probably good enough for start, but I'm not sure how useful it will
be for real application, considering the limitation.
> +
Useless empty line?
> + spin_unlock(&khugepaged_mm_lock);
> + return 0;
> +}
> +
> +/**
> + * Try to collapse a pte-mapped THP for mm at address haddr.
> + *
> + * This function checks whether all the PTEs in the PMD are pointing to the
> + * right THP. If so, retract the page table so the THP can refault in with
> + * as pmd-mapped.
> + */
> +static void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long haddr)
> +{
> + struct vm_area_struct *vma = find_vma(mm, haddr);
> + pmd_t *pmd = mm_find_pmd(mm, haddr);
> + struct page *hpage = NULL;
> + unsigned long addr;
> + spinlock_t *ptl;
> + int count = 0;
> + pmd_t _pmd;
> + int i;
> +
> + if (!vma || !pmd || pmd_trans_huge(*pmd))
> + return;
> +
> + /* step 1: check all mapped PTEs are to the right huge page */
> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> + pte_t *pte = pte_offset_map(pmd, addr);
> + struct page *page;
> +
> + if (pte_none(*pte))
> + continue;
> +
> + page = vm_normal_page(vma, addr, *pte);
> +
> + if (!PageCompound(page))
> + return;
I think khugepaged_scan_shmem() and collapse_shmem() should changed to not
stop on PageTransCompound() to make this useful for more cases.
Ideally, it collapse_shmem() and this routine should be the same thing.
Or do you thing it's not doable for some reason?
> +
> + if (!hpage) {
> + hpage = compound_head(page);
> + if (hpage->mapping != vma->vm_file->f_mapping)
> + return;
> + }
> +
> + if (hpage + i != page)
> + return;
> + count++;
> + }
> +
> + /* step 2: adjust rmap */
> + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> + pte_t *pte = pte_offset_map(pmd, addr);
> + struct page *page;
> +
> + if (pte_none(*pte))
> + continue;
> + page = vm_normal_page(vma, addr, *pte);
> + page_remove_rmap(page, false);
> + }
> +
> + /* step 3: set proper refcount and mm_counters. */
> + if (hpage) {
> + page_ref_sub(hpage, count);
> + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
> + }
> +
> + /* step 4: collapse pmd */
> + ptl = pmd_lock(vma->vm_mm, pmd);
> + _pmd = pmdp_collapse_flush(vma, addr, pmd);
> + spin_unlock(ptl);
> + mm_dec_nr_ptes(mm);
> + pte_free(mm, pmd_pgtable(_pmd));
> +}
> +
> +static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
> +{
> + struct mm_struct *mm = mm_slot->mm;
> + int i;
> +
> + lockdep_assert_held(&khugepaged_mm_lock);
> +
> + if (likely(mm_slot->nr_pte_mapped_thp == 0))
> + return 0;
> +
> + if (!down_write_trylock(&mm->mmap_sem))
> + return -EBUSY;
> +
> + if (unlikely(khugepaged_test_exit(mm)))
> + goto out;
> +
> + for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
> + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
> +
> +out:
> + mm_slot->nr_pte_mapped_thp = 0;
> + up_write(&mm->mmap_sem);
> + return 0;
> +}
> +
> /**
> * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
> *
> @@ -1667,6 +1802,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> khugepaged_scan.address = 0;
> khugepaged_scan.mm_slot = mm_slot;
> }
> + khugepaged_collapse_pte_mapped_thps(mm_slot);
> spin_unlock(&khugepaged_mm_lock);
>
> mm = mm_slot->mm;
> --
> 2.17.1
>
--
Kirill A. Shutemov
Powered by blists - more mailing lists