[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <a38065af-6668-d371-4016-670c71e1e722@nvidia.com>
Date: Fri, 9 Dec 2022 12:06:30 -0800
From: John Hubbard <jhubbard@...dia.com>
To: Peter Xu <peterx@...hat.com>, <linux-mm@...ck.org>,
<linux-kernel@...r.kernel.org>
CC: Andrew Morton <akpm@...ux-foundation.org>,
Miaohe Lin <linmiaohe@...wei.com>,
David Hildenbrand <david@...hat.com>,
Nadav Amit <nadav.amit@...il.com>,
Andrea Arcangeli <aarcange@...hat.com>,
Jann Horn <jannh@...gle.com>,
Mike Kravetz <mike.kravetz@...cle.com>,
James Houghton <jthoughton@...gle.com>,
Rik van Riel <riel@...riel.com>,
Muchun Song <songmuchun@...edance.com>
Subject: Re: [PATCH v3 9/9] mm/hugetlb: Introduce hugetlb_walk()
On 12/9/22 09:01, Peter Xu wrote:
> huge_pte_offset() is the main walker function for hugetlb pgtables. The
> name does not really represent what it does, though.
>
> Instead of renaming it, introduce a wrapper function called hugetlb_walk()
> which will use huge_pte_offset() inside. Assert on the locks when walking
> the pgtable.
>
> Note, the vma lock assertion will be a no-op for private mappings.
>
> Document the last special case in the page_vma_mapped_walk() path where we
> don't need any more locks to call hugetlb_walk().
>
> Taking vma lock there is not needed because either: (1) potential callers
> of hugetlb pvmw hold i_mmap_rwsem already (from one rmap_walk()), or (2)
> the caller will not walk a hugetlb vma at all so the hugetlb code path is
> not reachable (e.g. in ksm or uprobe paths).
>
> This lock requirement is slightly implicit for future
> page_vma_mapped_walk() callers. But if this rule is ever broken, one will
> get a straightforward warning in hugetlb_walk() with lockdep, and then
> there'll be a way out.
>
> Reviewed-by: Mike Kravetz <mike.kravetz@...cle.com>
> Signed-off-by: Peter Xu <peterx@...hat.com>
> ---
> fs/hugetlbfs/inode.c | 4 +---
> fs/userfaultfd.c | 6 ++----
> include/linux/hugetlb.h | 39 +++++++++++++++++++++++++++++++++++++++
> mm/hugetlb.c | 32 +++++++++++++-------------------
> mm/page_vma_mapped.c | 9 ++++++---
> mm/pagewalk.c | 4 +---
> 6 files changed, 62 insertions(+), 32 deletions(-)
>
Reviewed-by: John Hubbard <jhubbard@...dia.com>
thanks,
--
John Hubbard
NVIDIA
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index fdb16246f46e..48f1a8ad2243 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -388,9 +388,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
> {
> pte_t *ptep, pte;
>
> - ptep = huge_pte_offset(vma->vm_mm, addr,
> - huge_page_size(hstate_vma(vma)));
> -
> + ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
> if (!ptep)
> return false;
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 969f4be967c6..6a278941ec84 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -237,14 +237,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> unsigned long flags,
> unsigned long reason)
> {
> - struct mm_struct *mm = ctx->mm;
> pte_t *ptep, pte;
> bool ret = true;
>
> - mmap_assert_locked(mm);
> -
> - ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
> + mmap_assert_locked(ctx->mm);
>
> + ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
> if (!ptep)
> goto out;
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index d755e2a7c0db..a5e87ec7fa6e 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -2,6 +2,7 @@
> #ifndef _LINUX_HUGETLB_H
> #define _LINUX_HUGETLB_H
>
> +#include <linux/mm.h>
> #include <linux/mm_types.h>
> #include <linux/mmdebug.h>
> #include <linux/fs.h>
> @@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
> * Returns the pte_t* if found, or NULL if the address is not mapped.
> *
> + * IMPORTANT: we should normally not directly call this function, instead
> + * this is only a common interface to implement arch-specific
> + * walker. Please use hugetlb_walk() instead, because that will attempt to
> + * verify the locking for you.
> + *
> * Since this function will walk all the pgtable pages (including not only
> * high-level pgtable page, but also PUD entry that can be unshared
> * concurrently for VM_SHARED), the caller of this function should be
> @@ -1229,4 +1235,37 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
> #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
> #endif
>
> +static inline bool
> +__vma_shareable_flags_pmd(struct vm_area_struct *vma)
> +{
> + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
> + vma->vm_private_data;
> +}
> +
> +/*
> + * Safe version of huge_pte_offset() to check the locks. See comments
> + * above huge_pte_offset().
> + */
> +static inline pte_t *
> +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
> +{
> +#if defined(CONFIG_HUGETLB_PAGE) && \
> + defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP)
> + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
> +
> + /*
> + * If pmd sharing possible, locking needed to safely walk the
> + * hugetlb pgtables. More information can be found at the comment
> + * above huge_pte_offset() in the same file.
> + *
> + * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
> + */
> + if (__vma_shareable_flags_pmd(vma))
> + WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) &&
> + !lockdep_is_held(
> + &vma->vm_file->f_mapping->i_mmap_rwsem));
> +#endif
> + return huge_pte_offset(vma->vm_mm, addr, sz);
> +}
> +
> #endif /* _LINUX_HUGETLB_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 9d8bb6508288..b20120d14a71 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4814,7 +4814,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> } else {
> /*
> * For shared mappings the vma lock must be held before
> - * calling huge_pte_offset in the src vma. Otherwise, the
> + * calling hugetlb_walk() in the src vma. Otherwise, the
> * returned ptep could go away if part of a shared pmd and
> * another thread calls huge_pmd_unshare.
> */
> @@ -4824,7 +4824,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> last_addr_mask = hugetlb_mask_last_page(h);
> for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
> spinlock_t *src_ptl, *dst_ptl;
> - src_pte = huge_pte_offset(src, addr, sz);
> + src_pte = hugetlb_walk(src_vma, addr, sz);
> if (!src_pte) {
> addr |= last_addr_mask;
> continue;
> @@ -5028,7 +5028,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
> hugetlb_vma_lock_write(vma);
> i_mmap_lock_write(mapping);
> for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
> - src_pte = huge_pte_offset(mm, old_addr, sz);
> + src_pte = hugetlb_walk(vma, old_addr, sz);
> if (!src_pte) {
> old_addr |= last_addr_mask;
> new_addr |= last_addr_mask;
> @@ -5091,7 +5091,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
> last_addr_mask = hugetlb_mask_last_page(h);
> address = start;
> for (; address < end; address += sz) {
> - ptep = huge_pte_offset(mm, address, sz);
> + ptep = hugetlb_walk(vma, address, sz);
> if (!ptep) {
> address |= last_addr_mask;
> continue;
> @@ -5404,7 +5404,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> mutex_lock(&hugetlb_fault_mutex_table[hash]);
> hugetlb_vma_lock_read(vma);
> spin_lock(ptl);
> - ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
> + ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
> if (likely(ptep &&
> pte_same(huge_ptep_get(ptep), pte)))
> goto retry_avoidcopy;
> @@ -5442,7 +5442,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
> * before the page tables are altered
> */
> spin_lock(ptl);
> - ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
> + ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
> if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
> /* Break COW or unshare */
> huge_ptep_clear_flush(vma, haddr, ptep);
> @@ -6228,7 +6228,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> return NULL;
>
> hugetlb_vma_lock_read(vma);
> - pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> + pte = hugetlb_walk(vma, haddr, huge_page_size(h));
> if (!pte)
> goto out_unlock;
>
> @@ -6293,8 +6293,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> *
> * Note that page table lock is not held when pte is null.
> */
> - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
> - huge_page_size(h));
> + pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
> + huge_page_size(h));
> if (pte)
> ptl = huge_pte_lock(h, mm, pte);
> absent = !pte || huge_pte_none(huge_ptep_get(pte));
> @@ -6480,7 +6480,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
> last_addr_mask = hugetlb_mask_last_page(h);
> for (; address < end; address += psize) {
> spinlock_t *ptl;
> - ptep = huge_pte_offset(mm, address, psize);
> + ptep = hugetlb_walk(vma, address, psize);
> if (!ptep) {
> address |= last_addr_mask;
> continue;
> @@ -6858,12 +6858,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
> *end = ALIGN(*end, PUD_SIZE);
> }
>
> -static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
> -{
> - return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
> - vma->vm_private_data;
> -}
> -
> void hugetlb_vma_lock_read(struct vm_area_struct *vma)
> {
> if (__vma_shareable_flags_pmd(vma)) {
> @@ -7029,8 +7023,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
>
> saddr = page_table_shareable(svma, vma, addr, idx);
> if (saddr) {
> - spte = huge_pte_offset(svma->vm_mm, saddr,
> - vma_mmu_pagesize(svma));
> + spte = hugetlb_walk(svma, saddr,
> + vma_mmu_pagesize(svma));
> if (spte) {
> get_page(virt_to_page(spte));
> break;
> @@ -7388,7 +7382,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
> hugetlb_vma_lock_write(vma);
> i_mmap_lock_write(vma->vm_file->f_mapping);
> for (address = start; address < end; address += PUD_SIZE) {
> - ptep = huge_pte_offset(mm, address, sz);
> + ptep = hugetlb_walk(vma, address, sz);
> if (!ptep)
> continue;
> ptl = huge_pte_lock(h, mm, ptep);
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 93e13fc17d3c..f3729b23dd0e 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
> /* The only possible mapping was handled on last iteration */
> if (pvmw->pte)
> return not_found(pvmw);
> -
> - /* when pud is not present, pte will be NULL */
> - pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
> + /*
> + * All callers that get here will already hold the
> + * i_mmap_rwsem. Therefore, no additional locks need to be
> + * taken before calling hugetlb_walk().
> + */
> + pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
> if (!pvmw->pte)
> return false;
>
> diff --git a/mm/pagewalk.c b/mm/pagewalk.c
> index d98564a7be57..cb23f8a15c13 100644
> --- a/mm/pagewalk.c
> +++ b/mm/pagewalk.c
> @@ -305,13 +305,11 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
> hugetlb_vma_lock_read(vma);
> do {
> next = hugetlb_entry_end(h, addr, end);
> - pte = huge_pte_offset(walk->mm, addr & hmask, sz);
> -
> + pte = hugetlb_walk(vma, addr & hmask, sz);
> if (pte)
> err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
> else if (ops->pte_hole)
> err = ops->pte_hole(addr, next, -1, walk);
> -
> if (err)
> break;
> } while (addr = next, addr != end);
Powered by blists - more mailing lists