Message-ID: <20240807111754.2148d27e@p-imbrenda.boeblingen.de.ibm.com>
Date: Wed, 7 Aug 2024 11:17:54 +0200
From: Claudio Imbrenda <imbrenda@...ux.ibm.com>
To: David Hildenbrand <david@...hat.com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
linux-doc@...r.kernel.org, kvm@...r.kernel.org,
linux-s390@...r.kernel.org, linux-fsdevel@...r.kernel.org,
Andrew Morton <akpm@...ux-foundation.org>,
"Matthew Wilcox (Oracle)" <willy@...radead.org>,
Jonathan Corbet <corbet@....net>,
Christian Borntraeger <borntraeger@...ux.ibm.com>,
Janosch Frank <frankja@...ux.ibm.com>,
Heiko Carstens <hca@...ux.ibm.com>, Vasily Gorbik <gor@...ux.ibm.com>,
Alexander Gordeev <agordeev@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>,
Gerald Schaefer <gerald.schaefer@...ux.ibm.com>
Subject: Re: [PATCH v1 02/11] mm/pagewalk: introduce folio_walk_start() + folio_walk_end()
On Fri, 2 Aug 2024 17:55:15 +0200
David Hildenbrand <david@...hat.com> wrote:
> We want to get rid of follow_page(), and have a more reasonable way to
> just lookup a folio mapped at a certain address, perform some checks while
> still under PTL, and then only conditionally grab a folio reference if
> really required.
>
> Further, we might want to get rid of some walk_page_range*() users that
> really only want to temporarily lookup a single folio at a single address.
>
> So let's add a new page table walker that does exactly that, similarly
> to GUP also being able to walk hugetlb VMAs.
>
> Add folio_walk_end() as a macro for now: the compiler is not easy to
> please with the pte_unmap()->kunmap_local().
>
> Note that one difference between follow_page() and get_user_pages(1) is
> that follow_page() will not trigger faults to get something mapped. So
> folio_walk is at least currently not a replacement for get_user_pages(1),
> but could likely be extended/reused to achieve something similar in the
> future.
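
As an aside for other readers: my understanding of the intended calling
pattern, pieced together from the description above, is roughly the
following sketch (the folio_test_large() check is just an arbitrary
example of a check done while still holding the PTL):

	struct folio_walk fw;
	struct folio *folio;

	folio = folio_walk_start(&fw, vma, addr, 0);
	if (folio) {
		/* perform checks while the PTL is still held ... */
		if (folio_test_large(folio))
			folio_get(folio); /* ... and only then grab a reference */
		folio_walk_end(&fw, vma);
	}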
[...]
> +struct folio *folio_walk_start(struct folio_walk *fw,
> + struct vm_area_struct *vma, unsigned long addr,
> + folio_walk_flags_t flags)
> +{
> + unsigned long entry_size;
> + bool expose_page = true;
> + struct page *page;
> + pud_t *pudp, pud;
> + pmd_t *pmdp, pmd;
> + pte_t *ptep, pte;
> + spinlock_t *ptl;
> + pgd_t *pgdp;
> + p4d_t *p4dp;
> +
> + mmap_assert_locked(vma->vm_mm);
> + vma_pgtable_walk_begin(vma);
> +
> + if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
> + goto not_found;
> +
> + pgdp = pgd_offset(vma->vm_mm, addr);
> + if (pgd_none_or_clear_bad(pgdp))
> + goto not_found;
> +
> + p4dp = p4d_offset(pgdp, addr);
> + if (p4d_none_or_clear_bad(p4dp))
> + goto not_found;
> +
> + pudp = pud_offset(p4dp, addr);
> + pud = pudp_get(pudp);
> + if (pud_none(pud))
> + goto not_found;
> + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
> + ptl = pud_lock(vma->vm_mm, pudp);
> + pud = pudp_get(pudp);
> +
> + entry_size = PUD_SIZE;
> + fw->level = FW_LEVEL_PUD;
> + fw->pudp = pudp;
> + fw->pud = pud;
> +
> + if (!pud_present(pud) || pud_devmap(pud)) {
> + spin_unlock(ptl);
> + goto not_found;
> + } else if (!pud_leaf(pud)) {
> + spin_unlock(ptl);
> + goto pmd_table;
> + }
> + /*
> + * TODO: vm_normal_page_pud() will be handy once we want to
> + * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
> + */
> + page = pud_page(pud);
> + goto found;
> + }
> +
> +pmd_table:
> + VM_WARN_ON_ONCE(pud_leaf(*pudp));
Is this warning necessary? Can this actually happen?
And if it can happen, wouldn't it be more reasonable to return NULL?
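I.e. (completely untested), something like:

	/* a concurrently established leaf entry: bail out instead of warning */
	if (unlikely(pud_leaf(pudp_get(pudp))))
		goto not_found;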
> + pmdp = pmd_offset(pudp, addr);
> + pmd = pmdp_get_lockless(pmdp);
> + if (pmd_none(pmd))
> + goto not_found;
> + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
> + ptl = pmd_lock(vma->vm_mm, pmdp);
> + pmd = pmdp_get(pmdp);
> +
> + entry_size = PMD_SIZE;
> + fw->level = FW_LEVEL_PMD;
> + fw->pmdp = pmdp;
> + fw->pmd = pmd;
> +
> + if (pmd_none(pmd)) {
> + spin_unlock(ptl);
> + goto not_found;
> + } else if (!pmd_leaf(pmd)) {
> + spin_unlock(ptl);
> + goto pte_table;
> + } else if (pmd_present(pmd)) {
> + page = vm_normal_page_pmd(vma, addr, pmd);
> + if (page) {
> + goto found;
> + } else if ((flags & FW_ZEROPAGE) &&
> + is_huge_zero_pmd(pmd)) {
> + page = pfn_to_page(pmd_pfn(pmd));
> + expose_page = false;
> + goto found;
> + }
> + } else if ((flags & FW_MIGRATION) &&
> + is_pmd_migration_entry(pmd)) {
> + swp_entry_t entry = pmd_to_swp_entry(pmd);
> +
> + page = pfn_swap_entry_to_page(entry);
> + expose_page = false;
> + goto found;
> + }
> + spin_unlock(ptl);
> + goto not_found;
> + }
> +
> +pte_table:
> + VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
Same here.
> + ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
> + if (!ptep)
> + goto not_found;
> + pte = ptep_get(ptep);
> +
> + entry_size = PAGE_SIZE;
> + fw->level = FW_LEVEL_PTE;
> + fw->ptep = ptep;
> + fw->pte = pte;
> +
> + if (pte_present(pte)) {
> + page = vm_normal_page(vma, addr, pte);
> + if (page)
> + goto found;
> + if ((flags & FW_ZEROPAGE) &&
> + is_zero_pfn(pte_pfn(pte))) {
> + page = pfn_to_page(pte_pfn(pte));
> + expose_page = false;
> + goto found;
> + }
> + } else if (!pte_none(pte)) {
> + swp_entry_t entry = pte_to_swp_entry(pte);
> +
> + if ((flags & FW_MIGRATION) &&
> + is_migration_entry(entry)) {
> + page = pfn_swap_entry_to_page(entry);
> + expose_page = false;
> + goto found;
> + }
> + }
> + pte_unmap_unlock(ptep, ptl);
> +not_found:
> + vma_pgtable_walk_end(vma);
> + return NULL;
> +found:
> + if (expose_page)
> + /* Note: Offset from the mapped page, not the folio start. */
> + fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
> + else
> + fw->page = NULL;
> + fw->ptl = ptl;
> + return page_folio(page);
> +}
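
For readers following along: folio_walk_end() itself was trimmed from the
quote above. Going by the commit message (a macro for now, because the
compiler does not like the pte_unmap()->kunmap_local() chain otherwise),
I'd expect it to look roughly like the sketch below; this is my guess at
the shape, not the actual patch content:

	#define folio_walk_end(__fw, __vma) do { \
		/* drop the PTL taken by folio_walk_start() */ \
		spin_unlock((__fw)->ptl); \
		/* only the PTE level has a kmap to undo */ \
		if ((__fw)->level == FW_LEVEL_PTE) \
			pte_unmap((__fw)->ptep); \
		vma_pgtable_walk_end(__vma); \
	} while (0)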