Message-ID: <Z5tQL60SNNGCkfQR@phenom.ffwll.local>
Date: Thu, 30 Jan 2025 11:10:55 +0100
From: Simona Vetter <simona.vetter@...ll.ch>
To: David Hildenbrand <david@...hat.com>
Cc: linux-kernel@...r.kernel.org, linux-doc@...r.kernel.org,
dri-devel@...ts.freedesktop.org, linux-mm@...ck.org,
nouveau@...ts.freedesktop.org,
Andrew Morton <akpm@...ux-foundation.org>,
Jérôme Glisse <jglisse@...hat.com>,
Jonathan Corbet <corbet@....net>, Alex Shi <alexs@...nel.org>,
Yanteng Si <si.yanteng@...ux.dev>,
Karol Herbst <kherbst@...hat.com>, Lyude Paul <lyude@...hat.com>,
Danilo Krummrich <dakr@...nel.org>,
David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>,
"Liam R. Howlett" <Liam.Howlett@...cle.com>,
Lorenzo Stoakes <lorenzo.stoakes@...cle.com>,
Vlastimil Babka <vbabka@...e.cz>, Jann Horn <jannh@...gle.com>,
Pasha Tatashin <pasha.tatashin@...een.com>,
Peter Xu <peterx@...hat.com>, Alistair Popple <apopple@...dia.com>,
Jason Gunthorpe <jgg@...dia.com>
Subject: Re: [PATCH v1 08/12] mm/rmap: handle device-exclusive entries
correctly in try_to_unmap_one()
On Wed, Jan 29, 2025 at 12:54:06PM +0100, David Hildenbrand wrote:
> Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
> we can return with a device-exclusive entry from page_vma_mapped_walk().
>
> try_to_unmap_one() is not prepared for that, so teach it about these
> non-present nonswap PTEs.
>
> Before that, could we also have triggered this case with device-private
> entries? Unlikely.
Just a quick comment on this; I'm still pondering all the other aspects.

device-private memory is entirely owned by the driver; the core mm isn't
supposed to touch it beyond migrating it back to system memory in
do_swap_page, plus using rmap when the driver asks for mappings to be
invalidated as needed.
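
Roughly, the only fault-path interaction is that do_swap_page() hands the
entry straight back to the owning driver. A heavily simplified sketch (not
the actual do_swap_page() code; locking, refcounting and error handling are
omitted, and the helper name here is made up):

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/swapops.h>

/*
 * Made-up helper, roughly what the device-private branch of do_swap_page()
 * boils down to.
 */
static vm_fault_t handle_device_private_fault(struct vm_fault *vmf)
{
	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
	struct page *page;

	if (!is_device_private_entry(entry))
		return 0;

	/* The page lives in device memory; the CPU cannot map it. */
	page = pfn_swap_entry_to_page(entry);

	/*
	 * Hand it back to the driver, which migrates it to system memory
	 * and retries the fault. Nothing else in core mm touches it.
	 */
	return page->pgmap->ops->migrate_to_ram(vmf);
}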
So no lru, thp, migration or anything else initiated by core mm should ever
happen on these device-private pages. If it does, it'd be a bug.

device-exclusive is entirely different of course, since that's just normal
system memory managed by core mm/.
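
For reference, the two kinds of non-present entries can be told apart at the
pte level with the swapops.h helpers; rough illustration only, the function
itself is made up and not part of this patch:

#include <linux/mm.h>
#include <linux/swapops.h>

/* Made-up illustration of the pte-level distinction, not part of the patch. */
static void classify_nonpresent_pte(pte_t pteval)
{
	swp_entry_t entry;

	if (!is_swap_pte(pteval))
		return;	/* present mapping or pte_none() */

	entry = pte_to_swp_entry(pteval);
	if (is_device_exclusive_entry(entry)) {
		/*
		 * Non-present, non-swap entry that still "maps" ordinary
		 * system memory owned by core mm; pfn_swap_entry_to_page()
		 * gives back that page. This is the case try_to_unmap_one()
		 * below now learns to handle.
		 */
		struct page *page = pfn_swap_entry_to_page(entry);

		pr_debug("device-exclusive entry for pfn %lx\n",
			 page_to_pfn(page));
	} else if (is_device_private_entry(entry)) {
		/* Driver-owned device memory; see the sketch above. */
		pr_debug("device-private entry\n");
	}
}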
-Sima
>
> Note that we could currently only run into this case with
> device-exclusive entries on THPs. For order-0 folios, we still adjust
> the mapcount on conversion to device-exclusive, making the rmap walk
> abort early (folio_mapcount() == 0 and breaking swapout). We'll fix
> that next, now that try_to_unmap_one() can handle it.
>
> Further note that try_to_unmap() calls MMU notifiers and holds the
> folio lock, so any device-exclusive users should be properly prepared
> for this device-exclusive PTE to "vanish".
>
> Fixes: b756a3b5e7ea ("mm: device exclusive memory access")
> Signed-off-by: David Hildenbrand <david@...hat.com>
> ---
> mm/rmap.c | 53 ++++++++++++++++++++++++++++++++++++++++-------------
> 1 file changed, 40 insertions(+), 13 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 65d9bbea16d0..12900f367a2a 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1648,9 +1648,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> {
> struct mm_struct *mm = vma->vm_mm;
> DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> + bool anon_exclusive, ret = true;
> pte_t pteval;
> struct page *subpage;
> - bool anon_exclusive, ret = true;
> struct mmu_notifier_range range;
> enum ttu_flags flags = (enum ttu_flags)(long)arg;
> unsigned long pfn;
> @@ -1722,7 +1722,19 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> /* Unexpected PMD-mapped THP? */
> VM_BUG_ON_FOLIO(!pvmw.pte, folio);
>
> - pfn = pte_pfn(ptep_get(pvmw.pte));
> + /*
> + * We can end up here with selected non-swap entries that
> + * actually map pages similar to PROT_NONE; see
> + * page_vma_mapped_walk()->check_pte().
> + */
> + pteval = ptep_get(pvmw.pte);
> + if (likely(pte_present(pteval))) {
> + pfn = pte_pfn(pteval);
> + } else {
> + pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
> + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
> + }
> +
> subpage = folio_page(folio, pfn - folio_pfn(folio));
> address = pvmw.address;
> anon_exclusive = folio_test_anon(folio) &&
> @@ -1778,7 +1790,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> hugetlb_vma_unlock_write(vma);
> }
> pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
> - } else {
> + if (pte_dirty(pteval))
> + folio_mark_dirty(folio);
> + } else if (likely(pte_present(pteval))) {
> flush_cache_page(vma, address, pfn);
> /* Nuke the page table entry. */
> if (should_defer_flush(mm, flags)) {
> @@ -1796,6 +1810,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> } else {
> pteval = ptep_clear_flush(vma, address, pvmw.pte);
> }
> + if (pte_dirty(pteval))
> + folio_mark_dirty(folio);
> + } else {
> + pte_clear(mm, address, pvmw.pte);
> }
>
> /*
> @@ -1805,10 +1823,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> */
> pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
>
> - /* Set the dirty flag on the folio now the pte is gone. */
> - if (pte_dirty(pteval))
> - folio_mark_dirty(folio);
> -
> /* Update high watermark before we lower rss */
> update_hiwater_rss(mm);
>
> @@ -1822,8 +1836,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> dec_mm_counter(mm, mm_counter(folio));
> set_pte_at(mm, address, pvmw.pte, pteval);
> }
> -
> - } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
> + } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
> + !userfaultfd_armed(vma)) {
> /*
> * The guest indicated that the page content is of no
> * interest anymore. Simply discard the pte, vmscan
> @@ -1902,6 +1916,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> set_pte_at(mm, address, pvmw.pte, pteval);
> goto walk_abort;
> }
> +
> + /*
> + * arch_unmap_one() is expected to be a NOP on
> + * architectures where we could have non-swp entries
> + * here, so we'll not check/care.
> + */
> if (arch_unmap_one(mm, vma, address, pteval) < 0) {
> swap_free(entry);
> set_pte_at(mm, address, pvmw.pte, pteval);
> @@ -1926,10 +1946,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> swp_pte = swp_entry_to_pte(entry);
> if (anon_exclusive)
> swp_pte = pte_swp_mkexclusive(swp_pte);
> - if (pte_soft_dirty(pteval))
> - swp_pte = pte_swp_mksoft_dirty(swp_pte);
> - if (pte_uffd_wp(pteval))
> - swp_pte = pte_swp_mkuffd_wp(swp_pte);
> + if (likely(pte_present(pteval))) {
> + if (pte_soft_dirty(pteval))
> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> + if (pte_uffd_wp(pteval))
> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> + } else {
> + if (pte_swp_soft_dirty(pteval))
> + swp_pte = pte_swp_mksoft_dirty(swp_pte);
> + if (pte_swp_uffd_wp(pteval))
> + swp_pte = pte_swp_mkuffd_wp(swp_pte);
> + }
> set_pte_at(mm, address, pvmw.pte, swp_pte);
> } else {
> /*
> --
> 2.48.1
>
--
Simona Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch