Message-ID: <20251106161117.350395-5-imbrenda@linux.ibm.com>
Date: Thu, 6 Nov 2025 17:10:58 +0100
From: Claudio Imbrenda <imbrenda@...ux.ibm.com>
To: kvm@...r.kernel.org
Cc: linux-kernel@...r.kernel.org, linux-s390@...r.kernel.org,
borntraeger@...ibm.com, frankja@...ux.ibm.com, nsg@...ux.ibm.com,
nrb@...ux.ibm.com, seiden@...ux.ibm.com, schlameuss@...ux.ibm.com,
hca@...ux.ibm.com, svens@...ux.ibm.com, agordeev@...ux.ibm.com,
gor@...ux.ibm.com, david@...hat.com, gerald.schaefer@...ux.ibm.com
Subject: [PATCH v3 04/23] KVM: s390: Add gmap_helper_try_set_pte_unused()

Add gmap_helper_try_set_pte_unused() to mark userspace ptes as unused.
Core mm code will use that information to discard unused pages instead
of attempting to swap them out.
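
For context, the consumer side boils down to a check like the one
sketched below; this is a condensed illustration of the pte_unused()
handling in try_to_unmap_one() in mm/rmap.c, not the literal upstream
code (pteval, vma, mm and folio stand for the usual reclaim-path
variables):

  if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
          /* Page was marked unused by the guest: drop it, no swap I/O. */
          dec_mm_counter(mm, mm_counter(folio));
  } else {
          /* Normal reclaim: allocate a swap entry and write the page out. */
  }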
Signed-off-by: Claudio Imbrenda <imbrenda@...ux.ibm.com>
Reviewed-by: Nico Boehr <nrb@...ux.ibm.com>
Tested-by: Nico Boehr <nrb@...ux.ibm.com>
---
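Usage note for reviewers: the helper is expected to be called from an
unmap path, per the kernel-doc below. A minimal, hypothetical caller
(not part of this patch; the function name here is invented for
illustration) could look like this, taking the mmap lock to satisfy
the documented context:

  #include <linux/mm.h>
  #include <asm/gmap_helpers.h>

  /* Hypothetical example only: mark every pte in [vmaddr, end) as unused. */
  static void example_mark_range_unused(struct mm_struct *mm,
                                        unsigned long vmaddr,
                                        unsigned long end)
  {
          mmap_read_lock(mm);
          for (; vmaddr < end; vmaddr += PAGE_SIZE)
                  /* Best effort: silently skips a pte if the ptl is contended. */
                  gmap_helper_try_set_pte_unused(mm, vmaddr);
          mmap_read_unlock(mm);
  }
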
 arch/s390/include/asm/gmap_helpers.h |  1 +
 arch/s390/mm/gmap_helpers.c          | 79 ++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+)
diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h
index 5356446a61c4..2d3ae421077e 100644
--- a/arch/s390/include/asm/gmap_helpers.h
+++ b/arch/s390/include/asm/gmap_helpers.h
@@ -11,5 +11,6 @@
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr);
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end);
int gmap_helper_disable_cow_sharing(void);
+void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr);
#endif /* _ASM_S390_GMAP_HELPERS_H */
diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
index e14a63119e30..dca783859a73 100644
--- a/arch/s390/mm/gmap_helpers.c
+++ b/arch/s390/mm/gmap_helpers.c
@@ -124,6 +124,85 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo
}
EXPORT_SYMBOL_GPL(gmap_helper_discard);
+/**
+ * gmap_helper_try_set_pte_unused() - mark a pte entry as unused
+ * @mm: the mm
+ * @vmaddr: the userspace address whose pte is to be marked
+ *
+ * Mark the pte corresponding to the given address as unused. This will cause
+ * core mm code to just drop this page instead of swapping it.
+ *
+ * This function needs to be called with interrupts disabled (for example
+ * while holding a spinlock), or while holding the mmap lock. Normally this
+ * function is called as a result of an unmap operation, and thus KVM common
+ * code will already hold kvm->mmu_lock in write mode.
+ *
+ * Context: Needs to be called while holding the mmap lock or with interrupts
+ * disabled.
+ */
+void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
+{
+ pmd_t *pmdp, pmd, pmdval;
+ pud_t *pudp, pud;
+ p4d_t *p4dp, p4d;
+ pgd_t *pgdp, pgd;
+ spinlock_t *ptl; /* Lock for the host (userspace) page table */
+ pte_t *ptep;
+
+ pgdp = pgd_offset(mm, vmaddr);
+ pgd = pgdp_get(pgdp);
+ if (pgd_none(pgd) || !pgd_present(pgd))
+ return;
+
+ p4dp = p4d_offset(pgdp, vmaddr);
+ p4d = p4dp_get(p4dp);
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ return;
+
+ pudp = pud_offset(p4dp, vmaddr);
+ pud = pudp_get(pudp);
+ if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
+ return;
+
+ pmdp = pmd_offset(pudp, vmaddr);
+ pmd = pmdp_get_lockless(pmdp);
+ if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
+ return;
+
+ ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
+ if (!ptep)
+ return;
+
+ /*
+ * Several paths exist that take the ptl lock and then call the
+ * mmu_notifier, which takes the mmu_lock. The unmap path, instead,
+ * takes the mmu_lock in write mode first, and then potentially
+ * calls this function, which takes the ptl lock. This can lead to a
+ * deadlock.
+ * The unused page mechanism is only an optimization: if the
+ * _PAGE_UNUSED bit is not set, the page is swapped out as normal
+ * instead of being discarded.
+ * If the lock is contended, the bit is simply not set and the
+ * deadlock is avoided.
+ */
+ if (spin_trylock(ptl)) {
+ /*
+ * Make sure the pte we are touching is still the correct
+ * one. In theory this check should not be needed, but
+ * better safe than sorry.
+ * Disabling interrupts or holding the mmap lock is enough to
+ * guarantee that no concurrent updates to the page tables
+ * are possible.
+ */
+ if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
+ __atomic64_or(_PAGE_UNUSED, (long *)ptep);
+ spin_unlock(ptl);
+ }
+
+ pte_unmap(ptep);
+}
+EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);
+
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
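
One more note on the spin_trylock() above: it is the usual way to break
an ABBA ordering where one path takes lock A then B while another takes
B then A. A self-contained userspace illustration of that pattern
(plain pthreads, names invented for the example; obviously not kernel
code):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;      /* stands in for the page table lock */
  static pthread_mutex_t mmu_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for kvm->mmu_lock */

  /* Notifier-like path: ptl first, then mmu_lock. */
  static void *notifier_path(void *arg)
  {
          pthread_mutex_lock(&ptl);
          pthread_mutex_lock(&mmu_lock);
          /* ... invalidate ... */
          pthread_mutex_unlock(&mmu_lock);
          pthread_mutex_unlock(&ptl);
          return NULL;
  }

  /* Unmap-like path: mmu_lock first, then only *try* to get the ptl. */
  static void *unmap_path(void *arg)
  {
          pthread_mutex_lock(&mmu_lock);
          if (pthread_mutex_trylock(&ptl) == 0) {
                  /* ... set the "unused" hint ... */
                  pthread_mutex_unlock(&ptl);
          } /* else: skip the optimization, no deadlock possible */
          pthread_mutex_unlock(&mmu_lock);
          return NULL;
  }

  int main(void)
  {
          pthread_t a, b;

          pthread_create(&a, NULL, notifier_path, NULL);
          pthread_create(&b, NULL, unmap_path, NULL);
          pthread_join(a, NULL);
          pthread_join(b, NULL);
          puts("no deadlock");
          return 0;
  }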
--
2.51.1