[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250521171930.2edaaa8a@p-imbrenda>
Date: Wed, 21 May 2025 17:19:30 +0200
From: Claudio Imbrenda <imbrenda@...ux.ibm.com>
To: Nina Schoetterl-Glausch <nsg@...ux.ibm.com>
Cc: linux-kernel@...r.kernel.org, kvm@...r.kernel.org,
linux-s390@...r.kernel.org, frankja@...ux.ibm.com,
borntraeger@...ibm.com, seiden@...ux.ibm.com, nrb@...ux.ibm.com,
david@...hat.com, hca@...ux.ibm.com, agordeev@...ux.ibm.com,
svens@...ux.ibm.com, gor@...ux.ibm.com
Subject: Re: [PATCH v1 4/5] KVM: s390: refactor and split some gmap helpers
On Wed, 21 May 2025 16:55:18 +0200
Nina Schoetterl-Glausch <nsg@...ux.ibm.com> wrote:
> On Wed, 2025-05-14 at 18:38 +0200, Claudio Imbrenda wrote:
> > Refactor some gmap functions; move the implementation into a separate
> > file with only helper functions. The new helper functions work on vm
> > addresses, leaving all gmap logic in the gmap functions, which mostly
> > become just wrappers.
> >
> > The whole gmap handling is going to be moved inside KVM soon, but the
> > helper functions need to touch core mm functions, and thus need to
> > stay in the core of kernel.
> >
> > Signed-off-by: Claudio Imbrenda <imbrenda@...ux.ibm.com>
> > ---
> > MAINTAINERS | 2 +
> > arch/s390/include/asm/gmap_helpers.h | 18 ++
> > arch/s390/kvm/diag.c | 11 +-
> > arch/s390/kvm/kvm-s390.c | 3 +-
> > arch/s390/mm/Makefile | 2 +
> > arch/s390/mm/gmap.c | 46 ++---
> > arch/s390/mm/gmap_helpers.c | 266 +++++++++++++++++++++++++++
> > 7 files changed, 307 insertions(+), 41 deletions(-)
> > create mode 100644 arch/s390/include/asm/gmap_helpers.h
> > create mode 100644 arch/s390/mm/gmap_helpers.c
> >
> [...]
>
> > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> > index 4869555ff403..17a2a57de3a2 100644
> > --- a/arch/s390/mm/gmap.c
> > +++ b/arch/s390/mm/gmap.c
> >
> [...]
>
> > void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
> > {
> > - unsigned long gaddr, vmaddr, size;
> > - struct vm_area_struct *vma;
> > + unsigned long vmaddr, next, start, end;
> >
> > mmap_read_lock(gmap->mm);
> > - for (gaddr = from; gaddr < to;
> > - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
> > - /* Find the vm address for the guest address */
> > - vmaddr = (unsigned long)
> > - radix_tree_lookup(&gmap->guest_to_host,
> > - gaddr >> PMD_SHIFT);
> > + for ( ; from < to; from = next) {
> > + next = ALIGN(from + 1, PMD_SIZE);
>
> I think you can use pmd_addr_end(from, to) here...
I guess
>
> > + vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
> > if (!vmaddr)
> > continue;
> > - vmaddr |= gaddr & ~PMD_MASK;
> > - /* Find vma in the parent mm */
> > - vma = find_vma(gmap->mm, vmaddr);
> > - if (!vma)
> > - continue;
> > - /*
> > - * We do not discard pages that are backed by
> > - * hugetlbfs, so we don't have to refault them.
> > - */
> > - if (is_vm_hugetlb_page(vma))
> > - continue;
> > - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
> > - zap_page_range_single(vma, vmaddr, size, NULL);
> > + start = vmaddr | (from & ~PMD_MASK);
> > + end = (vmaddr | (min(to - 1, next - 1) & ~PMD_MASK)) + 1;
>
> ... then simply do:
> end = vmaddr + (next - from);
looks cleaner, yes
>
> > + __gmap_helper_discard(gmap->mm, start, end);
> > }
> > mmap_read_unlock(gmap->mm);
> > }
> > diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
> > new file mode 100644
> > index 000000000000..8e66586718f6
> > --- /dev/null
> > +++ b/arch/s390/mm/gmap_helpers.c
> > @@ -0,0 +1,266 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Helper functions for KVM guest address space mapping code
> > + *
> > + * Copyright IBM Corp. 2007, 2025
> > + * Author(s): Claudio Imbrenda <imbrenda@...ux.ibm.com>
> > + * Martin Schwidefsky <schwidefsky@...ibm.com>
> > + * David Hildenbrand <david@...hat.com>
> > + * Janosch Frank <frankja@...ux.vnet.ibm.com>
> > + */
> > +#include <linux/mm_types.h>
> > +#include <linux/mmap_lock.h>
> > +#include <linux/mm.h>
> > +#include <linux/hugetlb.h>
> > +#include <linux/pagewalk.h>
> > +#include <linux/ksm.h>
> > +#include <asm/gmap_helpers.h>
>
> Please add documentation to all these functions for those of us that
> don't live and breath mm code :)
eh... yes I think it's a good idea
>
> > +
> > +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
> > +{
> > + if (!non_swap_entry(entry))
> > + dec_mm_counter(mm, MM_SWAPENTS);
> > + else if (is_migration_entry(entry))
> > + dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
> > + free_swap_and_cache(entry);
> > +}
> > +
> > +void __gmap_helper_zap_one(struct mm_struct *mm, unsigned long vmaddr)
>
> __gmap_helper_zap_mapping_pte ?
but I'm not taking a pte as parameter
>
> > +{
> > + struct vm_area_struct *vma;
> > + spinlock_t *ptl;
> > + pte_t *ptep;
> > +
> > + mmap_assert_locked(mm);
> > +
> > + /* Find the vm address for the guest address */
> > + vma = vma_lookup(mm, vmaddr);
> > + if (!vma || is_vm_hugetlb_page(vma))
> > + return;
> > +
> > + /* Get pointer to the page table entry */
> > + ptep = get_locked_pte(mm, vmaddr, &ptl);
> > + if (!likely(ptep))
>
> if (unlikely(!ptep)) reads nicer to me.
ok
>
> > + return;
> > + if (pte_swap(*ptep))
> > + ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
> > + pte_unmap_unlock(ptep, ptl);
> > +}
> > +EXPORT_SYMBOL_GPL(__gmap_helper_zap_one);
>
> Looks reasonable, but I'm not well versed enough in mm code to evaluate
> that with confidence.
>
> > +
> > +void __gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
>
> Maybe call this gmap_helper_discard_nolock or something.
maybe __gmap_helper_discard_unlocked?
the __ prefix often implies lack of locking
>
> > +{
> > + struct vm_area_struct *vma;
> > + unsigned long next;
> > +
> > + mmap_assert_locked(mm);
> > +
> > + while (vmaddr < end) {
> > + vma = find_vma_intersection(mm, vmaddr, end);
> > + if (!vma)
> > + break;
> > + vmaddr = max(vmaddr, vma->vm_start);
> > + next = min(end, vma->vm_end);
> > + if (!is_vm_hugetlb_page(vma))
> > + zap_page_range_single(vma, vmaddr, next - vmaddr, NULL);
> > + vmaddr = next;
> > + }
> > +}
> > +
> > +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> > +{
> > + mmap_read_lock(mm);
> > + __gmap_helper_discard(mm, vmaddr, end);
> > + mmap_read_unlock(mm);
> > +}
> > +EXPORT_SYMBOL_GPL(gmap_helper_discard);
> > +
> > +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
> > +{
> > + struct vm_area_struct *vma;
> > + pgd_t *pgd;
> > + p4d_t *p4d;
> > + pud_t *pud;
> > +
> > + /* We need a valid VMA, otherwise this is clearly a fault. */
> > + vma = vma_lookup(mm, addr);
> > + if (!vma)
> > + return -EFAULT;
> > +
> > + pgd = pgd_offset(mm, addr);
> > + if (!pgd_present(*pgd))
> > + return -ENOENT;
>
> What about pgd_bad?
I don't know, this code is copied verbatim from mm/gmap.c
>
> > +
> > + p4d = p4d_offset(pgd, addr);
> > + if (!p4d_present(*p4d))
> > + return -ENOENT;
> > +
> > + pud = pud_offset(p4d, addr);
> > + if (!pud_present(*pud))
> > + return -ENOENT;
> > +
> > + /* Large PUDs are not supported yet. */
> > + if (pud_leaf(*pud))
> > + return -EFAULT;
> > +
> > + *pmdp = pmd_offset(pud, addr);
> > + return 0;
> > +}
> > +
> > +void __gmap_helper_set_unused(struct mm_struct *mm, unsigned long vmaddr)
>
> What is this function for? Why do you introduce it now?
this is for cmma handling, I'm introducing it now because it needs to
be in this file and I would like to put all the stuff in at once.
I will not need to touch this file anymore if I get this in now.
>
> > +{
> > + spinlock_t *ptl;
> > + pmd_t *pmdp;
> > + pte_t *ptep;
> > +
> > + mmap_assert_locked(mm);
> > +
> > + if (pmd_lookup(mm, vmaddr, &pmdp))
> > + return;
> > + ptl = pmd_lock(mm, pmdp);
> > + if (!pmd_present(*pmdp) || pmd_leaf(*pmdp)) {
> > + spin_unlock(ptl);
> > + return;
> > + }
> > + spin_unlock(ptl);
> > +
> > + ptep = pte_offset_map_lock(mm, pmdp, vmaddr, &ptl);
> > + if (!ptep)
> > + return;
> > + /* The last byte of a pte can be changed freely without ipte */
> > + __atomic64_or(_PAGE_UNUSED, (long *)ptep);
> > + pte_unmap_unlock(ptep, ptl);
> > +}
> > +EXPORT_SYMBOL_GPL(__gmap_helper_set_unused);
>
> The stuff below is from arch/s390/mm/gmap.c right?
> Are you going to delete it from there?
not in this series, but the next series will remove mm/gmap.c altogether
>
> > +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
> > + unsigned long end, struct mm_walk *walk)
> > +{
>
> [...]
>
> > +}
> > +
> > +static const struct mm_walk_ops find_zeropage_ops = {
> > + .pte_entry = find_zeropage_pte_entry,
> > + .walk_lock = PGWALK_WRLOCK,
> > +};
> > +
> > +/*
> > + * Unshare all shared zeropages, replacing them by anonymous pages. Note that
> > + * we cannot simply zap all shared zeropages, because this could later
> > + * trigger unexpected userfaultfd missing events.
> > + *
> > + * This must be called after mm->context.allow_cow_sharing was
> > + * set to 0, to avoid future mappings of shared zeropages.
> > + *
> > + * mm contracts with s390, that even if mm were to remove a page table,
> > + * and racing with walk_page_range_vma() calling pte_offset_map_lock()
> > + * would fail, it will never insert a page table containing empty zero
> > + * pages once mm_forbids_zeropage(mm) i.e.
> > + * mm->context.allow_cow_sharing is set to 0.
> > + */
> > +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
> > +{
>
> [...]
>
> > +}
> > +
> > +static int __gmap_helper_disable_cow_sharing(struct mm_struct *mm)
> > +{
>
> [...]
>
> > +}
> > +
> > +/*
> > + * Disable most COW-sharing of memory pages for the whole process:
> > + * (1) Disable KSM and unmerge/unshare any KSM pages.
> > + * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
> > + *
> > + * Not that we currently don't bother with COW-shared pages that are shared
> > + * with parent/child processes due to fork().
> > + */
> > +int gmap_helper_disable_cow_sharing(void)
> > +{
>
> [...]
>
> > +}
> > +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
>
Powered by blists - more mailing lists