Message-ID: <277aa125e8edaf55e82ca66a15b26eee6ba3320b.camel@linux.ibm.com>
Date: Wed, 21 May 2025 16:55:18 +0200
From: Nina Schoetterl-Glausch <nsg@...ux.ibm.com>
To: Claudio Imbrenda <imbrenda@...ux.ibm.com>, linux-kernel@...r.kernel.org
Cc: kvm@...r.kernel.org, linux-s390@...r.kernel.org, frankja@...ux.ibm.com,
borntraeger@...ibm.com, seiden@...ux.ibm.com, nrb@...ux.ibm.com,
david@...hat.com, hca@...ux.ibm.com, agordeev@...ux.ibm.com,
svens@...ux.ibm.com, gor@...ux.ibm.com
Subject: Re: [PATCH v1 4/5] KVM: s390: refactor and split some gmap helpers
On Wed, 2025-05-14 at 18:38 +0200, Claudio Imbrenda wrote:
> Refactor some gmap functions; move the implementation into a separate
> file with only helper functions. The new helper functions work on vm
> addresses, leaving all gmap logic in the gmap functions, which mostly
> become just wrappers.
>
> The whole gmap handling is going to be moved inside KVM soon, but the
> helper functions need to touch core mm functions, and thus need to
> stay in the core kernel.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@...ux.ibm.com>
> ---
> MAINTAINERS | 2 +
> arch/s390/include/asm/gmap_helpers.h | 18 ++
> arch/s390/kvm/diag.c | 11 +-
> arch/s390/kvm/kvm-s390.c | 3 +-
> arch/s390/mm/Makefile | 2 +
> arch/s390/mm/gmap.c | 46 ++---
> arch/s390/mm/gmap_helpers.c | 266 +++++++++++++++++++++++++++
> 7 files changed, 307 insertions(+), 41 deletions(-)
> create mode 100644 arch/s390/include/asm/gmap_helpers.h
> create mode 100644 arch/s390/mm/gmap_helpers.c
>
[...]
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 4869555ff403..17a2a57de3a2 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
>
[...]
> void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
> {
> - unsigned long gaddr, vmaddr, size;
> - struct vm_area_struct *vma;
> + unsigned long vmaddr, next, start, end;
>
> mmap_read_lock(gmap->mm);
> - for (gaddr = from; gaddr < to;
> - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
> - /* Find the vm address for the guest address */
> - vmaddr = (unsigned long)
> - radix_tree_lookup(&gmap->guest_to_host,
> - gaddr >> PMD_SHIFT);
> + for ( ; from < to; from = next) {
> + next = ALIGN(from + 1, PMD_SIZE);
I think you can use pmd_addr_end(from, to) here...
> + vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
> if (!vmaddr)
> continue;
> - vmaddr |= gaddr & ~PMD_MASK;
> - /* Find vma in the parent mm */
> - vma = find_vma(gmap->mm, vmaddr);
> - if (!vma)
> - continue;
> - /*
> - * We do not discard pages that are backed by
> - * hugetlbfs, so we don't have to refault them.
> - */
> - if (is_vm_hugetlb_page(vma))
> - continue;
> - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
> - zap_page_range_single(vma, vmaddr, size, NULL);
> + start = vmaddr | (from & ~PMD_MASK);
> + end = (vmaddr | (min(to - 1, next - 1) & ~PMD_MASK)) + 1;
... then simply do:
end = vmaddr + (next - from);
(see below the hunk for a full sketch of the loop)
> + __gmap_helper_discard(gmap->mm, start, end);
> }
> mmap_read_unlock(gmap->mm);
> }
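Putting the two suggestions together, the loop could look roughly like this
(untested sketch; it folds the intra-segment offset into vmaddr, like the old
code did, so that end = vmaddr + (next - from) works out):

	for ( ; from < to; from = next) {
		next = pmd_addr_end(from, to);
		vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		/* host address corresponding to the guest address from */
		vmaddr |= from & ~PMD_MASK;
		/* discard up to the end of this segment, or up to to */
		end = vmaddr + (next - from);
		__gmap_helper_discard(gmap->mm, vmaddr, end);
	}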
> diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
> new file mode 100644
> index 000000000000..8e66586718f6
> --- /dev/null
> +++ b/arch/s390/mm/gmap_helpers.c
> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Helper functions for KVM guest address space mapping code
> + *
> + * Copyright IBM Corp. 2007, 2025
> + * Author(s): Claudio Imbrenda <imbrenda@...ux.ibm.com>
> + * Martin Schwidefsky <schwidefsky@...ibm.com>
> + * David Hildenbrand <david@...hat.com>
> + * Janosch Frank <frankja@...ux.vnet.ibm.com>
> + */
> +#include <linux/mm_types.h>
> +#include <linux/mmap_lock.h>
> +#include <linux/mm.h>
> +#include <linux/hugetlb.h>
> +#include <linux/pagewalk.h>
> +#include <linux/ksm.h>
> +#include <asm/gmap_helpers.h>
Please add documentation to all these functions, for those of us who
don't live and breathe mm code :)
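E.g. something along these lines for __gmap_helper_zap_one() (just a rough
sketch, adjust the wording as you see fit):

	/**
	 * __gmap_helper_zap_one() - discard a single page at a userspace address
	 * @mm: the mm_struct of the userspace process
	 * @vmaddr: the userspace virtual address of the page
	 *
	 * If the page at @vmaddr is currently swapped out, free the swap entry
	 * (and swap cache), so the page gets discarded instead of being faulted
	 * back in later.  Hugetlb mappings are skipped.
	 *
	 * Context: the mmap lock of @mm must be held.
	 */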
> +
> +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
> +{
> + if (!non_swap_entry(entry))
> + dec_mm_counter(mm, MM_SWAPENTS);
> + else if (is_migration_entry(entry))
> + dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
> + free_swap_and_cache(entry);
> +}
> +
> +void __gmap_helper_zap_one(struct mm_struct *mm, unsigned long vmaddr)
__gmap_helper_zap_mapping_pte ?
> +{
> + struct vm_area_struct *vma;
> + spinlock_t *ptl;
> + pte_t *ptep;
> +
> + mmap_assert_locked(mm);
> +
> + /* Find the vm address for the guest address */
> + vma = vma_lookup(mm, vmaddr);
> + if (!vma || is_vm_hugetlb_page(vma))
> + return;
> +
> + /* Get pointer to the page table entry */
> + ptep = get_locked_pte(mm, vmaddr, &ptl);
> + if (!likely(ptep))
if (unlikely(!ptep)) reads nicer to me.
> + return;
> + if (pte_swap(*ptep))
> + ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
> + pte_unmap_unlock(ptep, ptl);
> +}
> +EXPORT_SYMBOL_GPL(__gmap_helper_zap_one);
Looks reasonable, but I'm not well versed enough in mm code to evaluate
that with confidence.
> +
> +void __gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
Maybe call this gmap_helper_discard_nolock or something.
> +{
> + struct vm_area_struct *vma;
> + unsigned long next;
> +
> + mmap_assert_locked(mm);
> +
> + while (vmaddr < end) {
> + vma = find_vma_intersection(mm, vmaddr, end);
> + if (!vma)
> + break;
> + vmaddr = max(vmaddr, vma->vm_start);
> + next = min(end, vma->vm_end);
> + if (!is_vm_hugetlb_page(vma))
> + zap_page_range_single(vma, vmaddr, next - vmaddr, NULL);
> + vmaddr = next;
> + }
> +}
> +
> +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> +{
> + mmap_read_lock(mm);
> + __gmap_helper_discard(mm, vmaddr, end);
> + mmap_read_unlock(mm);
> +}
> +EXPORT_SYMBOL_GPL(gmap_helper_discard);
> +
> +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
> +{
> + struct vm_area_struct *vma;
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> +
> + /* We need a valid VMA, otherwise this is clearly a fault. */
> + vma = vma_lookup(mm, addr);
> + if (!vma)
> + return -EFAULT;
> +
> + pgd = pgd_offset(mm, addr);
> + if (!pgd_present(*pgd))
> + return -ENOENT;
What about pgd_bad?
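I.e. maybe something like this (untested, and I'm not sure whether -ENOENT or
-EFAULT fits a bad entry better):

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd) || pgd_bad(*pgd))
		return -ENOENT;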
> +
> + p4d = p4d_offset(pgd, addr);
> + if (!p4d_present(*p4d))
> + return -ENOENT;
> +
> + pud = pud_offset(p4d, addr);
> + if (!pud_present(*pud))
> + return -ENOENT;
> +
> + /* Large PUDs are not supported yet. */
> + if (pud_leaf(*pud))
> + return -EFAULT;
> +
> + *pmdp = pmd_offset(pud, addr);
> + return 0;
> +}
> +
> +void __gmap_helper_set_unused(struct mm_struct *mm, unsigned long vmaddr)
What is this function for? Why do you introduce it now?
> +{
> + spinlock_t *ptl;
> + pmd_t *pmdp;
> + pte_t *ptep;
> +
> + mmap_assert_locked(mm);
> +
> + if (pmd_lookup(mm, vmaddr, &pmdp))
> + return;
> + ptl = pmd_lock(mm, pmdp);
> + if (!pmd_present(*pmdp) || pmd_leaf(*pmdp)) {
> + spin_unlock(ptl);
> + return;
> + }
> + spin_unlock(ptl);
> +
> + ptep = pte_offset_map_lock(mm, pmdp, vmaddr, &ptl);
> + if (!ptep)
> + return;
> + /* The last byte of a pte can be changed freely without ipte */
> + __atomic64_or(_PAGE_UNUSED, (long *)ptep);
> + pte_unmap_unlock(ptep, ptl);
> +}
> +EXPORT_SYMBOL_GPL(__gmap_helper_set_unused);
The stuff below is from arch/s390/mm/gmap.c right?
Are you going to delete it from there?
> +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
> + unsigned long end, struct mm_walk *walk)
> +{
[...]
> +}
> +
> +static const struct mm_walk_ops find_zeropage_ops = {
> + .pte_entry = find_zeropage_pte_entry,
> + .walk_lock = PGWALK_WRLOCK,
> +};
> +
> +/*
> + * Unshare all shared zeropages, replacing them by anonymous pages. Note that
> + * we cannot simply zap all shared zeropages, because this could later
> + * trigger unexpected userfaultfd missing events.
> + *
> + * This must be called after mm->context.allow_cow_sharing was
> + * set to 0, to avoid future mappings of shared zeropages.
> + *
> + * mm contracts with s390, that even if mm were to remove a page table,
> + * and racing with walk_page_range_vma() calling pte_offset_map_lock()
> + * would fail, it will never insert a page table containing empty zero
> + * pages once mm_forbids_zeropage(mm) i.e.
> + * mm->context.allow_cow_sharing is set to 0.
> + */
> +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
> +{
[...]
> +}
> +
> +static int __gmap_helper_disable_cow_sharing(struct mm_struct *mm)
> +{
[...]
> +}
> +
> +/*
> + * Disable most COW-sharing of memory pages for the whole process:
> + * (1) Disable KSM and unmerge/unshare any KSM pages.
> + * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
> + *
> + * Note that we currently don't bother with COW-shared pages that are shared
> + * with parent/child processes due to fork().
> + */
> +int gmap_helper_disable_cow_sharing(void)
> +{
[...]
> +}
> +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
--
IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294