Message-ID: <277aa125e8edaf55e82ca66a15b26eee6ba3320b.camel@linux.ibm.com>
Date: Wed, 21 May 2025 16:55:18 +0200
From: Nina Schoetterl-Glausch <nsg@...ux.ibm.com>
To: Claudio Imbrenda <imbrenda@...ux.ibm.com>, linux-kernel@...r.kernel.org
Cc: kvm@...r.kernel.org, linux-s390@...r.kernel.org, frankja@...ux.ibm.com,
borntraeger@...ibm.com, seiden@...ux.ibm.com, nrb@...ux.ibm.com,
david@...hat.com, hca@...ux.ibm.com, agordeev@...ux.ibm.com,
svens@...ux.ibm.com, gor@...ux.ibm.com
Subject: Re: [PATCH v1 4/5] KVM: s390: refactor and split some gmap helpers
On Wed, 2025-05-14 at 18:38 +0200, Claudio Imbrenda wrote:
> Refactor some gmap functions; move the implementation into a separate
> file with only helper functions. The new helper functions work on vm
> addresses, leaving all gmap logic in the gmap functions, which mostly
> become just wrappers.
>
> The whole gmap handling is going to be moved inside KVM soon, but the
> helper functions need to touch core mm functions, and thus need to
> stay in the core kernel.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@...ux.ibm.com>
> ---
> MAINTAINERS | 2 +
> arch/s390/include/asm/gmap_helpers.h | 18 ++
> arch/s390/kvm/diag.c | 11 +-
> arch/s390/kvm/kvm-s390.c | 3 +-
> arch/s390/mm/Makefile | 2 +
> arch/s390/mm/gmap.c | 46 ++---
> arch/s390/mm/gmap_helpers.c | 266 +++++++++++++++++++++++++++
> 7 files changed, 307 insertions(+), 41 deletions(-)
> create mode 100644 arch/s390/include/asm/gmap_helpers.h
> create mode 100644 arch/s390/mm/gmap_helpers.c
>
[...]
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 4869555ff403..17a2a57de3a2 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
>
[...]
> void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
> {
> - unsigned long gaddr, vmaddr, size;
> - struct vm_area_struct *vma;
> + unsigned long vmaddr, next, start, end;
>
> mmap_read_lock(gmap->mm);
> - for (gaddr = from; gaddr < to;
> - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
> - /* Find the vm address for the guest address */
> - vmaddr = (unsigned long)
> - radix_tree_lookup(&gmap->guest_to_host,
> - gaddr >> PMD_SHIFT);
> + for ( ; from < to; from = next) {
> + next = ALIGN(from + 1, PMD_SIZE);
I think you can use pmd_addr_end(from, to) here...
> + vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
> if (!vmaddr)
> continue;
> - vmaddr |= gaddr & ~PMD_MASK;
> - /* Find vma in the parent mm */
> - vma = find_vma(gmap->mm, vmaddr);
> - if (!vma)
> - continue;
> - /*
> - * We do not discard pages that are backed by
> - * hugetlbfs, so we don't have to refault them.
> - */
> - if (is_vm_hugetlb_page(vma))
> - continue;
> - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
> - zap_page_range_single(vma, vmaddr, size, NULL);
> + start = vmaddr | (from & ~PMD_MASK);
> + end = (vmaddr | (min(to - 1, next - 1) & ~PMD_MASK)) + 1;
... then simply do:
end = vmaddr + (next - from);
(see below the hunk for a full sketch of the loop)
> + __gmap_helper_discard(gmap->mm, start, end);
> }
> mmap_read_unlock(gmap->mm);
> }
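Putting the two suggestions together, the loop could look roughly like this
(untested sketch; it folds the intra-segment offset into vmaddr, like the old
code did, so that end = vmaddr + (next - from) works out):

	for ( ; from < to; from = next) {
		next = pmd_addr_end(from, to);
		vmaddr = (unsigned long)radix_tree_lookup(&gmap->guest_to_host, from >> PMD_SHIFT);
		if (!vmaddr)
			continue;
		/* host address corresponding to the guest address from */
		vmaddr |= from & ~PMD_MASK;
		/* discard up to the end of this segment, or up to to */
		end = vmaddr + (next - from);
		__gmap_helper_discard(gmap->mm, vmaddr, end);
	}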
> diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
> new file mode 100644
> index 000000000000..8e66586718f6
> --- /dev/null
> +++ b/arch/s390/mm/gmap_helpers.c
> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Helper functions for KVM guest address space mapping code
> + *
> + * Copyright IBM Corp. 2007, 2025
> + * Author(s): Claudio Imbrenda <imbrenda@...ux.ibm.com>
> + * Martin Schwidefsky <schwidefsky@...ibm.com>
> + * David Hildenbrand <david@...hat.com>
> + * Janosch Frank <frankja@...ux.vnet.ibm.com>
> + */
> +#include <linux/mm_types.h>
> +#include <linux/mmap_lock.h>
> +#include <linux/mm.h>
> +#include <linux/hugetlb.h>
> +#include <linux/pagewalk.h>
> +#include <linux/ksm.h>
> +#include <asm/gmap_helpers.h>
Please add documentation to all these functions, for those of us who
don't live and breathe mm code :)
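E.g. something along these lines for __gmap_helper_zap_one() (just a rough
sketch, adjust the wording as you see fit):

	/**
	 * __gmap_helper_zap_one() - discard a single page at a userspace address
	 * @mm: the mm_struct of the userspace process
	 * @vmaddr: the userspace virtual address of the page
	 *
	 * If the page at @vmaddr is currently swapped out, free the swap entry
	 * (and swap cache), so the page gets discarded instead of being faulted
	 * back in later.  Hugetlb mappings are skipped.
	 *
	 * Context: the mmap lock of @mm must be held.
	 */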
> +
> +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
> +{
> + if (!non_swap_entry(entry))
> + dec_mm_counter(mm, MM_SWAPENTS);
> + else if (is_migration_entry(entry))
> + dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
> + free_swap_and_cache(entry);
> +}
> +
> +void __gmap_helper_zap_one(struct mm_struct *mm, unsigned long vmaddr)
__gmap_helper_zap_mapping_pte ?
> +{
> + struct vm_area_struct *vma;
> + spinlock_t *ptl;
> + pte_t *ptep;
> +
> + mmap_assert_locked(mm);
> +
> + /* Find the vm address for the guest address */
> + vma = vma_lookup(mm, vmaddr);
> + if (!vma || is_vm_hugetlb_page(vma))
> + return;
> +
> + /* Get pointer to the page table entry */
> + ptep = get_locked_pte(mm, vmaddr, &ptl);
> + if (!likely(ptep))
if (unlikely(!ptep)) reads nicer to me.
> + return;
> + if (pte_swap(*ptep))
> + ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
> + pte_unmap_unlock(ptep, ptl);
> +}
> +EXPORT_SYMBOL_GPL(__gmap_helper_zap_one);
Looks reasonable, but I'm not well versed enough in mm code to evaluate
that with confidence.
> +
> +void __gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
Maybe call this gmap_helper_discard_nolock or something.
> +{
> + struct vm_area_struct *vma;
> + unsigned long next;
> +
> + mmap_assert_locked(mm);
> +
> + while (vmaddr < end) {
> + vma = find_vma_intersection(mm, vmaddr, end);
> + if (!vma)
> + break;
> + vmaddr = max(vmaddr, vma->vm_start);
> + next = min(end, vma->vm_end);
> + if (!is_vm_hugetlb_page(vma))
> + zap_page_range_single(vma, vmaddr, next - vmaddr, NULL);
> + vmaddr = next;
> + }
> +}
> +
> +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
> +{
> + mmap_read_lock(mm);
> + __gmap_helper_discard(mm, vmaddr, end);
> + mmap_read_unlock(mm);
> +}
> +EXPORT_SYMBOL_GPL(gmap_helper_discard);
> +
> +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
> +{
> + struct vm_area_struct *vma;
> + pgd_t *pgd;
> + p4d_t *p4d;
> + pud_t *pud;
> +
> + /* We need a valid VMA, otherwise this is clearly a fault. */
> + vma = vma_lookup(mm, addr);
> + if (!vma)
> + return -EFAULT;
> +
> + pgd = pgd_offset(mm, addr);
> + if (!pgd_present(*pgd))
> + return -ENOENT;
What about pgd_bad?
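I.e. maybe something like this (untested, and I'm not sure whether -ENOENT or
-EFAULT fits a bad entry better):

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd) || pgd_bad(*pgd))
		return -ENOENT;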
> +
> + p4d = p4d_offset(pgd, addr);
> + if (!p4d_present(*p4d))
> + return -ENOENT;
> +
> + pud = pud_offset(p4d, addr);
> + if (!pud_present(*pud))
> + return -ENOENT;
> +
> + /* Large PUDs are not supported yet. */
> + if (pud_leaf(*pud))
> + return -EFAULT;
> +
> + *pmdp = pmd_offset(pud, addr);
> + return 0;
> +}
> +
> +void __gmap_helper_set_unused(struct mm_struct *mm, unsigned long vmaddr)
What is this function for? Why do you introduce it now?
> +{
> + spinlock_t *ptl;
> + pmd_t *pmdp;
> + pte_t *ptep;
> +
> + mmap_assert_locked(mm);
> +
> + if (pmd_lookup(mm, vmaddr, &pmdp))
> + return;
> + ptl = pmd_lock(mm, pmdp);
> + if (!pmd_present(*pmdp) || pmd_leaf(*pmdp)) {
> + spin_unlock(ptl);
> + return;
> + }
> + spin_unlock(ptl);
> +
> + ptep = pte_offset_map_lock(mm, pmdp, vmaddr, &ptl);
> + if (!ptep)
> + return;
> + /* The last byte of a pte can be changed freely without ipte */
> + __atomic64_or(_PAGE_UNUSED, (long *)ptep);
> + pte_unmap_unlock(ptep, ptl);
> +}
> +EXPORT_SYMBOL_GPL(__gmap_helper_set_unused);
The stuff below is from arch/s390/mm/gmap.c right?
Are you going to delete it from there?
> +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
> + unsigned long end, struct mm_walk *walk)
> +{
[...]
> +}
> +
> +static const struct mm_walk_ops find_zeropage_ops = {
> + .pte_entry = find_zeropage_pte_entry,
> + .walk_lock = PGWALK_WRLOCK,
> +};
> +
> +/*
> + * Unshare all shared zeropages, replacing them by anonymous pages. Note that
> + * we cannot simply zap all shared zeropages, because this could later
> + * trigger unexpected userfaultfd missing events.
> + *
> + * This must be called after mm->context.allow_cow_sharing was
> + * set to 0, to avoid future mappings of shared zeropages.
> + *
> + * mm contracts with s390, that even if mm were to remove a page table,
> + * and racing with walk_page_range_vma() calling pte_offset_map_lock()
> + * would fail, it will never insert a page table containing empty zero
> + * pages once mm_forbids_zeropage(mm) i.e.
> + * mm->context.allow_cow_sharing is set to 0.
> + */
> +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
> +{
[...]
> +}
> +
> +static int __gmap_helper_disable_cow_sharing(struct mm_struct *mm)
> +{
[...]
> +}
> +
> +/*
> + * Disable most COW-sharing of memory pages for the whole process:
> + * (1) Disable KSM and unmerge/unshare any KSM pages.
> + * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
> + *
> + * Note that we currently don't bother with COW-shared pages that are shared
> + * with parent/child processes due to fork().
> + */
> +int gmap_helper_disable_cow_sharing(void)
> +{
[...]
> +}
> +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
--
IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294