Message-ID: <CAAhSdy1bL75K5B_bgVA=EpT6cw_9TOk+=PMNHr+K=sAjTNmhXA@mail.gmail.com>
Date: Mon, 24 Nov 2025 09:51:02 +0530
From: Anup Patel <anup@...infault.org>
To: liu.xuemei1@....com.cn
Cc: atish.patra@...ux.dev, alex@...ti.fr, pjw@...nel.org, palmer@...belt.com,
aou@...s.berkeley.edu, kvm@...r.kernel.org, kvm-riscv@...ts.infradead.org,
linux-riscv@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3] RISC-V: KVM: Transparent huge page support
On Tue, Nov 11, 2025 at 10:19 AM <liu.xuemei1@....com.cn> wrote:
>
> From: Jessica Liu <liu.xuemei1@....com.cn>
>
> Use block mapping if backed by a THP, as implemented in architectures
> like ARM and x86_64.
>
> Signed-off-by: Jessica Liu <liu.xuemei1@....com.cn>
> ---
> Changes in v3:
> - Changed prototype of gstage_get_user_mapping_size to
> kvm_riscv_gstage_get_mapping_size.
> - Relocated the remaining functions from gstage.c in v2 to mmu.c and
> renamed them.
>
> arch/riscv/include/asm/kvm_gstage.h | 2 +
> arch/riscv/kvm/gstage.c | 15 +++++
> arch/riscv/kvm/mmu.c | 97 ++++++++++++++++++++++++++++-
> 3 files changed, 113 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..006bbdb90df8 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -69,4 +69,6 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>
> void kvm_riscv_gstage_mode_detect(void);
>
> +int kvm_riscv_gstage_get_mapping_size(struct kvm_gstage *gstage, gpa_t addr);
> +
> #endif
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index b67d60d722c2..a63089206869 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -357,3 +357,18 @@ void __init kvm_riscv_gstage_mode_detect(void)
> csr_write(CSR_HGATP, 0);
> kvm_riscv_local_hfence_gvma_all();
> }
> +
> +int kvm_riscv_gstage_get_mapping_size(struct kvm_gstage *gstage, gpa_t addr)
> +{
> + pte_t *ptepp;
> + u32 ptep_level;
> + unsigned long out_pgsize;
> +
> + if (!kvm_riscv_gstage_get_leaf(gstage, addr, &ptepp, &ptep_level))
> + return -EFAULT;
> +
> + if (gstage_level_to_page_size(ptep_level, &out_pgsize))
> + return -EFAULT;
> +
> + return out_pgsize;
> +}
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 525fb5a330c0..1457bc958505 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -323,6 +323,91 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> return pte_young(ptep_get(ptep));
> }
>
> +static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot, unsigned long hva)
> +{
> + gpa_t gpa_start;
> + hva_t uaddr_start, uaddr_end;
> + size_t size;
Please declare local variables in inverted pyramid fashion (longest
declaration first, shortest last) when possible.
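For example, purely to illustrate the ordering (a sketch, nothing more):

        hva_t uaddr_start, uaddr_end;
        gpa_t gpa_start;
        size_t size;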
> +
> + size = memslot->npages * PAGE_SIZE;
> + uaddr_start = memslot->userspace_addr;
> + uaddr_end = uaddr_start + size;
> +
> + gpa_start = memslot->base_gfn << PAGE_SHIFT;
> +
> + /*
> + * Pages belonging to memslots that don't have the same alignment
> + * within a PMD for userspace and GPA cannot be mapped with g-stage
> + * PMD entries, because we'll end up mapping the wrong pages.
> + *
> + * Consider a layout like the following:
> + *
> + * memslot->userspace_addr:
> + * +-----+--------------------+--------------------+---+
> + * |abcde|fgh vs-stage block | vs-stage block tv|xyz|
> + * +-----+--------------------+--------------------+---+
> + *
> + * memslot->base_gfn << PAGE_SHIFT:
> + * +---+--------------------+--------------------+-----+
> + * |abc|def g-stage block | g-stage block |tvxyz|
> + * +---+--------------------+--------------------+-----+
> + *
> + * If we create those g-stage blocks, we'll end up with this incorrect
> + * mapping:
> + * d -> f
> + * e -> g
> + * f -> h
> + */
> + if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1)))
> + return false;
> +
> + /*
> + * Next, let's make sure we're not trying to map anything not covered
> + * by the memslot. This means we have to prohibit block size mappings
> + * for the beginning and end of a non-block aligned and non-block sized
> + * memory slot (illustrated by the head and tail parts of the
> + * userspace view above containing pages 'abcde' and 'xyz',
> + * respectively).
> + *
> + * Note that it doesn't matter if we do the check using the
> + * userspace_addr or the base_gfn, as both are equally aligned (per
> + * the check above) and equally sized.
> + */
> + return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE));
> +}
> +
> +static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
> + unsigned long hva, kvm_pfn_t *hfnp, gpa_t *gpa)
> +{
> + kvm_pfn_t hfn = *hfnp;
> +
> + /*
> + * Make sure the adjustment is done only for THP pages. Also make
> + * sure that the HVA and GPA are sufficiently aligned and that the
> + * block map is contained within the memslot.
> + */
> + if (fault_supports_gstage_huge_mapping(memslot, hva)) {
> + struct kvm_gstage gstage;
Please declare sz here, together with gstage at the top of the block,
instead of mixing the declaration into the statement below.
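In other words, something like this (placement only; the arguments are a
separate problem, see below):

        struct kvm_gstage gstage;
        int sz;

        gstage.pgd = ...;
        sz = kvm_riscv_gstage_get_mapping_size(&gstage, ...);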
> +
> + gstage.pgd = kvm->mm->pgd;
> + int sz = kvm_riscv_gstage_get_mapping_size(&gstage, hva);
This is broken because you are passing hva as the gpa argument to
kvm_riscv_gstage_get_mapping_size(); that function expects a guest
physical address and resolves it through the g-stage page table layout,
so an hva is the wrong thing to feed it.
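One possible direction, as a rough and untested sketch only (the helper
name host_mapping_size() is made up here, not something in the tree),
would be to walk the host mm page tables for the hva directly and report
the host mapping size, along these lines:

        static long host_mapping_size(struct kvm *kvm, unsigned long hva)
        {
                pgd_t *pgd;
                p4d_t *p4d;
                pud_t *pud;
                pmd_t *pmd;

                /*
                 * NOTE: only a sketch. A real implementation would need
                 * lockless-walk care (READ_ONCE() of the entries) and must
                 * cope with concurrent THP splits; the fault path's
                 * mmu_invalidate_retry() check helps with the latter.
                 */
                pgd = pgd_offset(kvm->mm, hva);
                if (pgd_none(*pgd))
                        return -EFAULT;

                p4d = p4d_offset(pgd, hva);
                if (p4d_none(*p4d))
                        return -EFAULT;

                pud = pud_offset(p4d, hva);
                if (pud_none(*pud))
                        return -EFAULT;
                if (pud_leaf(*pud))
                        return PUD_SIZE;

                pmd = pmd_offset(pud, hva);
                if (pmd_none(*pmd))
                        return -EFAULT;
                if (pmd_leaf(*pmd))
                        return PMD_SIZE;

                return PAGE_SIZE;
        }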
> +
> + if (sz < 0)
> + return sz;
> +
> + if (sz < PMD_SIZE)
> + return PAGE_SIZE;
> +
> + *gpa &= PMD_MASK;
> + hfn &= ~(PTRS_PER_PMD - 1);
> + *hfnp = hfn;
> +
> + return PMD_SIZE;
> + }
> +
> + return PAGE_SIZE;
> +}
> +
> int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> gpa_t gpa, unsigned long hva, bool is_write,
> struct kvm_gstage_mapping *out_map)
> @@ -337,7 +422,8 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
> bool logging = (memslot->dirty_bitmap &&
> !(memslot->flags & KVM_MEM_READONLY)) ? true : false;
> - unsigned long vma_pagesize, mmu_seq;
> + unsigned long mmu_seq;
> + long vma_pagesize;
> struct kvm_gstage gstage;
> struct page *page;
>
> @@ -416,6 +502,15 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> if (mmu_invalidate_retry(kvm, mmu_seq))
> goto out_unlock;
>
> + /* check if we are backed by a THP and thus use block mapping if possible */
> + if (vma_pagesize == PAGE_SIZE) {
> + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa);
> + if (vma_pagesize < 0) {
> + ret = vma_pagesize;
> + goto out_unlock;
> + }
> + }
> +
> if (writable) {
> mark_page_dirty_in_slot(kvm, memslot, gfn);
> ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,
> --
> 2.27.0
Regards,
Anup