[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250612-70c2e573983d05c4fbc41102@orel>
Date: Thu, 12 Jun 2025 11:42:23 +0200
From: Andrew Jones <ajones@...tanamicro.com>
To: Sean Christopherson <seanjc@...gle.com>
Cc: zhouquan@...as.ac.cn, anup@...infault.org, atishp@...shpatra.org,
paul.walmsley@...ive.com, palmer@...belt.com, linux-kernel@...r.kernel.org,
linux-riscv@...ts.infradead.org, kvm@...r.kernel.org, kvm-riscv@...ts.infradead.org
Subject: Re: [PATCH] RISC-V: KVM: Avoid re-acquiring memslot in
kvm_riscv_gstage_map()
On Wed, Jun 11, 2025 at 09:17:36AM -0700, Sean Christopherson wrote:
> On Wed, Jun 11, 2025, Andrew Jones wrote:
> > On Wed, Jun 11, 2025 at 05:51:40PM +0800, zhouquan@...as.ac.cn wrote:
> > > From: Quan Zhou <zhouquan@...as.ac.cn>
> > >
> > > The caller has already passed in the memslot, and there are
> > > two instances `{kvm_faultin_pfn/mark_page_dirty}` of retrieving
> > > the memslot again in `kvm_riscv_gstage_map`, we can replace them
> > > with `{__kvm_faultin_pfn/mark_page_dirty_in_slot}`.
> > >
> > > Signed-off-by: Quan Zhou <zhouquan@...as.ac.cn>
> > > ---
> > > arch/riscv/kvm/mmu.c | 5 +++--
> > > 1 file changed, 3 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> > > index 1087ea74567b..f9059dac3ba3 100644
> > > --- a/arch/riscv/kvm/mmu.c
> > > +++ b/arch/riscv/kvm/mmu.c
> > > @@ -648,7 +648,8 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
> > > return -EFAULT;
> > > }
> > >
> > > - hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page);
> > > + hfn = __kvm_faultin_pfn(memslot, gfn, is_write ? FOLL_WRITE : 0,
> > > + &writable, &page);
> >
> > I think introducing another function with the following diff would be
> > better than duplicating the is_write to foll translation.
>
> NAK, I don't want an explosion of wrapper APIs (especially with boolean params).
>
> I 100% agree that it's mildly annoying to force arch code to do convert "write"
> to FOLL_WRITE, but that's a symptom of KVM not providing a common structure for
> passing page fault information.
>
> What I want to get to is a set of APIs that look something the below (very off
> the cuff), not add more wrappers and put KVM back in a situation where there are
> a bajillion ways to do the same basic thing.
>
> struct kvm_page_fault {
> const gpa_t addr;
> const bool exec;
> const bool write;
> const bool present;
>
> gfn_t gfn;
>
> /* The memslot containing gfn. May be NULL. */
> struct kvm_memory_slot *slot;
>
> /* Outputs */
> unsigned long mmu_seq;
> kvm_pfn_t pfn;
> struct page *refcounted_page;
> bool map_writable;
> };
>
> kvm_pfn_t __kvm_faultin_pfn(struct kvm_page_fault *fault, unsigned int flags)
> {
> struct kvm_follow_pfn kfp = {
> .slot = fault->slot,
> .gfn = fault->gfn,
> .flags = flags | fault->write ? FOLL_WRITE : 0,
> .map_writable = &fault->writable,
> .refcounted_page = &fault->refcounted_page,
> };
>
> fault->writable = false;
> fault->refcounted_page = NULL;
>
> return kvm_follow_pfn(&kfp);
> }
> EXPORT_SYMBOL_GPL(__kvm_faultin_pfn);
>
> kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool write,
> bool *writable, struct page **refcounted_page)
> {
> struct kvm_follow_pfn kfp = {
> .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),,
> .gfn = gfn,
> .flags = write ? FOLL_WRITE : 0,
> .map_writable = writable,
> .refcounted_page = refcounted_page,
> };
>
> if (WARN_ON_ONCE(!writable || !refcounted_page))
> return KVM_PFN_ERR_FAULT;
>
> *writable = false;
> *refcounted_page = NULL;
>
> return kvm_follow_pfn(&kfp);
> }
> EXPORT_SYMBOL_GPL(__kvm_faultin_pfn);
>
>
> To get things started, I proposed moving "struct kvm_page_fault" to common code
> so that it can be shared by x86 and arm64 as part of the KVM userfault series.
> But I'd be more than happy to acclerate the standardization of "struct kvm_page_fault"
> if we want to get there sooner than later.
>
> [*] https://lore.kernel.org/all/aBqlkz1bqhu-9toV@google.com
>
> In the meantime, RISC-V can start preparing for that future, and clean up its
> code in the process.
>
> E.g. "fault_addr" should be "gpa_t", not "unsigned long". If 32-bit RISC-V
> is strictly limited to 32-bit _physical_ addresses in the *architecture*, then
> gpa_t should probably be tweaked accordingly.
32-bit riscv supports 34-bit physical addresses, so fault_addr should
indeed be gpa_t.
>
> And vma_pageshift should be "unsigned int", not "short".
Yes, particularly because huge_page_shift() returns unsigned int which may
be used to assign vma_pageshift.
>
> Looks like y'all also have a bug where an -EEXIST will be returned to userspace,
> and will generate what's probably a spurious kvm_err() message.
On 32-bit riscv, due to losing the upper bits of the physical address? Or
is there yet another thing to fix?
>
> E.g. in the short term:
The diff looks good to me, should I test and post it for you?
Thanks,
drew
>
> ---
> arch/riscv/include/asm/kvm_host.h | 5 ++--
> arch/riscv/kvm/mmu.c | 49 +++++++++++++++++++++----------
> arch/riscv/kvm/vcpu_exit.c | 40 +------------------------
> 3 files changed, 36 insertions(+), 58 deletions(-)
>
> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> index 85cfebc32e4c..84c5db715ba5 100644
> --- a/arch/riscv/include/asm/kvm_host.h
> +++ b/arch/riscv/include/asm/kvm_host.h
> @@ -361,9 +361,8 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa,
> bool writable, bool in_atomic);
> void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa,
> unsigned long size);
> -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
> - struct kvm_memory_slot *memslot,
> - gpa_t gpa, unsigned long hva, bool is_write);
> +int kvm_riscv_gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
> + struct kvm_cpu_trap *trap);
> int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm);
> void kvm_riscv_gstage_free_pgd(struct kvm *kvm);
> void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu);
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 1087ea74567b..3b0afc1c0832 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -586,22 +586,37 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> return pte_young(ptep_get(ptep));
> }
>
> -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
> - struct kvm_memory_slot *memslot,
> - gpa_t gpa, unsigned long hva, bool is_write)
> +int kvm_riscv_gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
> + struct kvm_cpu_trap *trap)
> {
> - int ret;
> - kvm_pfn_t hfn;
> - bool writable;
> - short vma_pageshift;
> +
> + struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
> + gpa_t gpa = (trap->htval << 2) | (trap->stval & 0x3);
> gfn_t gfn = gpa >> PAGE_SHIFT;
> - struct vm_area_struct *vma;
> + struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
> + bool logging = memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
> + bool write = trap->scause == EXC_STORE_GUEST_PAGE_FAULT;
> + bool read = trap->scause == EXC_LOAD_GUEST_PAGE_FAULT;
> + unsigned int flags = write ? FOLL_WRITE : 0;
> + unsigned long hva, vma_pagesize, mmu_seq;
> struct kvm *kvm = vcpu->kvm;
> - struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
> - bool logging = (memslot->dirty_bitmap &&
> - !(memslot->flags & KVM_MEM_READONLY)) ? true : false;
> - unsigned long vma_pagesize, mmu_seq;
> + unsigned int vma_pageshift;
> + struct vm_area_struct *vma;
> struct page *page;
> + kvm_pfn_t hfn;
> + bool writable;
> + int ret;
> +
> + hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
> + if (kvm_is_error_hva(hva) || (write && !writable)) {
> + if (read)
> + return kvm_riscv_vcpu_mmio_load(vcpu, run, gpa,
> + trap->htinst);
> + if (write)
> + return kvm_riscv_vcpu_mmio_store(vcpu, run, gpa,
> + trap->htinst);
> + return -EOPNOTSUPP;
> + }
>
> /* We need minimum second+third level pages */
> ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels);
> @@ -648,7 +663,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
> return -EFAULT;
> }
>
> - hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page);
> + hfn = __kvm_faultin_pfn(memslot, gfn, flags, &writable, &page);
> if (hfn == KVM_PFN_ERR_HWPOISON) {
> send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva,
> vma_pageshift, current);
> @@ -661,7 +676,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
> * If logging is active then we allow writable pages only
> * for write faults.
> */
> - if (logging && !is_write)
> + if (logging && !write)
> writable = false;
>
> spin_lock(&kvm->mmu_lock);
> @@ -677,14 +692,16 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
> ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT,
> vma_pagesize, true, true);
> }
> + if (ret == -EEXIST)
> + ret = 0;
>
> if (ret)
> kvm_err("Failed to map in G-stage\n");
>
> out_unlock:
> - kvm_release_faultin_page(kvm, page, ret && ret != -EEXIST, writable);
> + kvm_release_faultin_page(kvm, page, ret, writable);
> spin_unlock(&kvm->mmu_lock);
> - return ret;
> + return ret ? ret : 1;
> }
>
> int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm)
> diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
> index 6e0c18412795..6f07077068f6 100644
> --- a/arch/riscv/kvm/vcpu_exit.c
> +++ b/arch/riscv/kvm/vcpu_exit.c
> @@ -10,44 +10,6 @@
> #include <asm/csr.h>
> #include <asm/insn-def.h>
>
> -static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run,
> - struct kvm_cpu_trap *trap)
> -{
> - struct kvm_memory_slot *memslot;
> - unsigned long hva, fault_addr;
> - bool writable;
> - gfn_t gfn;
> - int ret;
> -
> - fault_addr = (trap->htval << 2) | (trap->stval & 0x3);
> - gfn = fault_addr >> PAGE_SHIFT;
> - memslot = gfn_to_memslot(vcpu->kvm, gfn);
> - hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
> -
> - if (kvm_is_error_hva(hva) ||
> - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT && !writable)) {
> - switch (trap->scause) {
> - case EXC_LOAD_GUEST_PAGE_FAULT:
> - return kvm_riscv_vcpu_mmio_load(vcpu, run,
> - fault_addr,
> - trap->htinst);
> - case EXC_STORE_GUEST_PAGE_FAULT:
> - return kvm_riscv_vcpu_mmio_store(vcpu, run,
> - fault_addr,
> - trap->htinst);
> - default:
> - return -EOPNOTSUPP;
> - };
> - }
> -
> - ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva,
> - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false);
> - if (ret < 0)
> - return ret;
> -
> - return 1;
> -}
> -
> /**
> * kvm_riscv_vcpu_unpriv_read -- Read machine word from Guest memory
> *
> @@ -229,7 +191,7 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
> case EXC_LOAD_GUEST_PAGE_FAULT:
> case EXC_STORE_GUEST_PAGE_FAULT:
> if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
> - ret = gstage_page_fault(vcpu, run, trap);
> + ret = kvm_riscv_gstage_page_fault(vcpu, run, trap);
> break;
> case EXC_SUPERVISOR_SYSCALL:
> if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
>
> base-commit: 19272b37aa4f83ca52bdf9c16d5d81bdd1354494
> --
Powered by blists - more mailing lists