Message-ID: <aPiQYBoDlUmrQxEw@yzhao56-desk.sh.intel.com>
Date: Wed, 22 Oct 2025 16:05:52 +0800
From: Yan Zhao <yan.y.zhao@...el.com>
To: Sean Christopherson <seanjc@...gle.com>
CC: Marc Zyngier <maz@...nel.org>, Oliver Upton <oliver.upton@...ux.dev>,
Tianrui Zhao <zhaotianrui@...ngson.cn>, Bibo Mao <maobibo@...ngson.cn>,
Huacai Chen <chenhuacai@...nel.org>, Madhavan Srinivasan
<maddy@...ux.ibm.com>, Anup Patel <anup@...infault.org>, Paul Walmsley
<pjw@...nel.org>, Palmer Dabbelt <palmer@...belt.com>, Albert Ou
<aou@...s.berkeley.edu>, Christian Borntraeger <borntraeger@...ux.ibm.com>,
Janosch Frank <frankja@...ux.ibm.com>, Claudio Imbrenda
<imbrenda@...ux.ibm.com>, Paolo Bonzini <pbonzini@...hat.com>, "Kirill A.
Shutemov" <kas@...nel.org>, <linux-arm-kernel@...ts.infradead.org>,
<kvmarm@...ts.linux.dev>, <kvm@...r.kernel.org>, <loongarch@...ts.linux.dev>,
<linux-mips@...r.kernel.org>, <linuxppc-dev@...ts.ozlabs.org>,
<kvm-riscv@...ts.infradead.org>, <linux-riscv@...ts.infradead.org>,
<x86@...nel.org>, <linux-coco@...ts.linux.dev>,
<linux-kernel@...r.kernel.org>, Ira Weiny <ira.weiny@...el.com>, Kai Huang
<kai.huang@...el.com>, Michael Roth <michael.roth@....com>, Vishal Annapurve
<vannapurve@...gle.com>, Rick Edgecombe <rick.p.edgecombe@...el.com>,
Ackerley Tng <ackerleytng@...gle.com>, Binbin Wu <binbin.wu@...ux.intel.com>
Subject: Re: [PATCH v3 04/25] KVM: x86/mmu: Add dedicated API to map
guest_memfd pfn into TDP MMU
On Tue, Oct 21, 2025 at 09:36:52AM -0700, Sean Christopherson wrote:
> On Tue, Oct 21, 2025, Yan Zhao wrote:
> > On Thu, Oct 16, 2025 at 05:32:22PM -0700, Sean Christopherson wrote:
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 18d69d48bc55..ba5cca825a7f 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -5014,6 +5014,65 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> > > return min(range->size, end - range->gpa);
> > > }
> > >
> > > +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> > > +{
> > > + struct kvm_page_fault fault = {
> > > + .addr = gfn_to_gpa(gfn),
> > > + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS,
> > > + .prefetch = true,
> > > + .is_tdp = true,
> > > + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm),
> > > +
> > > + .max_level = PG_LEVEL_4K,
> > > + .req_level = PG_LEVEL_4K,
> > > + .goal_level = PG_LEVEL_4K,
> > > + .is_private = true,
> > > +
> > > + .gfn = gfn,
> > > + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
> > > + .pfn = pfn,
> > > + .map_writable = true,
> > > + };
> > > + struct kvm *kvm = vcpu->kvm;
> > > + int r;
> > > +
> > > + lockdep_assert_held(&kvm->slots_lock);
> > Do we need to assert that filemap_invalidate_lock() is held as well?
>
> Hrm, a lockdep assertion would be nice to have, but it's obviously not strictly
> necessary, and I'm not sure it's worth the cost. To safely assert, KVM would need
Not sure it's worth it. Maybe just add a comment?
But even with kvm_assert_gmem_invalidate_lock_held() and
lockdep_assert_held(&kvm->slots_lock), kvm_tdp_mmu_map_private_pfn() still
can't guarantee that the pfn is not stale, e.g. if those locks were
hypothetically released and re-acquired after the pfn was obtained.
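
For context, the way the normal fault path closes that race is the
mmu_invalidate_seq dance: snapshot the sequence count before resolving the
pfn, then re-check it under mmu_lock and retry if an invalidation ran in
between. Roughly (just a sketch of the existing generic pattern, not this
patch; helper names as in include/linux/kvm_host.h):

	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();	/* pairs with the barrier on the invalidation side */

	/* ... resolve the pfn, e.g. via kvm_gmem_get_pfn() ... */

	read_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
		/* An invalidation ran after the pfn lookup; drop and retry. */
		read_unlock(&kvm->mmu_lock);
		goto retry;
	}
	/* ... install the mapping ... */
	read_unlock(&kvm->mmu_lock);
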
> to first assert that the file refcount is elevated, e.g. to guard against
> guest_memfd _really_ screwing up and not grabbing a reference to the underlying
> file.
>
> E.g. it'd have to be something like this:
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 94d7f32a03b6..5d46b2ac0292 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -5014,6 +5014,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> return min(range->size, end - range->gpa);
> }
>
> +static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot)
> +{
> +#ifdef CONFIG_PROVE_LOCKING
> + if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) ||
> + WARN_ON_ONCE(!slot->gmem.file) ||
> + WARN_ON_ONCE(!file_count(slot->gmem.file)))
> + return;
> +
> + lockdep_assert_held(file_inode(&slot->gmem.file)->i_mapping->invalidate_lock));
lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock);
> +#endif
> +}
> +
> int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
> {
> struct kvm_page_fault fault = {
> @@ -5038,6 +5050,8 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
>
> lockdep_assert_held(&kvm->slots_lock);
>
> + kvm_assert_gmem_invalidate_lock_held(fault.slot);
> +
> if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
> return -EIO;
> --
>
> Which I suppose isn't that terrible?
Would it work to check is_page_fault_stale() instead? e.g.,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9e5045a60d8b..b2cf754f6f92 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -257,7 +257,8 @@ extern bool tdp_mmu_enabled;
#define tdp_mmu_enabled false
#endif
-int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn);
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
+ unsigned long mmu_seq);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 94d7f32a03b6..0dc9ff1bc63e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5014,7 +5014,8 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return min(range->size, end - range->gpa);
}
-int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn,
+ unsigned long mmu_seq)
{
struct kvm_page_fault fault = {
.addr = gfn_to_gpa(gfn),
@@ -5032,12 +5033,12 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
.slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
.pfn = pfn,
.map_writable = true,
+
+ .mmu_seq = mmu_seq,
};
struct kvm *kvm = vcpu->kvm;
int r;
- lockdep_assert_held(&kvm->slots_lock);
-
if (KVM_BUG_ON(!tdp_mmu_enabled, kvm))
return -EIO;
@@ -5063,6 +5064,9 @@ int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
guard(read_lock)(&kvm->mmu_lock);
+ if (is_page_fault_stale(vcpu, &fault))
+ return -EIO;
+
r = kvm_tdp_mmu_map(vcpu, &fault);
} while (r == RET_PF_RETRY);
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index cae694d3ff33..4bb3e68a12b3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3113,7 +3113,8 @@ struct tdx_gmem_post_populate_arg {
};
static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- void __user *src, int order, void *_arg)
+ unsigned long mmu_seq, void __user *src,
+ int order, void *_arg)
{
struct tdx_gmem_post_populate_arg *arg = _arg;
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -3136,7 +3137,7 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
return -ENOMEM;
kvm_tdx->page_add_src = src_page;
- ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
+ ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn, mmu_seq);
kvm_tdx->page_add_src = NULL;
put_page(src_page);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d93f75b05ae2..406472f60e63 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2581,7 +2581,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
* Returns the number of pages that were populated.
*/
typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
- void __user *src, int order, void *opaque);
+ unsigned long mmu_seq, void __user *src,
+ int order, void *opaque);
long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque);
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 427c0acee9d7..c9a87197412e 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -836,6 +836,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
pgoff_t index = kvm_gmem_get_index(slot, gfn);
bool is_prepared = false;
kvm_pfn_t pfn;
+ unsigned long mmu_seq = kvm->mmu_invalidate_seq;
if (signal_pending(current)) {
ret = -EINTR;
@@ -869,7 +870,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
}
p = src ? src + i * PAGE_SIZE : NULL;
- ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+ ret = post_populate(kvm, gfn, pfn, mmu_seq, p, max_order, opaque);
if (!ret)
kvm_gmem_mark_prepared(folio);
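
FWIW, with mmu_seq snapshotted in kvm_gmem_populate() before the pfn is
obtained, the check under mmu_lock boils down to roughly the following (my
simplified reading of the existing helper, not new code):

	/*
	 * Simplified: aside from the obsolete-root checks,
	 * is_page_fault_stale() reduces to this comparison.
	 */
	if (fault.mmu_seq != kvm->mmu_invalidate_seq)
		return -EIO;	/* pfn may have been invalidated; don't map it */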