Message-ID: <aCbtbemWD6RBOBln@yzhao56-desk.sh.intel.com>
Date: Fri, 16 May 2025 15:46:53 +0800
From: Yan Zhao <yan.y.zhao@...el.com>
To: "Edgecombe, Rick P" <rick.p.edgecombe@...el.com>
CC: "pbonzini@...hat.com" <pbonzini@...hat.com>, "seanjc@...gle.com"
<seanjc@...gle.com>, "Shutemov, Kirill" <kirill.shutemov@...el.com>,
"quic_eberman@...cinc.com" <quic_eberman@...cinc.com>, "Li, Xiaoyao"
<xiaoyao.li@...el.com>, "kvm@...r.kernel.org" <kvm@...r.kernel.org>, "Hansen,
Dave" <dave.hansen@...el.com>, "david@...hat.com" <david@...hat.com>,
"thomas.lendacky@....com" <thomas.lendacky@....com>, "tabba@...gle.com"
<tabba@...gle.com>, "Li, Zhiquan1" <zhiquan1.li@...el.com>, "Du, Fan"
<fan.du@...el.com>, "linux-kernel@...r.kernel.org"
<linux-kernel@...r.kernel.org>, "michael.roth@....com"
<michael.roth@....com>, "Weiny, Ira" <ira.weiny@...el.com>, "vbabka@...e.cz"
<vbabka@...e.cz>, "binbin.wu@...ux.intel.com" <binbin.wu@...ux.intel.com>,
"ackerleytng@...gle.com" <ackerleytng@...gle.com>, "Yamahata, Isaku"
<isaku.yamahata@...el.com>, "Peng, Chao P" <chao.p.peng@...el.com>,
"Annapurve, Vishal" <vannapurve@...gle.com>, "jroedel@...e.de"
<jroedel@...e.de>, "Miao, Jun" <jun.miao@...el.com>, "pgonda@...gle.com"
<pgonda@...gle.com>, "x86@...nel.org" <x86@...nel.org>
Subject: Re: [RFC PATCH 16/21] KVM: x86/mmu: Introduce
kvm_split_boundary_leafs() to split boundary leafs
On Wed, May 14, 2025 at 06:56:26AM +0800, Edgecombe, Rick P wrote:
> On Thu, 2025-04-24 at 11:08 +0800, Yan Zhao wrote:
> > Introduce kvm_split_boundary_leafs() to manage the splitting of boundary
> > leafs within the mirror root.
> >
> > Before zapping a specific GFN range in the mirror root, split any huge leaf
> > that intersects with the boundary of the GFN range to ensure that the
> > subsequent zap operation does not impact any GFN outside the specified
> > range. This is crucial for the mirror root as the private page table
> > requires the guest's ACCEPT operation after faulting back a GFN.
> >
> > This function should be called while kvm->mmu_lock is held for writing. The
> > kvm->mmu_lock is temporarily released to allocate memory for sp for split.
> > The only expected error is -ENOMEM.
> >
> > Opportunistically, WARN in tdp_mmu_zap_leafs() if zapping a huge leaf in
> > the mirror root affects a GFN outside the specified range.
> >
> > Signed-off-by: Xiaoyao Li <xiaoyao.li@...el.com>
> > Signed-off-by: Yan Zhao <yan.y.zhao@...el.com>
> > ---
> > arch/x86/kvm/mmu/mmu.c | 21 +++++++
> > arch/x86/kvm/mmu/tdp_mmu.c | 116 ++++++++++++++++++++++++++++++++++++-
> > arch/x86/kvm/mmu/tdp_mmu.h | 1 +
> > include/linux/kvm_host.h | 1 +
> > 4 files changed, 136 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 0e227199d73e..0d49c69b6b55 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -1640,6 +1640,27 @@ static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
> > start, end - 1, can_yield, true, flush);
> > }
> >
> > +/*
> > + * Split large leafs at the boundary of the specified range for the mirror root
> > + *
> > + * Return value:
> > + * 0 : success, no flush is required;
> > + * 1 : success, flush is required;
> > + * <0: failure.
> > + */
> > +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
> > +{
> > + bool ret = 0;
> > +
> > + lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
> > + lockdep_is_held(&kvm->slots_lock));
> > +
> > + if (tdp_mmu_enabled)
> > + ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);
> > +
> > + return ret;
> > +}
> > +
> > bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> > {
> > bool flush = false;
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 0f683753a7bb..d3fba5d11ea2 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -324,6 +324,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
> > u64 old_spte, u64 new_spte, int level,
> > bool shared);
> >
> > +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> > + struct kvm_mmu_page *sp, bool shared);
> > static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(bool mirror);
> > static void *get_external_spt(gfn_t gfn, u64 new_spte, int level);
> >
> > @@ -962,6 +964,19 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> > return true;
> > }
> >
> > +static inline bool iter_split_required(struct kvm *kvm, struct kvm_mmu_page *root,
> > + struct tdp_iter *iter, gfn_t start, gfn_t end)
> > +{
> > + if (!is_mirror_sp(root) || !is_large_pte(iter->old_spte))
> > + return false;
> > +
> > + /* Fully contained, no need to split */
> > + if (iter->gfn >= start && iter->gfn + KVM_PAGES_PER_HPAGE(iter->level) <= end)
> > + return false;
> > +
> > + return true;
> > +}
> > +
> > /*
> > * If can_yield is true, will release the MMU lock and reschedule if the
> > * scheduler needs the CPU or there is contention on the MMU lock. If this
> > @@ -991,6 +1006,8 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> > !is_last_spte(iter.old_spte, iter.level))
> > continue;
> >
> > + WARN_ON_ONCE(iter_split_required(kvm, root, &iter, start, end));
> > +
>
> Kind of unrelated change? But good idea. Maybe for another patch.
Yes, will move it to a separate patch in the formal version.
As an initial RFC, I hoped to show the related changes in one patch to give the
whole picture.
> > tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
> >
> > /*
> > @@ -1246,9 +1263,6 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
> > return 0;
> > }
> >
> > -static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> > - struct kvm_mmu_page *sp, bool shared);
> > -
> > /*
> > * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
> > * page tables and SPTEs to translate the faulting guest physical address.
> > @@ -1341,6 +1355,102 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> > return ret;
> > }
> >
> > +/*
> > + * Split large leafs at the boundary of the specified range for the mirror root
> > + */
> > +static int tdp_mmu_split_boundary_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
> > + gfn_t start, gfn_t end, bool can_yield, bool *flush)
> > +{
> > + struct kvm_mmu_page *sp = NULL;
> > + struct tdp_iter iter;
> > +
> > + WARN_ON_ONCE(!can_yield);
>
> Why pass this in then?
Right, I can move the warning up to the caller.
Currently the only callers of kvm_split_boundary_leafs() are
kvm_arch_pre_set_memory_attributes() and kvm_gmem_punch_hole(), so can_yield is
always true.
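Something roughly like below (untested, just to show the idea of checking
range->may_block at the caller and dropping can_yield):

int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range)
{
	int ret = 0;

	lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
			    lockdep_is_held(&kvm->slots_lock));

	/* Both existing callers pass may_block == true */
	WARN_ON_ONCE(!range->may_block);

	if (tdp_mmu_enabled)
		ret = kvm_tdp_mmu_gfn_range_split_boundary(kvm, range);

	return ret;
}

tdp_mmu_split_boundary_leafs() would then lose the can_yield parameter and
call tdp_mmu_iter_cond_resched() unconditionally.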
> > +
> > + if (!is_mirror_sp(root))
> > + return 0;
>
> What is special about mirror roots here?
Hmm, I thought only the mirror root needs the splitting before zapping, since
faulting a zapped S-EPT mapping back requires the guest's acceptance. Other
roots can tolerate zapping a larger range than required.
Maybe the AMD guys can shout out if I'm wrong.
> > + end = min(end, tdp_mmu_max_gfn_exclusive());
> > +
> > + lockdep_assert_held_write(&kvm->mmu_lock);
> > +
> > + rcu_read_lock();
> > +
> > + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
> > +retry:
> > + if (can_yield &&
>
> Do we need this part of the conditional based on the above?
No need if we don't pass in can_yield.
> > + tdp_mmu_iter_cond_resched(kvm, &iter, *flush, false)) {
> > + *flush = false;
> > + continue;
> > + }
> > +
> > + if (!is_shadow_present_pte(iter.old_spte) ||
> > + !is_last_spte(iter.old_spte, iter.level) ||
> > + !iter_split_required(kvm, root, &iter, start, end))
> > + continue;
> > +
> > + if (!sp) {
> > + rcu_read_unlock();
> > +
> > + write_unlock(&kvm->mmu_lock);
> > +
> > + sp = tdp_mmu_alloc_sp_for_split(true);
> > +
> > + write_lock(&kvm->mmu_lock);
> > +
> > + if (!sp) {
> > + trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte,
> > + iter.level, -ENOMEM);
> > + return -ENOMEM;
> > + }
> > + rcu_read_lock();
> > +
> > + iter.yielded = true;
> > + continue;
> > + }
> > + tdp_mmu_init_child_sp(sp, &iter);
> > +
> > + if (tdp_mmu_split_huge_page(kvm, &iter, sp, false))
>
> I think it can't fail when you hold mmu write lock.
You are right!
Thanks for catching it.
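Will drop the error handling and the now-unused retry label in the next
version, roughly like (untested):

		tdp_mmu_init_child_sp(sp, &iter);

		/*
		 * Cannot fail: mmu_lock is held for write (shared == false),
		 * so the SPTE cannot change underneath us.
		 */
		tdp_mmu_split_huge_page(kvm, &iter, sp, false);

		sp = NULL;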
> > + goto retry;
> > +
> > + sp = NULL;
> > + /*
> > + * Set yielded in case after splitting to a lower level,
> > + * the new iter requires further splitting.
> > + */
> > + iter.yielded = true;
> > + *flush = true;
> > + }
> > +
> > + rcu_read_unlock();
> > +
> > + /* Leave it here though it should be impossible for the mirror root */
> > + if (sp)
> > + tdp_mmu_free_sp(sp);
>
> What do you think about relying on tdp_mmu_split_huge_pages_root() and moving
> this to an optimization patch at the end?
>
> Or what about just two calls to tdp_mmu_split_huge_pages_root() at the
> boundaries?
Though the two generally look the same, relying on
tdp_mmu_split_huge_pages_root() would require several minor changes scattered
throughout that function, e.g. updating flush after tdp_mmu_iter_cond_resched(),
checking iter_split_required(), and setting "iter.yielded = true".
So it may be hard to review as an initial RFC.
I prefer to do that after Paolo and Sean have taken a look at it :)
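For reference, the kind of scattered tweaks I mean inside
tdp_mmu_split_huge_pages_root()'s loop would look roughly like this (a
hand-waved sketch, not based on the actual code; "only_boundary" is a made-up
flag just for illustration):

	if (tdp_mmu_iter_cond_resched(kvm, &iter, *flush, shared)) {
		*flush = false;		/* new: report whether a flush is pending */
		continue;
	}
	...
	/* new: skip leafs that are fully contained in the range */
	if (only_boundary && !iter_split_required(kvm, root, &iter, start, end))
		continue;
	...
	/* new: revisit the same GFN in case it needs further splitting */
	iter.yielded = true;
	*flush = true;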
> > + return 0;
> > +}
> > +
> > +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range)
> > +{
> > + enum kvm_tdp_mmu_root_types types;
> > + struct kvm_mmu_page *root;
> > + bool flush = false;
> > + int ret;
> > +
> > + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
>
> What is the reason for KVM_INVALID_ROOTS in this case?
I wanted to keep it consistent with kvm_tdp_mmu_unmap_gfn_range().
Yes, we can remove KVM_INVALID_ROOTS.
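i.e. simply

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);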
> > +
> > + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
>
> It would be better to check for mirror roots here, instead of inside
> tdp_mmu_split_boundary_leafs().
Ok.
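e.g. (untested):

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) {
		if (!is_mirror_sp(root))
			continue;
		...
	}

and drop the is_mirror_sp() check inside tdp_mmu_split_boundary_leafs().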
>
> > + ret = tdp_mmu_split_boundary_leafs(kvm, root, range->start, range->end,
> > + range->may_block, &flush);
> > + if (ret < 0) {
> > + if (flush)
> > + kvm_flush_remote_tlbs(kvm);
> > +
> > + return ret;
> > + }
> > + }
> > + return flush;
> > +}
> > +
> > /* Used by mmu notifier via kvm_unmap_gfn_range() */
> > bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
> > bool flush)
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
> > index 52acf99d40a0..806a21d4f0e3 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.h
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.h
> > @@ -69,6 +69,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
> > void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
> > enum kvm_tdp_mmu_root_types root_types);
> > void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared);
> > +int kvm_tdp_mmu_gfn_range_split_boundary(struct kvm *kvm, struct kvm_gfn_range *range);
> >
> > int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 655d36e1f4db..19d7a577e7ed 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -272,6 +272,7 @@ struct kvm_gfn_range {
> > bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> > bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
> > +int kvm_split_boundary_leafs(struct kvm *kvm, struct kvm_gfn_range *range);
> > #endif
> >
> > enum {
>