[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CALzav=c3+RXA1Sw2koaNY8LDA8B3KXdk1ZPGSkR=JzX54irYGw@mail.gmail.com>
Date: Mon, 28 Mar 2022 11:21:45 -0700
From: David Matlack <dmatlack@...gle.com>
To: Ben Gardon <bgardon@...gle.com>
Cc: LKML <linux-kernel@...r.kernel.org>,
kvm list <kvm@...r.kernel.org>,
Paolo Bonzini <pbonzini@...hat.com>,
Peter Xu <peterx@...hat.com>,
Sean Christopherson <seanjc@...gle.com>,
Jim Mattson <jmattson@...gle.com>,
David Dunn <daviddunn@...gle.com>,
Jing Zhang <jingzhangos@...gle.com>,
Junaid Shahid <junaids@...gle.com>
Subject: Re: [PATCH v2 9/9] KVM: x86/mmu: Promote pages in-place when
disabling dirty logging
On Mon, Mar 21, 2022 at 3:44 PM Ben Gardon <bgardon@...gle.com> wrote:
>
> When disabling dirty logging, the TDP MMU currently zaps each leaf entry
> mapping memory in the relevant memslot. This is very slow. Doing the zaps
> under the mmu read lock requires a TLB flush for every zap and the
> zapping causes a storm of ETP/NPT violations.
>
> Instead of zapping, replace the split large pages with large page
> mappings directly. While this sort of operation has historically only
> been done in the vCPU page fault handler context, refactorings earlier
> in this series and the relative simplicity of the TDP MMU make it
> possible here as well.
>
> Running the dirty_log_perf_test on an Intel Skylake with 96 vCPUs and 1G
> of memory per vCPU, this reduces the time required to disable dirty
> logging from over 45 seconds to just over 1 second. It also avoids
> provoking page faults, improving vCPU performance while disabling
> dirty logging.
>
> Signed-off-by: Ben Gardon <bgardon@...gle.com>
> ---
> arch/x86/kvm/mmu/mmu.c | 4 +-
> arch/x86/kvm/mmu/mmu_internal.h | 6 +++
> arch/x86/kvm/mmu/tdp_mmu.c | 73 ++++++++++++++++++++++++++++++++-
> 3 files changed, 79 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6f98111f8f8b..a99c23ef90b6 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -100,7 +100,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
> */
> bool tdp_enabled = false;
>
> -static int max_huge_page_level __read_mostly;
> +int max_huge_page_level;
> static int tdp_root_level __read_mostly;
> static int max_tdp_level __read_mostly;
>
> @@ -4486,7 +4486,7 @@ static inline bool boot_cpu_is_amd(void)
> * the direct page table on host, use as much mmu features as
> * possible, however, kvm currently does not do execution-protection.
> */
> -static void
> +void
> build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> int shadow_root_level)
> {
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1bff453f7cbe..6c08a5731fcb 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -171,4 +171,10 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
> void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
>
> +void
> +build_tdp_shadow_zero_bits_mask(struct rsvd_bits_validate *shadow_zero_check,
> + int shadow_root_level);
> +
> +extern int max_huge_page_level __read_mostly;
> +
> #endif /* __KVM_X86_MMU_INTERNAL_H */
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index af60922906ef..eb8929e394ec 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1709,6 +1709,66 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
> clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
> }
>
> +static bool try_promote_lpage(struct kvm *kvm,
> + const struct kvm_memory_slot *slot,
> + struct tdp_iter *iter)
> +{
> + struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
> + struct rsvd_bits_validate shadow_zero_check;
> + bool map_writable;
> + kvm_pfn_t pfn;
> + u64 new_spte;
> + u64 mt_mask;
> +
> + /*
> + * If addresses are being invalidated, don't do in-place promotion to
> + * avoid accidentally mapping an invalidated address.
> + */
> + if (unlikely(kvm->mmu_notifier_count))
> + return false;
> +
> + if (iter->level > max_huge_page_level || iter->gfn < slot->base_gfn ||
> + iter->gfn >= slot->base_gfn + slot->npages)
> + return false;
> +
> + pfn = __gfn_to_pfn_memslot(slot, iter->gfn, true, NULL, true,
> + &map_writable, NULL);
> + if (is_error_noslot_pfn(pfn))
> + return false;
> +
> + /*
> + * Can't reconstitute an lpage if the consituent pages can't be
> + * mapped higher.
> + */
> + if (iter->level > kvm_mmu_max_mapping_level(kvm, slot, iter->gfn,
> + pfn, PG_LEVEL_NUM))
> + return false;
> +
> + build_tdp_shadow_zero_bits_mask(&shadow_zero_check, iter->root_level);
> +
> + /*
> + * In some cases, a vCPU pointer is required to get the MT mask,
> + * however in most cases it can be generated without one. If a
> + * vCPU pointer is needed kvm_x86_try_get_mt_mask will fail.
> + * In that case, bail on in-place promotion.
> + */
> + if (unlikely(!static_call(kvm_x86_try_get_mt_mask)(kvm, iter->gfn,
> + kvm_is_mmio_pfn(pfn),
> + &mt_mask)))
> + return false;
> +
> + __make_spte(kvm, sp, slot, ACC_ALL, iter->gfn, pfn, 0, false, true,
> + map_writable, mt_mask, &shadow_zero_check, &new_spte);
> +
> + if (tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
> + return true;
Ah shoot, tdp_mmu_set_spte_atomic() now returns 0/-EBUSY, so this
conditional needs to be flipped.
> +
> + /* Re-read the SPTE as it must have been changed by another thread. */
> + iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
tdp_mmu_set_spte_atomic() does this for you now.
> +
> + return false;
> +}
> +
> /*
> * Clear leaf entries which could be replaced by large mappings, for
> * GFNs within the slot.
> @@ -1729,8 +1789,17 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
> if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
> continue;
>
> - if (!is_shadow_present_pte(iter.old_spte) ||
> - !is_last_spte(iter.old_spte, iter.level))
> + if (iter.level > max_huge_page_level ||
> + iter.gfn < slot->base_gfn ||
> + iter.gfn >= slot->base_gfn + slot->npages)
> + continue;
> +
> + if (!is_shadow_present_pte(iter.old_spte))
> + continue;
> +
> + /* Try to promote the constitutent pages to an lpage. */
> + if (!is_last_spte(iter.old_spte, iter.level) &&
> + try_promote_lpage(kvm, slot, &iter))
> continue;
>
> pfn = spte_to_pfn(iter.old_spte);
> --
> 2.35.1.894.gb6a874cedc-goog
>
Powered by blists - more mailing lists