Message-ID: <ZyJCjJx2lxnEnDwa@google.com>
Date: Wed, 30 Oct 2024 07:28:28 -0700
From: Sean Christopherson <seanjc@...gle.com>
To: Vipin Sharma <vipinsh@...gle.com>
Cc: pbonzini@...hat.com, dmatlack@...gle.com, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 2/2] KVM: x86/mmu: Recover TDP MMU NX huge pages using
MMU read lock
On Fri, Sep 06, 2024, Vipin Sharma wrote:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 455caaaa04f5..fc597f66aa11 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7317,8 +7317,8 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
> 	return err;
> }
>
> -void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
> -			       unsigned long nr_pages)
> +void kvm_recover_nx_huge_pages(struct kvm *kvm, bool shared,
> +			       struct list_head *pages, unsigned long nr_pages)
> {
> 	struct kvm_memory_slot *slot;
> 	int rcu_idx;
> @@ -7329,7 +7329,10 @@ void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
> 	ulong to_zap;
>
> 	rcu_idx = srcu_read_lock(&kvm->srcu);
> -	write_lock(&kvm->mmu_lock);
> +	if (shared)
Hmm, what if we do this?
enum kvm_mmu_types {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,
#endif
	KVM_NR_MMU_TYPES,
};

#ifndef CONFIG_X86_64
#define KVM_TDP_MMU -1
#endif
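The -1 fallback keeps the KVM_TDP_MMU checks compilable on !CONFIG_X86_64,
where the compiler can then dead-code-eliminate the TDP MMU paths entirely.
Purely as an untested sketch (the possible_nx array and the reworked
kvm_recover_nx_huge_pages() signature below are made up for illustration),
callers could then iterate over the per-type state:

	int i;

	for (i = 0; i < KVM_NR_MMU_TYPES; i++)
		kvm_recover_nx_huge_pages(kvm, i, &kvm->arch.possible_nx[i]);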
And then this becomes:
	if (mmu_type == KVM_TDP_MMU)
> +		read_lock(&kvm->mmu_lock);
> +	else
> +		write_lock(&kvm->mmu_lock);
>
> 	/*
> 	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
> @@ -7341,8 +7344,13 @@ void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
> 	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
> 	to_zap = ratio ? DIV_ROUND_UP(nr_pages, ratio) : 0;
> 	for ( ; to_zap; --to_zap) {
> -		if (list_empty(pages))
> +		if (tdp_mmu_enabled)
Shouldn't this be?
	if (shared)

Or if we do the above

	if (mmu_type == KVM_TDP_MMU)
Actually, better idea (sans comments)
	if (mmu_type == KVM_TDP_MMU) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_pages_lock(kvm);
	} else {
		write_lock(&kvm->mmu_lock);
	}

	rcu_read_lock();

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(possible_nx->nr_pages, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(possible_nx->pages))
			break;

		...

		/* Blah blah blah. */
		if (mmu_type == KVM_TDP_MMU)
			kvm_tdp_mmu_pages_unlock(kvm);

		...

		/* Blah blah blah. */
		if (mmu_type == KVM_TDP_MMU)
			kvm_tdp_mmu_pages_lock(kvm);
	}

	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);

	rcu_read_unlock();

	if (mmu_type == KVM_TDP_MMU) {
		kvm_tdp_mmu_pages_unlock(kvm);
		read_unlock(&kvm->mmu_lock);
	} else {
		write_unlock(&kvm->mmu_lock);
	}

	srcu_read_unlock(&kvm->srcu, rcu_idx);
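where kvm_tdp_mmu_pages_lock() and kvm_tdp_mmu_pages_unlock() would presumably
just be thin wrappers around the existing tdp_mmu_pages_lock spinlock, so that
mmu.c doesn't poke at TDP MMU internals directly, e.g. (untested):

/* Wrappers so common MMU code can take the TDP MMU's pages lock. */
void kvm_tdp_mmu_pages_lock(struct kvm *kvm)
{
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
}

void kvm_tdp_mmu_pages_unlock(struct kvm *kvm)
{
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}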
> @@ -825,23 +835,51 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
> 	rcu_read_unlock();
> }
>
> -bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> +bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
This rename, and any refactoring that is associated with said rename, e.g. comments,
belongs in a separate patch.
> +					   struct kvm_mmu_page *sp)
> {
> -	u64 old_spte;
> +	struct tdp_iter iter = {
> +		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
> +		.sptep = sp->ptep,
> +		.level = sp->role.level + 1,
> +		.gfn = sp->gfn,
> +		.as_id = kvm_mmu_page_as_id(sp),
> +	};
> +
> +	lockdep_assert_held_read(&kvm->mmu_lock);
Newline here, to isolate the lockdep assertion from the functional code.
> +	if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
> +		return false;
>
> 	/*
> -	 * This helper intentionally doesn't allow zapping a root shadow page,
> -	 * which doesn't have a parent page table and thus no associated entry.
> +	 * Root shadow pages don't a parent page table and thus no associated
Missed a word or three.
> +	 * entry, but they can never be possible NX huge pages.
> 	 */
> 	if (WARN_ON_ONCE(!sp->ptep))
> 		return false;
>
> -	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
> -	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
> +	/*
> +	 * Since mmu_lock is held in read mode, it's possible another task has
> +	 * already modified the SPTE.  Zap the SPTE if and only if the SPTE
> +	 * points at the SP's page table, as checking shadow-present isn't
> +	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
> +	 * another SP.  Note, spte_to_child_pt() also checks that the SPTE is
> +	 * shadow-present, i.e. guards against zapping a frozen SPTE.
> +	 */
> +	if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
> 		return false;
>
> -	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> -			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
> +	/*
> +	 * If a different task modified the SPTE, then it should be impossible
> +	 * for the SPTE to still be used for the to-be-zapped SP.  Non-leaf
> +	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
> +	 * creating non-leaf SPTEs, and all other bits are immutable for non-
> +	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
> +	 * zapping and replacement.
> +	 */
> +	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
> +		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
> +		return false;
> +	}
>
> 	return true;
> }
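For reference, the existing recovery loop already picks the zap path based on
the SP type, so the call site would presumably end up looking something like
this (sketch only, not from this patch):

		if (mmu_type == KVM_TDP_MMU)
			flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
		else
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);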