[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f862cefff2ed3f4211b69d785670f41667703cf3.camel@infradead.org>
Date: Mon, 05 Aug 2024 12:08:56 +0100
From: David Woodhouse <dwmw2@...radead.org>
To: Sean Christopherson <seanjc@...gle.com>, Paolo Bonzini
<pbonzini@...hat.com>, "Hussain, Mushahid" <hmushi@...zon.co.uk>
Cc: Vitaly Kuznetsov <vkuznets@...hat.com>, Wanpeng Li
<wanpengli@...cent.com>, Jim Mattson <jmattson@...gle.com>, Joerg Roedel
<joro@...tes.org>, kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
Mingwei Zhang <mizhang@...gle.com>, Maxim Levitsky <mlevitsk@...hat.com>
Subject: [PATCH] KVM: Move gfn_to_pfn_cache invalidation to
invalidate_range_end hook
From: David Woodhouse <dwmw@...zon.co.uk>
The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there is an invalidation running concurrently, it is effectively just
a complex busy wait loop because its local mmu_notifier_retry_cache()
function will always return true.
It ends up functioning as a very unfair read/write lock. If userspace is
acting as a 'writer', performing many unrelated MM changes, then the
hva_to_pfn_retry() function acting as the 'reader' just backs off and
keeps retrying forever, not making any progress.
Solve this by introducing a separate 'validating' flag to the GPC, so
that it can be marked invalid before it's even mapped. This allows the
invalidation to be moved to the range_end hook, and the retry loop in
hva_to_pfn_retry() can be changed to loop only if its particular uHVA
has been affected.
Signed-off-by: David Woodhouse <dwmw@...zon.co.uk>
---
I note I'm deleting a big comment in kvm_main.c about doing the
invalidation before acquiring mmu_lock. But we don't hold the lock
in the range_end callback either, do we?
include/linux/kvm_types.h | 1 +
virt/kvm/kvm_main.c | 14 ++------
virt/kvm/kvm_mm.h | 12 +++----
virt/kvm/pfncache.c | 75 +++++++++++++++++++--------------------
4 files changed, 45 insertions(+), 57 deletions(-)
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10..30ed1019cfc6 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -69,6 +69,7 @@ struct gfn_to_pfn_cache {
void *khva;
kvm_pfn_t pfn;
bool active;
+ bool validating;
bool valid;
};
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d0788d0a72cc..ffd6ab4c2a16 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -777,18 +777,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mn_active_invalidate_count++;
spin_unlock(&kvm->mn_invalidate_lock);
- /*
- * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
- * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
- * each cache's lock. There are relatively few caches in existence at
- * any given time, and the caches themselves can check for hva overlap,
- * i.e. don't need to rely on memslot overlap checks for performance.
- * Because this runs without holding mmu_lock, the pfn caches must use
- * mn_active_invalidate_count (see above) instead of
- * mmu_invalidate_in_progress.
- */
- gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
-
/*
* If one or more memslots were found and thus zapped, notify arch code
* that guest memory has been reclaimed. This needs to be done *after*
@@ -849,6 +837,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
wake = !kvm->mn_active_invalidate_count;
spin_unlock(&kvm->mn_invalidate_lock);
+ gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
/*
* There can only be one waiter, since the wait happens under
* slots_lock.
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..34e4e67f09f8 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -24,13 +24,13 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
bool *async, bool write_fault, bool *writable);
#ifdef CONFIG_HAVE_KVM_PFNCACHE
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
- unsigned long start,
- unsigned long end);
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end);
#else
-static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
- unsigned long start,
- unsigned long end)
+static inline void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end)
{
}
#endif /* HAVE_KVM_PFNCACHE */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f0039efb9e1e..187b58150ef7 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -20,10 +20,14 @@
#include "kvm_mm.h"
/*
- * MMU notifier 'invalidate_range_start' hook.
+ * MMU notifier 'invalidate_range_end' hook. The hva_to_pfn_retry() function
+ * below may look up a PFN just before it is zapped, and may be mapping it
+ * concurrently (with the GPC lock dropped). By using a separate 'validating'
+ * flag, the invalidation can occur concurrently, causing hva_to_pfn_retry()
+ * to drop its result and retry correctly.
*/
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
- unsigned long end)
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+ unsigned long end)
{
struct gfn_to_pfn_cache *gpc;
@@ -32,7 +36,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
read_lock_irq(&gpc->lock);
/* Only a single page so no need to care about length */
- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+ if (gpc->validating && !is_error_noslot_pfn(gpc->pfn) &&
gpc->uhva >= start && gpc->uhva < end) {
read_unlock_irq(&gpc->lock);
@@ -45,9 +49,11 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
*/
write_lock_irq(&gpc->lock);
- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
- gpc->uhva >= start && gpc->uhva < end)
+ if (gpc->validating && !is_error_noslot_pfn(gpc->pfn) &&
+ gpc->uhva >= start && gpc->uhva < end) {
+ gpc->validating = false;
gpc->valid = false;
+ }
write_unlock_irq(&gpc->lock);
continue;
}
@@ -93,6 +99,9 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
if (!gpc->valid)
return false;
+ /* It can never be valid unless it was once validating! */
+ WARN_ON_ONCE(!gpc->validating);
+
return true;
}
@@ -124,41 +133,12 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
#endif
}
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
-{
- /*
- * mn_active_invalidate_count acts for all intents and purposes
- * like mmu_invalidate_in_progress here; but the latter cannot
- * be used here because the invalidation of caches in the
- * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
- * is elevated.
- *
- * Note, it does not matter that mn_active_invalidate_count
- * is not protected by gpc->lock. It is guaranteed to
- * be elevated before the mmu_notifier acquires gpc->lock, and
- * isn't dropped until after mmu_invalidate_seq is updated.
- */
- if (kvm->mn_active_invalidate_count)
- return true;
-
- /*
- * Ensure mn_active_invalidate_count is read before
- * mmu_invalidate_seq. This pairs with the smp_wmb() in
- * mmu_notifier_invalidate_range_end() to guarantee either the
- * old (non-zero) value of mn_active_invalidate_count or the
- * new (incremented) value of mmu_invalidate_seq is observed.
- */
- smp_rmb();
- return kvm->mmu_invalidate_seq != mmu_seq;
-}
-
static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
{
/* Note, the new page offset may be different than the old! */
void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
void *new_khva = NULL;
- unsigned long mmu_seq;
lockdep_assert_held(&gpc->refresh_lock);
@@ -172,8 +152,16 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
gpc->valid = false;
do {
- mmu_seq = gpc->kvm->mmu_invalidate_seq;
- smp_rmb();
+ /*
+ * The translation made by hva_to_pfn() below could be made
+ * invalid as soon as it's mapped. But the uhva is already
+ * known and that's all that gfn_to_pfn_cache_invalidate()
+ * looks at. So set the 'validating' flag to allow the GPC
+ * to be marked invalid from the moment the lock is dropped,
+ * before the corresponding PFN is even found (and, more to
+ * the point, immediately afterwards).
+ */
+ gpc->validating = true;
write_unlock_irq(&gpc->lock);
@@ -224,7 +212,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
* attempting to refresh.
*/
WARN_ON_ONCE(gpc->valid);
- } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+
+ /*
+ * Since gfn_to_pfn_cache_invalidate() is called from the
+ * kvm_mmu_notifier_invalidate_range_end() callback, it can
+ * invalidate the GPC the moment after hva_to_pfn() returned
+ * a valid PFN. If that happens, retry.
+ */
+ } while (!gpc->validating);
gpc->valid = true;
gpc->pfn = new_pfn;
@@ -339,6 +334,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
*/
if (ret) {
gpc->valid = false;
+ gpc->validating = false;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->khva = NULL;
}
@@ -383,7 +379,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->gpa = INVALID_GPA;
gpc->uhva = KVM_HVA_ERR_BAD;
- gpc->active = gpc->valid = false;
+ gpc->active = gpc->valid = gpc->validating = false;
}
static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
@@ -453,6 +449,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
write_lock_irq(&gpc->lock);
gpc->active = false;
gpc->valid = false;
+ gpc->validating = false;
/*
* Leave the GPA => uHVA cache intact, it's protected by the
--
2.44.0
Download attachment "smime.p7s" of type "application/pkcs7-signature" (5965 bytes)
Powered by blists - more mailing lists