Message-ID: <20240821202814.711673-2-dwmw2@infradead.org>
Date: Wed, 21 Aug 2024 21:28:10 +0100
From: David Woodhouse <dwmw2@...radead.org>
To: Sean Christopherson <seanjc@...gle.com>,
Paolo Bonzini <pbonzini@...hat.com>,
"Hussain, Mushahid" <hmushi@...zon.co.uk>
Cc: Vitaly Kuznetsov <vkuznets@...hat.com>,
Wanpeng Li <wanpengli@...cent.com>,
Jim Mattson <jmattson@...gle.com>,
Joerg Roedel <joro@...tes.org>,
kvm@...r.kernel.org,
linux-kernel@...r.kernel.org,
Mingwei Zhang <mizhang@...gle.com>,
Maxim Levitsky <mlevitsk@...hat.com>
Subject: [PATCH v3 2/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry()
From: David Woodhouse <dwmw@...zon.co.uk>
The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there are any concurrent invalidations running, it's effectively just
a complex busy wait loop because its local mmu_notifier_retry_cache()
function will always return true.
Since multiple invalidations can be running in parallel, this can result
in a situation where hva_to_pfn_retry() just backs off and keeps retrying
forever, never making any progress.
Solve this by being a bit more selective about when to retry. In
addition to the ->needs_invalidation flag added in a previous commit,
check before calling hva_to_pfn() if there is any pending invalidation
(i.e. between invalidate_range_start() and invalidate_range_end()) which
affects the uHVA of the GPC being updated. This catches the case where
hva_to_pfn() would return a soon-to-be-stale mapping of a range for which
the invalidate_range_start() hook had already been called before the uHVA
was even set in the GPC and the ->needs_invalidation flag set.
An invalidation which *starts* after the lock is dropped in the loop is
fine because it will clear the ->needs_invalidation flag and also trigger
a retry.
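To make the range tracking concrete, here is a purely illustrative,
self-contained userspace C sketch (the struct and function names are
made up for the example and are not the code in this patch): all
in-flight invalidations are merged into a single [start, end) envelope,
and a uHVA is only considered "under invalidation" while it falls inside
that envelope and at least one invalidation is still active.

#include <stdbool.h>
#include <stdint.h>

struct inval_state {
	unsigned int active;	/* number of in-flight invalidations */
	uint64_t start, end;	/* single envelope covering all of them */
};

static void inval_begin(struct inval_state *s, uint64_t start, uint64_t end)
{
	if (s->active++ == 0) {
		s->start = start;
		s->end = end;
	} else {
		/* Ranges can't be subtracted later, so just grow the envelope. */
		if (start < s->start)
			s->start = start;
		if (end > s->end)
			s->end = end;
	}
}

static void inval_end(struct inval_state *s)
{
	/* Once this hits zero, the stale envelope is simply ignored. */
	s->active--;
}

static bool uhva_has_pending_invalidation(const struct inval_state *s,
					  uint64_t uhva)
{
	/* The envelope is [start, end), as with mmu_notifier ranges. */
	return s->active && s->start <= uhva && uhva < s->end;
}

This is deliberately conservative: merged ranges cannot be un-merged,
but the envelope stops mattering as soon as the active count returns to
zero.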
This is slightly more complex than it needs to be; a subsequent commit
will simplify it by moving the invalidation to the invalidate_range_end()
hook.
Signed-off-by: David Woodhouse <dwmw@...zon.co.uk>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/kvm_main.c | 20 ++++++++++++++
virt/kvm/pfncache.c | 60 +++++++++++++++++++++-------------------
3 files changed, 54 insertions(+), 28 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027..1bfe2e8d52cd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,8 @@ struct kvm {
/* For management / invalidation of gfn_to_pfn_caches */
spinlock_t gpc_lock;
struct list_head gpc_list;
+ u64 mmu_gpc_invalidate_range_start;
+ u64 mmu_gpc_invalidate_range_end;
/*
* created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d4..84eb1ebb6f47 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
*/
spin_lock(&kvm->mn_invalidate_lock);
kvm->mn_active_invalidate_count++;
+ if (likely(kvm->mn_active_invalidate_count == 1)) {
+ kvm->mmu_gpc_invalidate_range_start = range->start;
+ kvm->mmu_gpc_invalidate_range_end = range->end;
+ } else {
+ /*
+ * Fully tracking multiple concurrent ranges has diminishing
+ * returns. Keep things simple and just find the minimal range
+ * which includes the current and new ranges. As there won't be
+ * enough information to subtract a range after its invalidate
+ * completes, any ranges invalidated concurrently will
+ * accumulate and persist until all outstanding invalidates
+ * complete.
+ */
+ kvm->mmu_gpc_invalidate_range_start =
+ min(kvm->mmu_gpc_invalidate_range_start, range->start);
+ kvm->mmu_gpc_invalidate_range_end =
+ max(kvm->mmu_gpc_invalidate_range_end, range->end);
+ }
spin_unlock(&kvm->mn_invalidate_lock);
/*
@@ -1164,6 +1182,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
INIT_LIST_HEAD(&kvm->gpc_list);
spin_lock_init(&kvm->gpc_lock);
+ kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+ kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
INIT_LIST_HEAD(&kvm->devices);
kvm->max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 7007d32d197a..eeb9bf43c04a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -129,32 +129,17 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
#endif
}
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
{
/*
- * mn_active_invalidate_count acts for all intents and purposes
- * like mmu_invalidate_in_progress here; but the latter cannot
- * be used here because the invalidation of caches in the
- * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
- * is elevated.
- *
- * Note, it does not matter that mn_active_invalidate_count
- * is not protected by gpc->lock. It is guaranteed to
- * be elevated before the mmu_notifier acquires gpc->lock, and
- * isn't dropped until after mmu_invalidate_seq is updated.
+	 * gpc->uhva is stable here because the caller holds gpc->refresh_lock,
+	 * so gpc->lock does not need to be taken. The kvm-wide invalidation
+	 * fields are read under kvm->mn_invalidate_lock, taken below.
*/
- if (kvm->mn_active_invalidate_count)
- return true;
+ guard(spinlock)(&gpc->kvm->mn_invalidate_lock);
- /*
- * Ensure mn_active_invalidate_count is read before
- * mmu_invalidate_seq. This pairs with the smp_wmb() in
- * mmu_notifier_invalidate_range_end() to guarantee either the
- * old (non-zero) value of mn_active_invalidate_count or the
- * new (incremented) value of mmu_invalidate_seq is observed.
- */
- smp_rmb();
- return kvm->mmu_invalidate_seq != mmu_seq;
+ return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+ (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+ (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
}
static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -163,7 +148,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
void *new_khva = NULL;
- unsigned long mmu_seq;
lockdep_assert_held(&gpc->refresh_lock);
@@ -177,9 +161,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
gpc->valid = false;
do {
- mmu_seq = gpc->kvm->mmu_invalidate_seq;
- smp_rmb();
-
/*
* The translation made by hva_to_pfn() below could be made
* invalid as soon as it's mapped. But the uhva is already
@@ -193,6 +174,29 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
write_unlock_irq(&gpc->lock);
+ /*
+ * Invalidation occurs from the invalidate_range_start() hook,
+ * which could already have happened before __kvm_gpc_refresh()
+	 * (or the previous turn around this loop) took gpc->lock.
+ * If so, and if the corresponding invalidate_range_end() hook
+ * hasn't happened yet, hva_to_pfn() could return a mapping
+ * which is about to be stale and which should not be used. So
+ * check if there are any currently-running invalidations which
+ * affect the uHVA of this GPC, and retry if there are. Any
+ * invalidation which starts after gpc->needs_invalidation is
+ * set is fine, because it will clear that flag and trigger a
+ * retry. And any invalidation which *completes* by having its
+ * invalidate_range_end() hook called immediately prior to this
+ * check is also fine, because the page tables are guaranteed
+	 * to have been changed already, so hva_to_pfn() won't return
+ * a stale mapping in that case anyway.
+ */
+		while (gpc_invalidations_pending(gpc))
+			cond_resched();
+
/*
* If the previous iteration "failed" due to an mmu_notifier
* event, release the pfn and unmap the kernel virtual address
@@ -233,6 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
goto out_error;
}
+
write_lock_irq(&gpc->lock);
/*
@@ -240,8 +245,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
* attempting to refresh.
*/
WARN_ON_ONCE(gpc->valid);
- } while (!gpc->needs_invalidation ||
- mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+ } while (!gpc->needs_invalidation);
gpc->valid = true;
gpc->pfn = new_pfn;
--
2.44.0