Message-ID: <d806ed85d2487a7798769de39c5b3f33c2f93b54.camel@infradead.org>
Date: Tue, 06 Aug 2024 17:40:49 +0100
From: David Woodhouse <dwmw2@...radead.org>
To: Sean Christopherson <seanjc@...gle.com>
Cc: Paolo Bonzini <pbonzini@...hat.com>, Mushahid Hussain
<hmushi@...zon.co.uk>, Vitaly Kuznetsov <vkuznets@...hat.com>, Wanpeng Li
<wanpengli@...cent.com>, Jim Mattson <jmattson@...gle.com>, Joerg Roedel
<joro@...tes.org>, kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
Mingwei Zhang <mizhang@...gle.com>, Maxim Levitsky <mlevitsk@...hat.com>
Subject: Re: [PATCH] KVM: Move gfn_to_pfn_cache invalidation to
invalidate_range_end hook

On Tue, 2024-08-06 at 08:57 -0700, Sean Christopherson wrote:
> The last one is key: the pages have already been freed when invalidate_range_end()
> is called, and so unmapping at that time would be too late.

OK, thanks.

> The obvious downside is what you've run into, where the start+end approach forces
> KVM to wait for all in-flight invalidations to go away. But again, in practice
> the rudimentary range tracking suffices for all known use cases.

Makes sense.

> I suspect you're trying to solve a problem that doesn't exist in practice.
> hva_to_pfn_retry() already has a cond_resched(), so getting stuck for a long
> duration isn't fatal, just suboptimal. And similar to the range-based tracking,
> _if_ there's a problem in practice, then it also affects guest page faults. KVM
> simply resumes the vCPU and keeps re-faulting until the in-flight invalidation(s)
> has gone away.

Yeah. When it's looping on actual page faults it's easy to pretend it
isn't a busy-loop :)
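
(For reference, the fault-side retry that Sean describes boils down to
something like the sketch below. This is simplified pseudo-code rather
than the literal x86 MMU code, and the function name is made up.)

static int example_fault_path(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long mmu_seq;
	kvm_pfn_t pfn;
	int r = RET_PF_RETRY;

	/* Snapshot the invalidation sequence before resolving the pfn. */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	pfn = gfn_to_pfn(kvm, gfn);	/* may sleep */
	if (is_error_noslot_pfn(pfn))
		return RET_PF_EMULATE;

	write_lock(&kvm->mmu_lock);

	/* An invalidation raced with us; let the vCPU resume and re-fault. */
	if (mmu_invalidate_retry(kvm, mmu_seq))
		goto out_unlock;

	/* ... install the mapping ... */
	r = RET_PF_FIXED;
out_unlock:
	write_unlock(&kvm->mmu_lock);
	kvm_release_pfn_clean(pfn);
	return r;
}
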
Making it wait is fairly simple; it would be easy to just make it
cond_resched() instead though. Here's what I have so far (incremental).
https://git.infradead.org/?p=users/dwmw2/linux.git;a=commitdiff;h=pfncache-2
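
(For comparison, the cond_resched() variant would be something like the
below -- hypothetical and untested, reusing the gpc_invalidations_pending()
helper from the diff that follows.)

static void gpc_spin_for_invalidations(struct gfn_to_pfn_cache *gpc)
{
	/*
	 * Busy-wait alternative to sleeping on gpc_invalidate_wq: yield
	 * the CPU until no overlapping invalidation is in flight. The
	 * racy, unlocked check is tolerable because hva_to_pfn_retry()
	 * still revalidates against the mmu_notifier sequence afterwards.
	 */
	while (gpc_invalidations_pending(gpc))
		cond_resched();
}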

I need to revive the torture test you had at the end of your earlier series.

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 689e8be873a7..9b5261e11802 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,9 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
+	wait_queue_head_t gpc_invalidate_wq;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ffd6ab4c2a16..a243f9f8a9c0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = range->start;
+		kvm->mmu_gpc_invalidate_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, range->start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, range->end);
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -830,21 +848,27 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
 	/* Pairs with the increment in range_start(). */
 	spin_lock(&kvm->mn_invalidate_lock);
 	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
 		--kvm->mn_active_invalidate_count;
 	wake = !kvm->mn_active_invalidate_count;
+	if (wake) {
+		kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+		kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
-	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
-
 	/*
 	 * There can only be one waiter, since the wait happens under
 	 * slots_lock.
 	 */
-	if (wake)
+	if (wake) {
+		wake_up(&kvm->gpc_invalidate_wq);
 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+	}
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1154,7 +1178,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
-
+	init_waitqueue_head(&kvm->gpc_invalidate_wq);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 	INIT_LIST_HEAD(&kvm->devices);
 	kvm->max_vcpus = KVM_MAX_VCPUS;
 
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 187b58150ef7..dad32ef5ac87 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -133,6 +133,39 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
+{
+	/*
+	 * No need for locking on GPC here because these fields are protected
+	 * by gpc->refresh_lock.
+	 */
+	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
+
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+	spin_lock(&gpc->kvm->mn_invalidate_lock);
+	if (gpc_invalidations_pending(gpc)) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		for (;;) {
+			prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+			if (!gpc_invalidations_pending(gpc))
+				break;
+
+			spin_unlock(&gpc->kvm->mn_invalidate_lock);
+			schedule();
+			spin_lock(&gpc->kvm->mn_invalidate_lock);
+		}
+		finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+	}
+	spin_unlock(&gpc->kvm->mn_invalidate_lock);
+}
+
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
 	/* Note, the new page offset may be different than the old! */
@@ -205,6 +238,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			goto out_error;
 		}
 
+		/*
+		 * Avoid populating a GPC with a user address which is already
+		 * being invalidated, if the invalidate_range_start() notifier
+		 * has already been called. The actual invalidation happens in
+		 * the invalidate_range_end() callback, so wait until there are
+		 * no active invalidations (for the relevant HVA).
+		 */
+		gpc_wait_for_invalidations(gpc);
+
 		write_lock_irq(&gpc->lock);
 
 		/*