Message-ID: <d806ed85d2487a7798769de39c5b3f33c2f93b54.camel@infradead.org>
Date: Tue, 06 Aug 2024 17:40:49 +0100
From: David Woodhouse <dwmw2@...radead.org>
To: Sean Christopherson <seanjc@...gle.com>
Cc: Paolo Bonzini <pbonzini@...hat.com>, Mushahid Hussain
 <hmushi@...zon.co.uk>,  Vitaly Kuznetsov <vkuznets@...hat.com>, Wanpeng Li
 <wanpengli@...cent.com>, Jim Mattson <jmattson@...gle.com>,  Joerg Roedel
 <joro@...tes.org>, kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
 Mingwei Zhang <mizhang@...gle.com>, Maxim Levitsky <mlevitsk@...hat.com>
Subject: Re: [PATCH] KVM: Move gfn_to_pfn_cache invalidation to
 invalidate_range_end hook

On Tue, 2024-08-06 at 08:57 -0700, Sean Christopherson wrote:
> The last one is key: the pages have already been freed when invalidate_range_end()
> is called, and so unmapping at that time would be too late.

OK, thanks.
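
For anyone following along, the relevant excerpt of the existing hook
wiring in virt/kvm/kvm_main.c is below; the comments are mine and only
restate the ordering constraint, they are not in the source:

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	/*
	 * Runs while the primary MMU still has the pages mapped: the
	 * last point at which it is safe to stop using / unmap them.
	 */
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	/*
	 * Runs after the primary MMU has unmapped (and possibly freed)
	 * the pages: too late to start unmapping a gfn_to_pfn_cache.
	 */
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	/* ... remaining callbacks elided ... */
};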

> The obvious downside is what you've run into, where the start+end approach forces
> KVM to wait for all in-flight invalidations to go away.  But again, in practice
> the rudimentary range tracking suffices for all known use cases.

Makes sense.

> I suspect you're trying to solve a problem that doesn't exist in practice.
> hva_to_pfn_retry() already has a cond_resched(), so getting stuck for a long
> duration isn't fatal, just suboptimal.  And similar to the range-based tracking,
> _if_ there's a problem in practice, then it also affects guest page faults.  KVM
> simply resumes the vCPU and keeps re-faulting until the in-flight invalidation(s)
> have gone away.

Yeah. When it's looping on actual page faults it's easy to pretend it
isn't a busy-loop :)
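
For reference, the fault-side pattern being described is roughly the
sketch below. mmu_invalidate_seq and mmu_invalidate_retry() are the
real names from include/linux/kvm_host.h; the function name, the
return convention and the use of the x86-style rwlock mmu_lock are
just illustrative:

static bool sketch_try_install_mapping(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	unsigned long mmu_seq = vcpu->kvm->mmu_invalidate_seq;

	/* Pairs with the barriers on the invalidation side. */
	smp_rmb();

	/* ... resolve gfn to a pfn; this part may sleep ... */

	write_lock(&vcpu->kvm->mmu_lock);
	if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) {
		/*
		 * An invalidation began (or was in flight) after the
		 * sequence count was sampled: install nothing, resume
		 * the vCPU and let it take the same fault again.
		 */
		write_unlock(&vcpu->kvm->mmu_lock);
		return false;
	}

	/* ... install the mapping ... */

	write_unlock(&vcpu->kvm->mmu_lock);
	return true;
}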

Making it wait is fairly simple, though it would be just as easy to
make it cond_resched() instead. Here's what I have so far (incremental):

https://git.infradead.org/?p=users/dwmw2/linux.git;a=commitdiff;h=pfncache-2

I need to revive the torture test you had at the end of your earlier series.
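
For comparison, the cond_resched() variant would reduce the new wait
helper to roughly the following, reusing gpc_invalidations_pending()
from the patch below and treating the check as purely best-effort
(i.e. without taking mn_invalidate_lock around it, which the waitqueue
version does):

static void gpc_poll_for_invalidations(struct gfn_to_pfn_cache *gpc)
{
	/*
	 * Best-effort back-off: the point is only to avoid repopulating
	 * a mapping that an in-flight invalidation is about to tear down
	 * again, so a stale read here is harmless for correctness and
	 * just costs an extra refresh of the cache.
	 */
	while (gpc_invalidations_pending(gpc))
		cond_resched();
}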

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 689e8be873a7..9b5261e11802 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,9 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
+	wait_queue_head_t gpc_invalidate_wq;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ffd6ab4c2a16..a243f9f8a9c0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = range->start;
+		kvm->mmu_gpc_invalidate_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, range->start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, range->end);
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -830,21 +848,27 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
 	/* Pairs with the increment in range_start(). */
 	spin_lock(&kvm->mn_invalidate_lock);
 	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
 		--kvm->mn_active_invalidate_count;
 	wake = !kvm->mn_active_invalidate_count;
+	if (wake) {
+		kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+		kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
-	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
-
 	/*
 	 * There can only be one waiter, since the wait happens under
 	 * slots_lock.
 	 */
-	if (wake)
+	if (wake) {
+		wake_up(&kvm->gpc_invalidate_wq);
 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+	}
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1154,7 +1178,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
-
+	init_waitqueue_head(&kvm->gpc_invalidate_wq);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 	INIT_LIST_HEAD(&kvm->devices);
 	kvm->max_vcpus = KVM_MAX_VCPUS;
 
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 187b58150ef7..dad32ef5ac87 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -133,6 +133,39 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
+{
+	/*
+	 * The caller holds gpc->refresh_lock, so gpc->uhva is stable here;
+	 * the kvm-wide invalidation fields are read under mn_invalidate_lock.
+	 */
+	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
+
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+	spin_lock(&gpc->kvm->mn_invalidate_lock);
+	if (gpc_invalidations_pending(gpc)) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		for (;;) {
+			prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+			if (!gpc_invalidations_pending(gpc))
+				break;
+
+			spin_unlock(&gpc->kvm->mn_invalidate_lock);
+			schedule();
+			spin_lock(&gpc->kvm->mn_invalidate_lock);
+		}
+		finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+	}
+	spin_unlock(&gpc->kvm->mn_invalidate_lock);
+}
+
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
 	/* Note, the new page offset may be different than the old! */
@@ -205,6 +238,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			goto out_error;
 		}
 
+		/*
+		 * Avoid populating a GPC with a user address which is already
+		 * being invalidated, if the invalidate_range_start() notifier
+		 * has already been called. The actual invalidation happens in
+		 * the invalidate_range_end() callback, so wait until there are
+		 * no active invalidations (for the relevant HVA).
+		 */
+		gpc_wait_for_invalidations(gpc);
+
 		write_lock_irq(&gpc->lock);
 
 		/*
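
For context on where this wait actually bites: hva_to_pfn_retry() is
only reached via a refresh, so a typical GPC user looks roughly like
the existing pattern in arch/x86/kvm/xen.c (a sketch; exact call sites
and error handling vary):

static void sketch_gpc_user(struct gfn_to_pfn_cache *gpc)
{
	unsigned long flags;

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* This refresh is where the new wait (or poll) happens. */
		if (kvm_gpc_refresh(gpc, PAGE_SIZE))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* ... access guest data through gpc->khva under gpc->lock ... */

	read_unlock_irqrestore(&gpc->lock, flags);
}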

