Message-ID: <aCtlDhNbgXKg4s5t@google.com>
Date: Mon, 19 May 2025 10:06:22 -0700
From: Sean Christopherson <seanjc@...gle.com>
To: Rick P Edgecombe <rick.p.edgecombe@...el.com>
Cc: Yan Y Zhao <yan.y.zhao@...el.com>, "kvm@...r.kernel.org" <kvm@...r.kernel.org>,
"pbonzini@...hat.com" <pbonzini@...hat.com>, Reinette Chatre <reinette.chatre@...el.com>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 1/2] KVM: x86/mmu: Add RET_PF_RETRY_INVALID_SLOT for fault
retry on invalid slot
On Mon, May 19, 2025, Rick P Edgecombe wrote:
> On Mon, 2025-05-19 at 06:33 -0700, Sean Christopherson wrote:
> > Was this hit by a real VMM? If so, why is a TDX VMM removing a memslot without
> > kicking vCPUs out of KVM?
> >
> > Regardless, I would prefer not to add a new RET_PF_* flag for this. At a glance,
> > KVM can simply drop and reacquire SRCU in the relevant paths.
>
> During the initial debugging and kicking around stage, this was the first
> direction we looked. But kvm_gmem_populate() doesn't have srcu locked, so then
> kvm_tdp_map_page() tries to unlock without it being held. (although that version
> didn't check r == RET_PF_RETRY like you had). Yan had the following concerns and
> came up with the version in this series, which we held review on for the list:
Ah, I missed the kvm_gmem_populate() => kvm_tdp_map_page() chain.
> > However, upon further consideration, I am reluctant to implement this fix for
Which fix?
> > the following reasons:
> > - kvm_gmem_populate() already holds the kvm->slots_lock.
> > - While retrying with srcu unlock and lock can work around the
> > KVM_MEMSLOT_INVALID deadlock, it results in each kvm_vcpu_pre_fault_memory()
> > and tdx_handle_ept_violation() faulting with different memslot layouts.
This behavior has existed since pretty much the beginning of KVM time. TDX is the
oddball that doesn't re-enter the guest. All other flavors re-enter the guest on
RET_PF_RETRY, which means dropping and reacquiring SRCU. Which is why I don't like
RET_PF_RETRY_INVALID_SLOT; it's simply handling the case we know about.
Arguably, _TDX_ is buggy by not providing this behavior.
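Roughly, for the other flavors the flow looks like this (a simplified sketch,
not the actual KVM code; handle_fault() and enter_guest() are stand-ins for the
real vcpu_run()/fault paths):

	for (;;) {
		kvm_vcpu_srcu_read_lock(vcpu);
		r = handle_fault(vcpu);			/* may return RET_PF_RETRY */
		kvm_vcpu_srcu_read_unlock(vcpu);	/* memslot update can complete here */

		if (r <= 0)
			break;

		/* RET_PF_RETRY: re-enter the guest and take the fault again. */
		enter_guest(vcpu);
	}

I.e. a memslot DELETE/MOVE that is waiting in synchronize_srcu() can finish
while the vCPU is (re)entering the guest, and the retried fault then sees the
updated memslot layout instead of spinning on an invalid slot.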
> I'm not sure why the second one is really a problem. For the first one I think
> that path could just take the srcu lock in the proper order with
> kvm->slots_lock?
Acquiring SRCU inside slots_lock should be fine. The reverse order would be
problematic, as KVM synchronizes SRCU while holding slots_lock.
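I.e. (rough sketch, not the actual call sites) memslot updates do:

	mutex_lock(&kvm->slots_lock);
	/* mark the slot KVM_MEMSLOT_INVALID, install new memslots, ... */
	synchronize_srcu(&kvm->srcu);		/* waits for all SRCU readers */
	mutex_unlock(&kvm->slots_lock);

so taking the SRCU read lock while already holding slots_lock can't deadlock,
whereas an SRCU reader that then blocks on slots_lock can wait forever on the
updater, which in turn is waiting on that very reader in synchronize_srcu().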
That said, I don't love the idea of grabbing SRCU, because it's so obviously a
hack. What about something like this?
---
arch/x86/kvm/mmu.h | 2 ++
arch/x86/kvm/mmu/mmu.c | 49 +++++++++++++++++++++++++++---------------
arch/x86/kvm/vmx/tdx.c | 7 ++++--
virt/kvm/kvm_main.c | 5 ++---
4 files changed, 41 insertions(+), 22 deletions(-)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b4b6860ab971..0fc68f0fe80e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -259,6 +259,8 @@ extern bool tdp_mmu_enabled;
bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa);
int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level);
+int kvm_tdp_prefault_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
+ u8 *level);
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cbc84c6abc2e..4f16fe95173c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4851,24 +4851,15 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
{
int r;
- /*
- * Restrict to TDP page fault, since that's the only case where the MMU
- * is indexed by GPA.
- */
- if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
- return -EOPNOTSUPP;
+ if (signal_pending(current))
+ return -EINTR;
- do {
- if (signal_pending(current))
- return -EINTR;
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
+ return -EIO;
- if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
- return -EIO;
-
- cond_resched();
- r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
- } while (r == RET_PF_RETRY);
+ cond_resched();
+ r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level);
if (r < 0)
return r;
@@ -4878,10 +4869,12 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
case RET_PF_WRITE_PROTECTED:
return 0;
+ case RET_PF_RETRY:
+ return -EAGAIN;
+
case RET_PF_EMULATE:
return -ENOENT;
- case RET_PF_RETRY:
case RET_PF_CONTINUE:
case RET_PF_INVALID:
default:
@@ -4891,6 +4884,28 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level
}
EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
+int kvm_tdp_prefault_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
+{
+ int r;
+
+ /*
+ * Restrict to TDP page fault, since that's the only case where the MMU
+ * is indexed by GPA.
+ */
+ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
+ return -EOPNOTSUPP;
+
+ for (;;) {
+ r = kvm_tdp_map_page(vcpu, gpa, error_code, level);
+ if (r != -EAGAIN)
+ break;
+
+		/* Drop SRCU so a pending memslot update can complete. */
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
+	}
+
+	return r;
+}
+
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
{
@@ -4918,7 +4933,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ r = kvm_tdp_prefault_page(vcpu, range->gpa, error_code, &level);
if (r < 0)
return r;
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b952bc673271..1a232562080d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3075,8 +3075,11 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
if (ret != 1)
return -ENOMEM;
- ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
- if (ret < 0)
+ do {
+ ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+	} while (ret == -EAGAIN);
+
+ if (ret)
goto out;
/*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b24db92e98f3..21a3fa7476dd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4266,7 +4266,6 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range)
{
- int idx;
long r;
u64 full_size;
@@ -4279,7 +4278,7 @@ static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
return -EINVAL;
vcpu_load(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
full_size = range->size;
do {
@@ -4300,7 +4299,7 @@ static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
cond_resched();
} while (range->size);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
vcpu_put(vcpu);
/* Return success if at least one page was mapped successfully. */
base-commit: 12ca5c63556bbfcd77fe890fcdd1cd1adfb31fdd
--