Message-ID: <Z5QhGndjNwYdnIZF@google.com>
Date: Fri, 24 Jan 2025 15:44:55 -0800
From: Sean Christopherson <seanjc@...gle.com>
To: Paolo Bonzini <pbonzini@...hat.com>
Cc: linux-kernel@...r.kernel.org, kvm <kvm@...r.kernel.org>
Subject: Re: [PATCH 1/2] KVM: x86: fix usage of kvm_lock in set_nx_huge_pages()
On Fri, Jan 24, 2025, Paolo Bonzini wrote:
> On Fri, Jan 24, 2025 at 21:11, Sean Christopherson <seanjc@...gle.com> wrote:
> > Heh, except it's all kinds of broken.
>
> Yes, I didn't even try.
>
> > IMO, biting the bullet and converting to
> > an SRCU-protected list is going to be far less work in the long run.
>
> I did try a long SRCU critical section and it was unreviewable. It
> ends up a lot less manageable than just making the lock a leaf,
> especially as the lock hierarchy spans multiple subsystems (static
> key, KVM, cpufreq---thanks CPU hotplug lock...).
I'm not following. If __kvmclock_cpufreq_notifier() and set_nx_huge_pages()
switch to SRCU, then the deadlock goes away (it might even go away if just one
of those two switches).
SRCU readers would only interact with kvm_destroy_vm() from a locking perspective,
and if that's problematic then we would already have a plethora of issues.
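To make that concrete, here's a heavily condensed sketch of the reader side.  The
vm_list_srcu global matches what the patch below adds; walk_vms() is purely an
illustrative helper, not something the patch introduces.

#include <linux/kvm_host.h>
#include <linux/rculist.h>
#include <linux/srcu.h>

/* New global SRCU instance, alongside kvm_lock and vm_list. */
DEFINE_SRCU(vm_list_srcu);

/*
 * Readers never take kvm_lock, which removes the cpu_hotplug_lock =>
 * kvm_lock and kvm_lock => kvm->slots_lock edges from the deadlock chain.
 * kvm_destroy_vm() unlinks the VM and then does
 * synchronize_srcu(&vm_list_srcu) before tearing anything down, so readers
 * never see a partially destroyed VM.
 */
static void walk_vms(void (*fn)(struct kvm *kvm))
{
        struct kvm *kvm;
        int idx;

        idx = srcu_read_lock(&vm_list_srcu);
        list_for_each_entry_srcu(kvm, &vm_list, vm_list,
                                 srcu_read_lock_held(&vm_list_srcu))
                fn(kvm);
        srcu_read_unlock(&vm_list_srcu, idx);
}
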
> I also didn't like adding a synchronization primitive that's... kinda
> single-use, but that would not be a blocker of course.
It would be single use in that it only protects pure readers of vm_list, but there
are plenty of those users.
> So the second attempt was regular RCU, which looked a lot like this
> patch. I started writing all the dances to find a struct kvm that
> passes kvm_get_kvm_safe() before you do rcu_read_unlock() and drop the
> previous one (because you cannot do kvm_put_kvm() within the RCU read
> side) and set aside the idea, incorrectly thinking that they were not
> needed with kvm_lock. Plus I didn't like having to keep alive a bunch
> of data for a whole grace period if call_rcu() is used.
>
> So for the third attempt I could have chosen between dropping the SRCU
> or just using kvm_lock. I didn't even think of SRCU to be honest,
> because everything so far looked so bad, but it does seem a little
> better than RCU. At least, if kvm_destroy_vm() uses call_srcu(), you
> can call kvm_put_kvm() within srcu_read_lock()...srcu_read_unlock().
> It would look something like
>
> list_for_each_entry_srcu(kvm, &vm_list, vm_list, 1) {
> if (!kvm_get_kvm_safe(kvm))
Unless I'm missing something, we shouldn't need to take a reference so long as
SRCU is synchronized before destroying any part of the VM. If we don't take a
reference, then we don't need to deal with the complexity of kvm_put_kvm()
creating a recursive lock snafu.
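FWIW, the refcounting dance that this avoids would look something like the below
(completely untested, purely illustrative): grab a reference to the next VM before
exiting the read-side critical section, and drop the reference to the previous VM
only after exiting, because kvm_put_kvm() can end up destroying the VM and so
can't be called from within the read side.

#include <linux/kvm_host.h>
#include <linux/rculist.h>

/* Illustrative helper only, not part of the patch below. */
static void walk_vms_with_refs(void (*fn)(struct kvm *kvm))
{
        struct kvm *kvm, *prev = NULL;

        rcu_read_lock();
        list_for_each_entry_rcu(kvm, &vm_list, vm_list) {
                /* Skip VMs that are already on their way out. */
                if (!kvm_get_kvm_safe(kvm))
                        continue;

                rcu_read_unlock();

                /*
                 * Dropping the previous reference may destroy that VM, and
                 * so may sleep, i.e. must be done outside the read-side
                 * critical section.
                 */
                if (prev)
                        kvm_put_kvm(prev);

                fn(kvm);
                prev = kvm;

                /*
                 * The reference on @kvm keeps it on vm_list, so resuming
                 * the walk from @kvm after re-entering the critical section
                 * is safe.
                 */
                rcu_read_lock();
        }
        rcu_read_unlock();

        if (prev)
                kvm_put_kvm(prev);
}
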
This is what I'm thinking, lightly tested...
---
From: Sean Christopherson <seanjc@...gle.com>
Date: Fri, 24 Jan 2025 15:15:05 -0800
Subject: [PATCH] KVM: Use an SRCU lock to protect readers of vm_list
Introduce a global SRCU lock to protect KVM's global list of VMs, and use
it in all locations that currently take kvm_lock purely to prevent a VM
from being destroyed.
Keep using kvm_lock for flows that need to prevent VMs from being created,
as SRCU synchronization only guards against use-after-free; it doesn't
ensure a stable vm_list for readers.
This fixes a largely theoretical deadlock where:
- __kvm_set_memory_region() waits for kvm->srcu with kvm->slots_lock taken
- set_nx_huge_pages() waits for kvm->slots_lock with kvm_lock taken
- __kvmclock_cpufreq_notifier() waits for kvm_lock with cpu_hotplug_lock taken
- KVM_RUN waits for cpu_hotplug_lock with kvm->srcu taken
and therefore __kvm_set_memory_region() never completes
synchronize_srcu(&kvm->srcu).
  __kvm_set_memory_region()
    lock(&kvm->slots_lock)

          set_nx_huge_pages()
            lock(kvm_lock)
            lock(&kvm->slots_lock)

                  __kvmclock_cpufreq_notifier()
                    lock(cpu_hotplug_lock)
                    lock(kvm_lock)

                          lock(&kvm->srcu)
                          kvm_lapic_set_base()
                            static_branch_inc()
                              lock(cpu_hotplug_lock)

    sync(&kvm->srcu)
Opportunistically add macros to walk the list of VMs, and the array of
vCPUs in each VM, to cut down on the amount of boilerplate.
Signed-off-by: Sean Christopherson <seanjc@...gle.com>
---
arch/x86/kvm/mmu/mmu.c | 19 +++++++++++------
arch/x86/kvm/x86.c | 36 +++++++++++++++++--------------
include/linux/kvm_host.h | 9 ++++++++
virt/kvm/kvm_main.c | 46 +++++++++++++++++++++++++---------------
4 files changed, 71 insertions(+), 39 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 74fa38ebddbf..f5b7ceb7ca0e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7127,6 +7127,10 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
} else if (sysfs_streq(val, "never")) {
new_val = 0;
+ /*
+ * Take kvm_lock to ensure no VMs are *created* before the flag
+ * is set. vm_list_srcu only protects against VMs being *deleted*.
+ */
mutex_lock(&kvm_lock);
if (!list_empty(&vm_list)) {
mutex_unlock(&kvm_lock);
@@ -7142,17 +7146,19 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
if (new_val != old_val) {
struct kvm *kvm;
+ int idx;
- mutex_lock(&kvm_lock);
+ idx = srcu_read_lock(&vm_list_srcu);
- list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vm_srcu(kvm) {
mutex_lock(&kvm->slots_lock);
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
}
- mutex_unlock(&kvm_lock);
+
+ srcu_read_unlock(&vm_list_srcu, idx);
}
return 0;
@@ -7275,13 +7281,14 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
if (is_recovery_enabled &&
(!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
+ int idx;
- mutex_lock(&kvm_lock);
+ idx = srcu_read_lock(&vm_list_srcu);
- list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vm_srcu(kvm)
vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
- mutex_unlock(&kvm_lock);
+ srcu_read_unlock(&vm_list_srcu, idx);
}
return err;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2d9a16fd4d3..8fb49237d179 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9428,6 +9428,11 @@ static void kvm_hyperv_tsc_notifier(void)
struct kvm *kvm;
int cpu;
+ /*
+ * Take kvm_lock, not just vm_list_srcu, to prevent new VMs from coming
+ * along in the middle of the update and not getting the in-progress
+ * request.
+ */
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_make_mclock_inprogress_request(kvm);
@@ -9456,7 +9461,7 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int send_ipi = 0;
+ int send_ipi = 0, idx;
unsigned long i;
/*
@@ -9500,17 +9505,16 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->cpu != cpu)
- continue;
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- if (vcpu->cpu != raw_smp_processor_id())
- send_ipi = 1;
- }
+ idx = srcu_read_lock(&vm_list_srcu);
+ kvm_for_each_vcpu_in_each_vm(kvm, vcpu, i) {
+ if (vcpu->cpu != cpu)
+ continue;
+
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != raw_smp_processor_id())
+ send_ipi = 1;
}
- mutex_unlock(&kvm_lock);
+ srcu_read_unlock(&vm_list_srcu, idx);
if (freq->old < freq->new && send_ipi) {
/*
@@ -9588,13 +9592,13 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
struct kvm *kvm;
struct kvm_vcpu *vcpu;
unsigned long i;
+ int idx;
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ idx = srcu_read_lock(&vm_list_srcu);
+ kvm_for_each_vcpu_in_each_vm(kvm, vcpu, i)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ srcu_read_unlock(&vm_list_srcu, idx);
atomic_set(&kvm_guest_has_master_clock, 0);
- mutex_unlock(&kvm_lock);
}
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9df590e8f3da..0d0edb697160 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -193,6 +193,11 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
extern struct mutex kvm_lock;
extern struct list_head vm_list;
+extern struct srcu_struct vm_list_srcu;
+
+#define kvm_for_each_vm_srcu(__kvm) \
+ list_for_each_entry_srcu(__kvm, &vm_list, vm_list, \
+ srcu_read_lock_held(&vm_list_srcu))
struct kvm_io_range {
gpa_t addr;
@@ -1001,6 +1006,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
return NULL;
}
+#define kvm_for_each_vcpu_in_each_vm(__kvm, __vcpu, __i) \
+ kvm_for_each_vm_srcu(__kvm) \
+ kvm_for_each_vcpu(__i, __vcpu, __kvm)
+
void kvm_destroy_vcpus(struct kvm *kvm);
void vcpu_load(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e0b9d6dd6a85..7fcc4433bf35 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -109,6 +109,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);
+DEFINE_SRCU(vm_list_srcu);
static struct kmem_cache *kvm_vcpu_cache;
@@ -1261,13 +1262,21 @@ static void kvm_destroy_vm(struct kvm *kvm)
int i;
struct mm_struct *mm = kvm->mm;
+ mutex_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+ mutex_unlock(&kvm_lock);
+
+ /*
+ * Ensure all readers of the global list go away before destroying any
+ * aspect of the VM. After this, the VM object is reachable only via
+ * this task and notifiers that are registered to the VM itself.
+ */
+ synchronize_srcu(&vm_list_srcu);
+
kvm_destroy_pm_notifier(kvm);
kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
kvm_destroy_vm_debugfs(kvm);
kvm_arch_sync_events(kvm);
- mutex_lock(&kvm_lock);
- list_del(&kvm->vm_list);
- mutex_unlock(&kvm_lock);
kvm_arch_pre_destroy_vm(kvm);
kvm_free_irq_routing(kvm);
@@ -6096,14 +6105,16 @@ static int vm_stat_get(void *_offset, u64 *val)
unsigned offset = (long)_offset;
struct kvm *kvm;
u64 tmp_val;
+ int idx;
*val = 0;
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&vm_list_srcu);
+ kvm_for_each_vm_srcu(kvm) {
kvm_get_stat_per_vm(kvm, offset, &tmp_val);
*val += tmp_val;
}
- mutex_unlock(&kvm_lock);
+ srcu_read_unlock(&vm_list_srcu, idx);
+
return 0;
}
@@ -6111,15 +6122,15 @@ static int vm_stat_clear(void *_offset, u64 val)
{
unsigned offset = (long)_offset;
struct kvm *kvm;
+ int idx;
if (val)
return -EINVAL;
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&vm_list_srcu);
+ kvm_for_each_vm_srcu(kvm)
kvm_clear_stat_per_vm(kvm, offset);
- }
- mutex_unlock(&kvm_lock);
+ srcu_read_unlock(&vm_list_srcu, idx);
return 0;
}
@@ -6132,14 +6143,15 @@ static int vcpu_stat_get(void *_offset, u64 *val)
unsigned offset = (long)_offset;
struct kvm *kvm;
u64 tmp_val;
+ int idx;
*val = 0;
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&vm_list_srcu);
+ kvm_for_each_vm_srcu(kvm) {
kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
*val += tmp_val;
}
- mutex_unlock(&kvm_lock);
+ srcu_read_unlock(&vm_list_srcu, idx);
return 0;
}
@@ -6147,15 +6159,15 @@ static int vcpu_stat_clear(void *_offset, u64 val)
{
unsigned offset = (long)_offset;
struct kvm *kvm;
+ int idx;
if (val)
return -EINVAL;
- mutex_lock(&kvm_lock);
- list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&vm_list_srcu);
+ kvm_for_each_vm_srcu(kvm)
kvm_clear_stat_per_vcpu(kvm, offset);
- }
- mutex_unlock(&kvm_lock);
+ srcu_read_unlock(&vm_list_srcu, idx);
return 0;
}
base-commit: eb723766b1030a23c38adf2348b7c3d1409d11f0
--