linux-kernel - [PATCH v5] kvm: make vcpu life cycle separated from kvm instance

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1324091975-20930-1-git-send-email-kernelfans@gmail.com>
Date:	Sat, 17 Dec 2011 11:19:35 +0800
From:	Liu Ping Fan <kernelfans@...il.com>
To:	kvm@...r.kernel.org
Cc:	linux-kernel@...r.kernel.org, avi@...hat.com, aliguori@...ibm.com,
	gleb@...hat.com, mtosatti@...hat.com, jan.kiszka@....de
Subject: [PATCH v5] kvm: make vcpu life cycle separated from kvm instance

From: Liu Ping Fan <pingfank@...ux.vnet.ibm.com>

Currently, a vcpu can be destroyed only when its kvm instance is
destroyed. Decouple the vcpu life cycle from the kvm instance, so that
a vcpu CAN be destroyed on its own and MUST be destroyed before the
kvm instance itself is destroyed.

Signed-off-by: Liu Ping Fan <pingfank@...ux.vnet.ibm.com>
---
 arch/x86/kvm/i8254.c     |   10 +++--
 arch/x86/kvm/i8259.c     |   12 ++++--
 arch/x86/kvm/x86.c       |   53 +++++++++++------------
 include/linux/kvm_host.h |   20 ++++-----
 virt/kvm/irq_comm.c      |    6 ++-
 virt/kvm/kvm_main.c      |  106 ++++++++++++++++++++++++++++++++++-----------
 6 files changed, 132 insertions(+), 75 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 76e3f1c..a3a5506 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -289,9 +289,8 @@ static void pit_do_work(struct work_struct *work)
 	struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
 	struct kvm *kvm = pit->kvm;
 	struct kvm_vcpu *vcpu;
-	int i;
 	struct kvm_kpit_state *ps = &pit->pit_state;
-	int inject = 0;
+	int idx, inject = 0;
 
 	/* Try to inject pending interrupts when
 	 * last one has been acked.
@@ -315,9 +314,12 @@ static void pit_do_work(struct work_struct *work)
 		 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
 		 * VCPU0, and only if its LVT0 is in EXTINT mode.
 		 */
-		if (kvm->arch.vapics_in_nmi_mode > 0)
-			kvm_for_each_vcpu(i, vcpu, kvm)
+		if (kvm->arch.vapics_in_nmi_mode > 0) {
+			idx = srcu_read_lock(&kvm->srcu_vcpus);
+			kvm_for_each_vcpu(vcpu, kvm)
 				kvm_apic_nmi_wd_deliver(vcpu);
+			srcu_read_unlock(&kvm->srcu_vcpus, idx);
+		}
 	}
 }
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cac4746..5ef5c05 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -50,25 +50,29 @@ static void pic_unlock(struct kvm_pic *s)
 {
 	bool wakeup = s->wakeup_needed;
 	struct kvm_vcpu *vcpu, *found = NULL;
-	int i;
+	struct kvm *kvm = s->kvm;
+	int idx;
 
 	s->wakeup_needed = false;
 
 	spin_unlock(&s->lock);
 
 	if (wakeup) {
-		kvm_for_each_vcpu(i, vcpu, s->kvm) {
+		idx = srcu_read_lock(&kvm->srcu_vcpus);
+		kvm_for_each_vcpu(vcpu, kvm)
 			if (kvm_apic_accept_pic_intr(vcpu)) {
 				found = vcpu;
 				break;
 			}
-		}
 
-		if (!found)
+		if (!found) {
+			srcu_read_unlock(&kvm->srcu_vcpus, idx);
 			return;
+		}
 
 		kvm_make_request(KVM_REQ_EVENT, found);
 		kvm_vcpu_kick(found);
+		srcu_read_unlock(&kvm->srcu_vcpus, idx);
 	}
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 23c93fe..b79739d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1774,14 +1774,20 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
 	u64 data = 0;
+	int idx;
 
 	switch (msr) {
 	case HV_X64_MSR_VP_INDEX: {
-		int r;
+		int r = 0;
 		struct kvm_vcpu *v;
-		kvm_for_each_vcpu(r, v, vcpu->kvm)
+		struct kvm *kvm = vcpu->kvm;
+		idx = srcu_read_lock(&kvm->srcu_vcpus);
+		kvm_for_each_vcpu(v, vcpu->kvm) {
 			if (v == vcpu)
 				data = r;
+			r++;
+		}
+		srcu_read_unlock(&kvm->srcu_vcpus, idx);
 		break;
 	}
 	case HV_X64_MSR_EOI:
@@ -4529,7 +4535,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 	struct cpufreq_freqs *freq = data;
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
-	int i, send_ipi = 0;
+	int idx, send_ipi = 0;
 
 	/*
 	 * We allow guests to temporarily run on slowing clocks,
@@ -4579,13 +4585,16 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
 
 	raw_spin_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
+		idx = srcu_read_lock(&kvm->srcu_vcpus);
+		kvm_for_each_vcpu(vcpu, kvm) {
 			if (vcpu->cpu != freq->cpu)
 				continue;
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 			if (vcpu->cpu != smp_processor_id())
 				send_ipi = 1;
 		}
+		srcu_read_unlock(&kvm->srcu_vcpus, idx);
+
 	}
 	raw_spin_unlock(&kvm_lock);
 
@@ -5866,13 +5875,17 @@ int kvm_arch_hardware_enable(void *garbage)
 {
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
-	int i;
+	int idx;
 
 	kvm_shared_msr_cpu_online();
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		idx = srcu_read_lock(&kvm->srcu_vcpus);
+		kvm_for_each_vcpu(vcpu, kvm) {
 			if (vcpu->cpu == smp_processor_id())
 				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		}
+		srcu_read_unlock(&kvm->srcu_vcpus, idx);
+	}
 	return kvm_x86_ops->hardware_enable(garbage);
 }
 
@@ -5989,27 +6002,14 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 	vcpu_put(vcpu);
 }
 
-static void kvm_free_vcpus(struct kvm *kvm)
+void kvm_arch_vcpu_zap(struct kvm_vcpu *vcpu)
 {
-	unsigned int i;
-	struct kvm_vcpu *vcpu;
-
-	/*
-	 * Unpin any mmu pages first.
-	 */
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		kvm_clear_async_pf_completion_queue(vcpu);
-		kvm_unload_vcpu_mmu(vcpu);
-	}
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_arch_vcpu_free(vcpu);
-
-	mutex_lock(&kvm->lock);
-	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
-		kvm->vcpus[i] = NULL;
+	struct kvm *kvm = vcpu->kvm;
 
-	atomic_set(&kvm->online_vcpus, 0);
-	mutex_unlock(&kvm->lock);
+	kvm_clear_async_pf_completion_queue(vcpu);
+	kvm_unload_vcpu_mmu(vcpu);
+	kvm_arch_vcpu_free(vcpu);
+	kvm_put_kvm(kvm);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -6023,7 +6023,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_iommu_unmap_guest(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
-	kvm_free_vcpus(kvm);
 	if (kvm->arch.apic_access_page)
 		put_page(kvm->arch.apic_access_page);
 	if (kvm->arch.ept_identity_pagetable)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8c5c303..ab22828 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -115,6 +115,7 @@ enum {
 
 struct kvm_vcpu {
 	struct kvm *kvm;
+	struct list_head list;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	struct preempt_notifier preempt_notifier;
 #endif
@@ -249,13 +250,15 @@ struct kvm {
 	struct mm_struct *mm; /* userspace tied to this vm */
 	struct kvm_memslots *memslots;
 	struct srcu_struct srcu;
+	struct srcu_struct srcu_vcpus;
+
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
 	u32 bsp_vcpu_id;
 	struct kvm_vcpu *bsp_vcpu;
 #endif
-	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	struct list_head vcpus;
 	atomic_t online_vcpus;
-	int last_boosted_vcpu;
+	struct kvm_vcpu *last_boosted_vcpu;
 	struct list_head vm_list;
 	struct mutex lock;
 	struct kvm_io_bus *buses[KVM_NR_BUSES];
@@ -302,17 +305,10 @@ struct kvm {
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
 
-static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
-{
-	smp_rmb();
-	return kvm->vcpus[i];
-}
+void kvm_arch_vcpu_zap(struct kvm_vcpu *vcpu);
 
-#define kvm_for_each_vcpu(idx, vcpup, kvm) \
-	for (idx = 0; \
-	     idx < atomic_read(&kvm->online_vcpus) && \
-	     (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \
-	     idx++)
+#define kvm_for_each_vcpu(vcpu, kvm) \
+	list_for_each_entry_rcu(vcpu, &kvm->vcpus, list)
 
 #define kvm_for_each_memslot(memslot, slots)	\
 	for (memslot = &slots->memslots[0];	\
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 9f614b4..78dc97c 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -81,14 +81,15 @@ inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq)
 {
-	int i, r = -1;
+	int idx, r = -1;
 	struct kvm_vcpu *vcpu, *lowest = NULL;
 
 	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
 			kvm_is_dm_lowest_prio(irq))
 		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
 
-	kvm_for_each_vcpu(i, vcpu, kvm) {
+	idx = srcu_read_lock(&kvm->srcu_vcpus);
+	kvm_for_each_vcpu(vcpu, kvm) {
 		if (!kvm_apic_present(vcpu))
 			continue;
 
@@ -111,6 +112,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	if (lowest)
 		r = kvm_apic_set_irq(lowest, irq);
 
+	srcu_read_unlock(&kvm->srcu_vcpus, idx);
 	return r;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e289486..ec0c920 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -171,7 +171,7 @@ static void ack_flush(void *_completed)
 
 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
-	int i, cpu, me;
+	int cpu, me, idx;
 	cpumask_var_t cpus;
 	bool called = true;
 	struct kvm_vcpu *vcpu;
@@ -179,7 +179,8 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
 
 	me = get_cpu();
-	kvm_for_each_vcpu(i, vcpu, kvm) {
+	idx = srcu_read_lock(&kvm->srcu_vcpus);
+	kvm_for_each_vcpu(vcpu, kvm) {
 		kvm_make_request(req, vcpu);
 		cpu = vcpu->cpu;
 
@@ -190,12 +191,15 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
 			cpumask_set_cpu(cpu, cpus);
 	}
+	srcu_read_unlock(&kvm->srcu_vcpus, idx);
+
 	if (unlikely(cpus == NULL))
 		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
 	else if (!cpumask_empty(cpus))
 		smp_call_function_many(cpus, ack_flush, NULL, 1);
 	else
 		called = false;
+
 	put_cpu();
 	free_cpumask_var(cpus);
 	return called;
@@ -477,6 +481,8 @@ static struct kvm *kvm_create_vm(void)
 	kvm_init_memslots_id(kvm);
 	if (init_srcu_struct(&kvm->srcu))
 		goto out_err_nosrcu;
+	if (init_srcu_struct(&kvm->srcu_vcpus))
+		goto out_err_nosrcu_vcpus;
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
 					GFP_KERNEL);
@@ -500,10 +506,13 @@ static struct kvm *kvm_create_vm(void)
 	raw_spin_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
 	raw_spin_unlock(&kvm_lock);
+	INIT_LIST_HEAD(&kvm->vcpus);
 
 	return kvm;
 
 out_err:
+	cleanup_srcu_struct(&kvm->srcu_vcpus);
+out_err_nosrcu_vcpus:
 	cleanup_srcu_struct(&kvm->srcu);
 out_err_nosrcu:
 	hardware_disable_all();
@@ -587,6 +596,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	kvm_arch_destroy_vm(kvm);
 	kvm_free_physmem(kvm);
 	cleanup_srcu_struct(&kvm->srcu);
+	cleanup_srcu_struct(&kvm->srcu_vcpus);
 	kvm_arch_free_vm(kvm);
 	hardware_disable_all();
 	mmdrop(mm);
@@ -1593,11 +1603,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
 	struct kvm *kvm = me->kvm;
 	struct kvm_vcpu *vcpu;
-	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
-	int yielded = 0;
-	int pass;
-	int i;
-
+	struct task_struct *task = NULL;
+	struct pid *pid;
+	int pass, firststart, lastone, yielded, idx;
 	/*
 	 * We boost the priority of a VCPU that is runnable but not
 	 * currently running, because it got preempted by something
@@ -1605,15 +1613,22 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 	 * VCPU is holding the lock that we need and will release it.
 	 * We approximate round-robin by starting at the last boosted VCPU.
 	 */
-	for (pass = 0; pass < 2 && !yielded; pass++) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			struct task_struct *task = NULL;
-			struct pid *pid;
-			if (!pass && i < last_boosted_vcpu) {
-				i = last_boosted_vcpu;
+	for (pass = 0, firststart = 0; pass < 2 && !yielded; pass++) {
+
+		idx = srcu_read_lock(&kvm->srcu_vcpus);
+		kvm_for_each_vcpu(vcpu, kvm) {
+			if (!pass && !firststart &&
+			    vcpu != kvm->last_boosted_vcpu &&
+			    kvm->last_boosted_vcpu != NULL) {
+				vcpu = kvm->last_boosted_vcpu;
+				firststart = 1;
 				continue;
-			} else if (pass && i > last_boosted_vcpu)
+			} else if (pass && !lastone) {
+				if (vcpu == kvm->last_boosted_vcpu)
+					lastone = 1;
+			} else if (pass && lastone)
 				break;
+
 			if (vcpu == me)
 				continue;
 			if (waitqueue_active(&vcpu->wq))
@@ -1629,15 +1644,20 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 				put_task_struct(task);
 				continue;
 			}
+
 			if (yield_to(task, 1)) {
 				put_task_struct(task);
-				kvm->last_boosted_vcpu = i;
+				mutex_lock(&kvm->lock);
+				kvm->last_boosted_vcpu = vcpu;
+				mutex_unlock(&kvm->lock);
 				yielded = 1;
 				break;
 			}
 			put_task_struct(task);
 		}
+		srcu_read_unlock(&kvm->srcu_vcpus, idx);
 	}
+
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
@@ -1673,11 +1693,30 @@ static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+static void kvm_vcpu_zap(struct kvm_vcpu *vcpu)
+{
+	kvm_arch_vcpu_zap(vcpu);
+}
+
 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
+	struct kvm *kvm = vcpu->kvm;
+	filp->private_data = NULL;
+
+	mutex_lock(&kvm->lock);
+	list_del_rcu(&vcpu->list);
+	atomic_dec(&kvm->online_vcpus);
+	mutex_unlock(&kvm->lock);
+	synchronize_srcu_expedited(&kvm->srcu_vcpus);
+
+	mutex_lock(&kvm->lock);
+	if (kvm->last_boosted_vcpu == vcpu)
+		kvm->last_boosted_vcpu = NULL;
+	mutex_unlock(&kvm->lock);
 
-	kvm_put_kvm(vcpu->kvm);
+	/*vcpu is out of list,drop it safely*/
+	kvm_vcpu_zap(vcpu);
 	return 0;
 }
 
@@ -1699,15 +1738,25 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
 }
 
+static struct kvm_vcpu *kvm_vcpu_create(struct kvm *kvm, u32 id)
+{
+	struct kvm_vcpu *vcpu;
+	vcpu = kvm_arch_vcpu_create(kvm, id);
+	if (IS_ERR(vcpu))
+		return vcpu;
+	INIT_LIST_HEAD(&vcpu->list);
+	return vcpu;
+}
+
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 {
-	int r;
+	int r, idx;
 	struct kvm_vcpu *vcpu, *v;
 
-	vcpu = kvm_arch_vcpu_create(kvm, id);
+	vcpu = kvm_vcpu_create(kvm, id);
 	if (IS_ERR(vcpu))
 		return PTR_ERR(vcpu);
 
@@ -1723,13 +1772,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		goto unlock_vcpu_destroy;
 	}
 
-	kvm_for_each_vcpu(r, v, kvm)
+	idx = srcu_read_lock(&kvm->srcu_vcpus);
+	kvm_for_each_vcpu(v, kvm) {
 		if (v->vcpu_id == id) {
 			r = -EEXIST;
+			srcu_read_unlock(&kvm->srcu_vcpus, idx);
 			goto unlock_vcpu_destroy;
 		}
-
-	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+	}
+	srcu_read_unlock(&kvm->srcu_vcpus, idx);
 
 	/* Now it's all set up, let userspace reach it */
 	kvm_get_kvm(kvm);
@@ -1739,8 +1790,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 		goto unlock_vcpu_destroy;
 	}
 
-	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
-	smp_wmb();
+	/*Protected by kvm->lock*/
+	list_add_rcu(&vcpu->list, &kvm->vcpus);
 	atomic_inc(&kvm->online_vcpus);
 
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
@@ -2645,13 +2696,16 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
-	int i;
+	int idx;
 
 	*val = 0;
 	raw_spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		idx = srcu_read_lock(&kvm->srcu_vcpus);
+		kvm_for_each_vcpu(vcpu, kvm)
 			*val += *(u32 *)((void *)vcpu + offset);
+		srcu_read_unlock(&kvm->srcu_vcpus, idx);
+	}
 
 	raw_spin_unlock(&kvm_lock);
 	return 0;
-- 
1.7.4.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/