lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250401161106.790710-14-pbonzini@redhat.com>
Date: Tue,  1 Apr 2025 18:10:50 +0200
From: Paolo Bonzini <pbonzini@...hat.com>
To: linux-kernel@...r.kernel.org,
	kvm@...r.kernel.org
Cc: roy.hopkins@...e.com,
	seanjc@...gle.com,
	thomas.lendacky@....com,
	ashish.kalra@....com,
	michael.roth@....com,
	jroedel@...e.de,
	nsaenz@...zon.com,
	anelkz@...zon.de,
	James.Bottomley@...senPartnership.com
Subject: [PATCH 13/29] KVM: implement vCPU creation for extra planes

For userspace to have fun with planes it is probably useful to let them
create vCPUs on the non-zero planes as well.  Since such vCPUs are backed
by the same struct kvm_vcpu, these are regular vCPU file descriptors except
that they only allow a small subset of ioctls (mostly get/set) and they
share some of the backing resources, notably vcpu->run.

TODO: prefault might be useful on non-default planes as well?

Signed-off-by: Paolo Bonzini <pbonzini@...hat.com>
---
 Documentation/virt/kvm/locking.rst |   3 +
 include/linux/kvm_host.h           |   4 +-
 include/uapi/linux/kvm.h           |   1 +
 virt/kvm/kvm_main.c                | 167 +++++++++++++++++++++++------
 4 files changed, 142 insertions(+), 33 deletions(-)

diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index ae8bce7fecbe..ad22344deb28 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -26,6 +26,9 @@ The acquisition orders for mutexes are as follows:
   are taken on the waiting side when modifying memslots, so MMU notifiers
   must not take either kvm->slots_lock or kvm->slots_arch_lock.
 
+- when VMs have multiple planes, vcpu->mutex for plane 0 can be taken
+  outside vcpu->mutex for the same id and another plane
+
 cpus_read_lock() vs kvm_lock:
 
 - Taking cpus_read_lock() outside of kvm_lock is problematic, despite that
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b511aed2de8e..99fd90c5d71b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -343,6 +343,9 @@ struct kvm_vcpu {
 
 	struct mutex mutex;
 
+	/* Only valid on plane 0 */
+	bool wants_to_run;
+
 	/* Shared for all planes */
 	struct kvm_run *run;
 
@@ -388,7 +391,6 @@ struct kvm_vcpu {
 		bool dy_eligible;
 	} spin_loop;
 #endif
-	bool wants_to_run;
 	bool preempted;
 	bool ready;
 	bool scheduled_out;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 96d25c7fa18f..24fa002cd7c1 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1691,5 +1691,6 @@ struct kvm_pre_fault_memory {
 };
 
 #define KVM_CREATE_PLANE	_IO(KVMIO, 0xd6)
+#define KVM_CREATE_VCPU_PLANE	_IO(KVMIO, 0xd7)
 
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 863fd80ddfbe..06fa2a6ad96f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -438,11 +438,11 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 }
 #endif
 
-static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_plane *plane, unsigned id)
 {
 	mutex_init(&vcpu->mutex);
 	vcpu->cpu = -1;
-	vcpu->kvm = kvm;
+	vcpu->kvm = plane->kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
 	rwlock_init(&vcpu->pid_lock);
@@ -459,8 +459,13 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->last_used_slot = NULL;
 
 	/* Fill the stats id string for the vcpu */
-	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
-		 task_pid_nr(current), id);
+	if (plane->plane) {
+		snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d:%d",
+			 task_pid_nr(current), id, plane->plane);
+	} else {
+		snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
+			 task_pid_nr(current), id);
+	}
 }
 
 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -475,7 +480,9 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 	 */
 	put_pid(vcpu->pid);
 
-	free_page((unsigned long)vcpu->run);
+	if (!vcpu->plane)
+		free_page((unsigned long)vcpu->run);
+
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
@@ -4026,6 +4033,9 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
 	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
 	struct page *page;
 
+	if (vcpu->plane)
+		return VM_FAULT_SIGBUS;
+
 	if (vmf->pgoff == 0)
 		page = virt_to_page(vcpu->run);
 #ifdef CONFIG_X86
@@ -4113,7 +4123,10 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 	if (!debugfs_initialized())
 		return;
 
-	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
+	if (vcpu->plane)
+		snprintf(dir_name, sizeof(dir_name), "vcpu%d:%d", vcpu->vcpu_id, vcpu->plane);
+	else
+		snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
 	debugfs_dentry = debugfs_create_dir(dir_name,
 					    vcpu->kvm->debugfs_dentry);
 	debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
@@ -4126,9 +4139,10 @@ static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
+static int kvm_vm_ioctl_create_vcpu(struct kvm_plane *plane, struct kvm_vcpu *plane0_vcpu, unsigned long id)
 {
 	int r;
+	struct kvm *kvm = plane->kvm;
 	struct kvm_vcpu *vcpu;
 	struct page *page;
 
@@ -4165,24 +4179,33 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 		goto vcpu_decrement;
 	}
 
-	BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
-	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (!page) {
-		r = -ENOMEM;
-		goto vcpu_free;
-	}
-	vcpu->run = page_address(page);
+	if (plane->plane) {
+		page = NULL;
+		vcpu->run = plane0_vcpu->run;
+	} else {
+		WARN_ON(plane0_vcpu != NULL);
+		plane0_vcpu = vcpu;
 
-	if (kvm->dirty_ring_size) {
-		r = kvm_dirty_ring_alloc(kvm, &vcpu->__dirty_ring,
-					 id, kvm->dirty_ring_size);
-		if (r)
-			goto vcpu_free_run_page;
+		BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
+		page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+		if (!page) {
+			r = -ENOMEM;
+			goto vcpu_free;
+		}
+		vcpu->run = page_address(page);
+
+		if (kvm->dirty_ring_size) {
+			r = kvm_dirty_ring_alloc(kvm, &vcpu->__dirty_ring,
+						 id, kvm->dirty_ring_size);
+			if (r)
+				goto vcpu_free_run_page;
+		}
 	}
 
-	vcpu->plane0 = vcpu;
-	vcpu->stat = &vcpu->__stat;
-	kvm_vcpu_init(vcpu, kvm, id);
+	vcpu->plane0 = plane0_vcpu;
+	vcpu->stat = &plane0_vcpu->__stat;
+	vcpu->dirty_ring = &plane0_vcpu->__dirty_ring;
+	kvm_vcpu_init(vcpu, plane, id);
 
 	r = kvm_arch_vcpu_create(vcpu);
 	if (r)
@@ -4190,7 +4213,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 
 	mutex_lock(&kvm->lock);
 
-	if (kvm_get_vcpu_by_id(kvm, id)) {
+	if (kvm_get_plane_vcpu_by_id(plane, id)) {
 		r = -EEXIST;
 		goto unlock_vcpu_destroy;
 	}
@@ -4200,8 +4223,13 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	 * release semantics, which ensures the write is visible to kvm_get_vcpu().
 	 */
 	vcpu->plane = -1;
-	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
-	r = xa_insert(&kvm->planes[0]->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
+	if (plane->plane)
+		vcpu->vcpu_idx = plane0_vcpu->vcpu_idx;
+	else
+		vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
+
+	r = xa_insert(&plane->vcpu_array, vcpu->vcpu_idx,
+		      vcpu, GFP_KERNEL_ACCOUNT);
 	WARN_ON_ONCE(r == -EBUSY);
 	if (r)
 		goto unlock_vcpu_destroy;
@@ -4220,13 +4248,14 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 	if (r < 0)
 		goto kvm_put_xa_erase;
 
-	atomic_inc(&kvm->online_vcpus);
+	if (!plane->plane)
+		atomic_inc(&kvm->online_vcpus);
 
 	/*
 	 * Pairs with xa_load() in kvm_get_vcpu, ensuring that online_vcpus
 	 * is updated before vcpu->plane.
 	 */
-	smp_store_release(&vcpu->plane, 0);
+	smp_store_release(&vcpu->plane, plane->plane);
 	mutex_unlock(&vcpu->mutex);
 
 	mutex_unlock(&kvm->lock);
@@ -4237,14 +4266,15 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 kvm_put_xa_erase:
 	mutex_unlock(&vcpu->mutex);
 	kvm_put_kvm_no_destroy(kvm);
-	xa_erase(&kvm->planes[0]->vcpu_array, vcpu->vcpu_idx);
+	xa_erase(&plane->vcpu_array, vcpu->vcpu_idx);
 unlock_vcpu_destroy:
 	mutex_unlock(&kvm->lock);
 	kvm_arch_vcpu_destroy(vcpu);
 vcpu_free_dirty_ring:
 	kvm_dirty_ring_free(&vcpu->__dirty_ring);
 vcpu_free_run_page:
-	free_page((unsigned long)vcpu->run);
+	if (page)
+		__free_page(page);
 vcpu_free:
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 vcpu_decrement:
@@ -4406,6 +4436,35 @@ static int kvm_plane_ioctl_check_extension(struct kvm_plane *plane, long arg)
 	}
 }
 
+static int kvm_plane_ioctl_create_vcpu(struct kvm_plane *plane, long arg)
+{
+	int r = -EINVAL;
+	struct file *file;
+	struct kvm_vcpu *vcpu;
+	int fd;
+
+	if (arg != (int)arg)
+		return -EBADF;
+
+	fd = arg;
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kvm_vcpu_fops)
+		goto err;
+
+	vcpu = file->private_data;
+	if (vcpu->kvm != plane->kvm)
+		goto err;
+
+	r = kvm_vm_ioctl_create_vcpu(plane, vcpu, vcpu->vcpu_id);
+
+err:
+	fput(file);
+	return r;
+}
+
 static long __kvm_plane_ioctl(struct kvm_plane *plane, unsigned int ioctl,
 			      unsigned long arg)
 {
@@ -4432,6 +4491,8 @@ static long __kvm_plane_ioctl(struct kvm_plane *plane, unsigned int ioctl,
 #endif
 	case KVM_CHECK_EXTENSION:
 		return kvm_plane_ioctl_check_extension(plane, arg);
+	case KVM_CREATE_VCPU_PLANE:
+		return kvm_plane_ioctl_create_vcpu(plane, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -4463,6 +4524,44 @@ static struct file_operations kvm_plane_fops = {
 };
 
 
+static inline bool kvm_arch_is_vcpu_plane_ioctl(unsigned ioctl)
+{
+	switch (ioctl) {
+	case KVM_GET_DEBUGREGS:
+	case KVM_SET_DEBUGREGS:
+	case KVM_GET_FPU:
+	case KVM_SET_FPU:
+	case KVM_GET_LAPIC:
+	case KVM_SET_LAPIC:
+	case KVM_GET_MSRS:
+	case KVM_SET_MSRS:
+	case KVM_GET_NESTED_STATE:
+	case KVM_SET_NESTED_STATE:
+	case KVM_GET_ONE_REG:
+	case KVM_SET_ONE_REG:
+	case KVM_GET_REGS:
+	case KVM_SET_REGS:
+	case KVM_GET_SREGS:
+	case KVM_SET_SREGS:
+	case KVM_GET_SREGS2:
+	case KVM_SET_SREGS2:
+	case KVM_GET_VCPU_EVENTS:
+	case KVM_SET_VCPU_EVENTS:
+	case KVM_GET_XCRS:
+	case KVM_SET_XCRS:
+	case KVM_GET_XSAVE:
+	case KVM_GET_XSAVE2:
+	case KVM_SET_XSAVE:
+
+	case KVM_GET_REG_LIST:
+	case KVM_TRANSLATE:
+		return true;
+
+	default:
+		return false;
+	}
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -4475,6 +4574,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
 		return -EIO;
 
+	if (vcpu->plane && !kvm_arch_is_vcpu_plane_ioctl(ioctl))
+		return -EINVAL;
+
 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
 		return -EINVAL;
 
@@ -4958,7 +5060,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_PLANES:
 		if (kvm)
 			return kvm_arch_nr_vcpu_planes(kvm);
-		return KVM_MAX_PLANES;
+		return KVM_MAX_VCPU_PLANES;
 	case KVM_CAP_PLANES_FPU:
 		return kvm_arch_planes_share_fpu(kvm);
 #endif
@@ -5201,7 +5303,8 @@ static int kvm_vm_ioctl_create_plane(struct kvm *kvm, unsigned id)
 	struct file *file;
 	int r, fd;
 
-	if (id >= KVM_MAX_VCPU_PLANES)
+	if (id >= kvm_arch_nr_vcpu_planes(kvm)
+	    || WARN_ON_ONCE(id >= KVM_MAX_VCPU_PLANES))
 		return -EINVAL;
 
 	guard(mutex)(&kvm->lock);
@@ -5259,7 +5362,7 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_create_plane(kvm, arg);
 		break;
 	case KVM_CREATE_VCPU:
-		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
+		r = kvm_vm_ioctl_create_vcpu(kvm->planes[0], NULL, arg);
 		break;
 	case KVM_ENABLE_CAP: {
 		struct kvm_enable_cap cap;
-- 
2.49.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ