Message-ID: <6acfdbd3e516ece5949e97c85e7db8dc97a3e6c6.camel@infradead.org>
Date:   Wed, 16 Nov 2022 14:49:42 -0800
From:   David Woodhouse <dwmw2@...radead.org>
To:     "Durrant, Paul" <pdurrant@...zon.co.uk>,
        Paolo Bonzini <pbonzini@...hat.com>,
        Sean Christopherson <seanjc@...gle.com>
Cc:     "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "kvm@...r.kernel.org" <kvm@...r.kernel.org>,
        "mhal@...x.co" <mhal@...x.co>,
        "Kaya, Metin" <metikaya@...zon.co.uk>
Subject: Re: [PATCH 03/16] KVM: x86: set gfn-to-pfn cache length
 consistently with VM word size

On Mon, 2022-11-14 at 16:16 -0800, David Woodhouse wrote:
> 
> I'm playing with using a second GPC for the overrun onto the second
> page. Debating if it's already too ugly to live before I even fix up
> the actual copying part...

Well it certainly didn't get any *prettier*. Utterly untested other
than building it, so it's certainly going to be broken somewhere, but
here it is as an illustration.

I can't see a sane way to get the two pages vmapped consecutively,
given that they might be IOMEM, so I don't see how to make a single
GPC do this "nicely". I think we have to declare that the runstate
area is the only case that actually needs this, and handle it as a
special case like the below... even though it's fugly?
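
For reference, the split arithmetic works out like this. A throwaway
userspace sketch (not part of the patch; the GPA and struct size are
made-up example values):

	/* Sketch of the page-split arithmetic; userspace, not kernel code. */
	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SIZE 4096UL
	#define PAGE_MASK (~(PAGE_SIZE - 1))

	int main(void)
	{
		/* Hypothetical runstate area: 48 bytes starting 16 bytes
		 * before a page boundary (offset 0xff0 within the page). */
		uint64_t gpa = 0x12345ff0;
		size_t user_len = 48, user_len1, user_len2;

		if ((gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
			/* Crosses the boundary: tail of the first page in
			 * one GPC, head of the second page in another. */
			user_len1 = PAGE_SIZE - (gpa & ~PAGE_MASK);
			user_len2 = user_len - user_len1;
		} else {
			user_len1 = user_len;
			user_len2 = 0;
		}

		/* Prints "first page: 16 bytes, second page: 32 bytes" */
		printf("first page: %zu bytes, second page: %zu bytes\n",
		       user_len1, user_len2);
		return 0;
	}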

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 81114a376c4e..3fc08f416aa3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -647,6 +647,7 @@ struct kvm_vcpu_xen {
 	struct gfn_to_pfn_cache vcpu_info_cache;
 	struct gfn_to_pfn_cache vcpu_time_info_cache;
 	struct gfn_to_pfn_cache runstate_cache;
+	struct gfn_to_pfn_cache runstate2_cache;
 	u64 last_steal;
 	u64 runstate_entry_time;
 	u64 runstate_times[4];
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b8e9628fbf5..14ba45b541bf 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -198,38 +198,101 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
 	vx->runstate_entry_time = now;
 }
 
+/*
+ * The guest region is arbitrarily aligned, and could be split across
+ * two pages.
+ *
+ * d1: Pointer to kernel map of first byte of region.
+ * d2: Pointer to kernel map of first byte of second page.
+ * l1: length of first range [ == PAGE_SIZE - (d1 & ~PAGE_MASK) ]
+ * src: Source pointer.
+ * len: Source length to be copied.
+ * dst_ofs: Destination offset within the guest region.
+ */
+static inline void memcpy_to_runstate(void *d1, void *d2, size_t l1,
+				      void *src, size_t len, size_t dst_ofs)
+{
+	size_t copylen;
+
+	if (dst_ofs < l1) {
+		copylen = min(l1 - dst_ofs, len);
+		memcpy(d1 + dst_ofs, src, copylen);
+		if (copylen == len)
+			return;
+
+		src += copylen;
+		dst_ofs += copylen;
+		len -= copylen;
+	}
+
+	BUG_ON(dst_ofs < l1);
+	memcpy(d2 + dst_ofs - l1, src, len);
+}
+
 void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 {
 	struct kvm_vcpu_xen *vx = &v->arch.xen;
-	struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
-	uint64_t *user_times;
+	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
 	unsigned long flags;
-	size_t user_len;
-	int *user_state;
+	size_t user_len, user_len1, user_len2;
+	size_t times_ofs;
+	u8 *update_bit;
 
 	kvm_xen_update_runstate(v, state);
 
-	if (!vx->runstate_cache.active)
+	if (!gpc1->active)
 		return;
 
-	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
 		user_len = sizeof(struct vcpu_runstate_info);
-	else
+		times_ofs = offsetof(struct vcpu_runstate_info,
+				     state_entry_time);
+	} else {
 		user_len = sizeof(struct compat_vcpu_runstate_info);
+		times_ofs = offsetof(struct compat_vcpu_runstate_info,
+				     state_entry_time);
+	}
 
-	read_lock_irqsave(&gpc->lock, flags);
-	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
-					   user_len)) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+	if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+		user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+		user_len2 = user_len - user_len1;
+	} else {
+		user_len1 = user_len;
+		user_len2 = 0;
+	}
+	BUG_ON(user_len1 + user_len2 != user_len);
+
+ retry:
+	read_lock_irqsave(&gpc1->lock, flags);
+	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc1, gpc1->gpa,
+					   user_len1)) {
+		read_unlock_irqrestore(&gpc1->lock, flags);
 
 		/* When invoked from kvm_sched_out() we cannot sleep */
 		if (state == RUNSTATE_runnable)
 			return;
 
-		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc1, gpc1->gpa, user_len1))
 			return;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock_irqsave(&gpc1->lock, flags);
+	}
+	if (user_len2) {
+		read_lock(&gpc2->lock);
+		if (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc2, gpc2->gpa, user_len2)) {
+			read_unlock(&gpc2->lock);
+			read_unlock_irqrestore(&gpc1->lock, flags);
+
+			if (state == RUNSTATE_runnable)
+				return;
+
+			if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc2,
+							 gpc2->gpa, user_len2))
+				return;
+
+			goto retry;
+		}
 	}
 
 	/*
@@ -252,25 +315,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
 #endif
 
-	user_state = gpc->khva;
-
-	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
-		user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
-						  state_entry_time);
-	else
-		user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
-						  state_entry_time);
-
 	/*
-	 * First write the updated state_entry_time at the appropriate
-	 * location determined by 'offset'.
+	 * The XEN_RUNSTATE_UPDATE bit is the top bit of the state_entry_time
+	 * field. We need to set it (and write-barrier) before the rest.
 	 */
 	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
-		     sizeof(user_times[0]));
+		     sizeof(uint64_t));
 	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
-		     sizeof(user_times[0]));
+		     sizeof(uint64_t));
+	BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
 
-	user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
+	if (user_len1 >= times_ofs + sizeof(uint64_t))
+		update_bit = ((u8 *)gpc1->khva) + times_ofs + sizeof(u64) - 1;
+	else
+		update_bit = ((u8 *)gpc2->khva) + times_ofs + sizeof(u64) - 1 -
+			user_len1;
+
+	*update_bit |= (XEN_RUNSTATE_UPDATE >> 56);
 	smp_wmb();
 
 	/*
@@ -284,7 +345,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
 		     sizeof(vx->current_runstate));
 
-	*user_state = vx->current_runstate;
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   &vx->current_runstate, sizeof(vx->current_runstate),
+			   offsetof(struct vcpu_runstate_info, state));
 
 	/*
 	 * Write the actual runstate times immediately after the
@@ -299,19 +362,28 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
 		     sizeof(vx->runstate_times));
 
-	memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
-	smp_wmb();
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   vx->runstate_times, sizeof(vx->runstate_times),
+			   times_ofs + sizeof(u64));
 
+	memcpy_to_runstate(gpc1->khva, gpc2->khva, user_len1,
+			   &vx->runstate_entry_time, sizeof(vx->runstate_entry_time) - 1,
+			   times_ofs);
+	smp_wmb();
 	/*
 	 * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
 	 * runstate_entry_time field.
 	 */
-	user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+	*update_bit = vx->runstate_entry_time >> 56;
 	smp_wmb();
 
-	read_unlock_irqrestore(&gpc->lock, flags);
+	if (user_len2)
+		read_unlock(&gpc2->lock);
+	read_unlock_irqrestore(&gpc1->lock, flags);
 
-	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
+	if (user_len2)
+		mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
 }
 
 static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
@@ -584,23 +656,52 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 		break;
 
-	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+		size_t sz;
+
 		if (!sched_info_on()) {
 			r = -EOPNOTSUPP;
 			break;
 		}
 		if (data->u.gpa == GPA_INVALID) {
+			r = 0;
+		deactivate_out:
 			kvm_gpc_deactivate(vcpu->kvm,
 					   &vcpu->arch.xen.runstate_cache);
-			r = 0;
+		deactivate2_out:
+			kvm_gpc_deactivate(vcpu->kvm,
+					   &vcpu->arch.xen.runstate2_cache);
 			break;
 		}
 
-		r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
-				     NULL, KVM_HOST_USES_PFN, data->u.gpa,
-				     sizeof(struct vcpu_runstate_info));
+		if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+			sz = sizeof(struct vcpu_runstate_info);
+		else
+			sz = sizeof(struct compat_vcpu_runstate_info);
+
+		/* Handle structures which cross a page boundary by using two GPCs */
+		if ((data->u.gpa & ~PAGE_MASK) + sz <= PAGE_SIZE) {
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+					     NULL, KVM_HOST_USES_PFN, data->u.gpa,
+					     sz);
+			goto deactivate2_out;
+		} else {
+			/* Map the end of the first page... */
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
+					     NULL, KVM_HOST_USES_PFN, data->u.gpa,
+					     PAGE_SIZE - (data->u.gpa & ~PAGE_MASK));
+			if (r)
+				goto deactivate2_out;
+			/* ... and the start of the second. */
+			sz -= PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+			r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache,
+					     NULL, KVM_HOST_USES_PFN,
+					     (data->u.gpa + PAGE_SIZE) & PAGE_MASK, sz);
+			if (r)
+				goto deactivate_out;
+		}
 		break;
-
+	}
 	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
 		if (!sched_info_on()) {
 			r = -EOPNOTSUPP;
@@ -1834,6 +1935,7 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
 	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
 
 	kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
+	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache);
 	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
 	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
 }
@@ -1844,6 +1946,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 		kvm_xen_stop_timer(vcpu);
 
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
+	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate2_cache);
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
 	kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
 

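As a sanity check on the copy helper, here's the throwaway userspace
harness I'd use (not part of the patch; min() and BUG_ON() are swapped
for portable stand-ins, and the buffer sizes are made-up test values):

	/* Userspace harness for the memcpy_to_runstate() helper above. */
	#include <assert.h>
	#include <stddef.h>
	#include <string.h>

	static void memcpy_to_runstate(void *d1, void *d2, size_t l1,
				       void *src, size_t len, size_t dst_ofs)
	{
		size_t copylen;

		if (dst_ofs < l1) {
			copylen = (l1 - dst_ofs < len) ? l1 - dst_ofs : len;
			memcpy((char *)d1 + dst_ofs, src, copylen);
			if (copylen == len)
				return;

			src = (char *)src + copylen;
			dst_ofs += copylen;
			len -= copylen;
		}

		assert(dst_ofs >= l1);	/* stands in for BUG_ON() */
		memcpy((char *)d2 + (dst_ofs - l1), src, len);
	}

	int main(void)
	{
		char page1[16], page2[32], src[24];

		memset(src, 0xAB, sizeof(src));

		/* 24 bytes at offset 8 with 16 bytes left in the first
		 * "page": 8 land in page1, the other 16 spill into page2. */
		memcpy_to_runstate(page1, page2, sizeof(page1),
				   src, sizeof(src), 8);

		assert(page1[8] == (char)0xAB && page1[15] == (char)0xAB);
		assert(page2[0] == (char)0xAB && page2[15] == (char)0xAB);
		return 0;
	}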