Message-ID: <b3b61de3224f17792bb26e0e9bcf267cf4ebbcc7.1749672978.git.afranji@google.com>
Date: Wed, 11 Jun 2025 21:16:33 +0000
From: Ryan Afranji <afranji@...gle.com>
To: kvm@...r.kernel.org, linux-kernel@...r.kernel.org, x86@...nel.org
Cc: sagis@...gle.com, bp@...en8.de, chao.p.peng@...ux.intel.com, 
	dave.hansen@...ux.intel.com, dmatlack@...gle.com, erdemaktas@...gle.com, 
	isaku.yamahata@...el.com, kai.huang@...el.com, mingo@...hat.com, 
	pbonzini@...hat.com, seanjc@...gle.com, tglx@...utronix.de, 
	zhi.wang.linux@...il.com, ackerleytng@...gle.com, andrew.jones@...ux.dev, 
	david@...hat.com, hpa@...or.com, kirill.shutemov@...ux.intel.com, 
	linux-kselftest@...r.kernel.org, tabba@...gle.com, vannapurve@...gle.com, 
	yan.y.zhao@...el.com, rick.p.edgecombe@...el.com, 
	Ryan Afranji <afranji@...gle.com>
Subject: [RFC PATCH v2 06/10] KVM: TDX: Add core logic for TDX intra-host migration

From: Sagi Shahar <sagis@...gle.com>

Add the core logic for transferring state from the source TD to the
destination TD during intra-host migration.

Signed-off-by: Sagi Shahar <sagis@...gle.com>
Co-developed-by: Ryan Afranji <afranji@...gle.com>
Signed-off-by: Ryan Afranji <afranji@...gle.com>
---
 arch/x86/kvm/vmx/tdx.c | 197 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 196 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 4582f94175b7..268aca28d878 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -3534,9 +3534,204 @@ static __always_inline bool tdx_finalized(struct kvm *kvm)
 	return tdx_kvm->state == TD_STATE_RUNNABLE;
 }
 
+#define MAX_APIC_VECTOR 256
+
+static int tdx_migrate_vcpus(struct kvm *dst, struct kvm *src)
+{
+	struct kvm_vcpu *src_vcpu;
+	struct kvm_tdx *dst_tdx;
+	unsigned long i;
+
+	dst_tdx = to_kvm_tdx(dst);
+
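+	/* Make sure no source vCPU state is still live on any physical CPU. */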
+	kvm_for_each_vcpu(i, src_vcpu, src)
+		tdx_flush_vp_on_cpu(src_vcpu);
+
+	/* Copy per-vCPU state. */
+	kvm_for_each_vcpu(i, src_vcpu, src) {
+		struct vcpu_tdx *dst_tdx_vcpu, *src_tdx_vcpu;
+		struct kvm_lapic_state src_lapic_state;
+		struct kvm_vcpu *dst_vcpu;
+		u64 apic_base;
+		u32 vector;
+		int ret;
+
+		src_tdx_vcpu = to_tdx(src_vcpu);
+		dst_vcpu = kvm_get_vcpu(dst, i);
+		dst_tdx_vcpu = to_tdx(dst_vcpu);
+
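+		/* Mark the destination vCPU as not loaded on any physical CPU. */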
+		dst_vcpu->cpu = -1;
+
+		/* Destination vCPU initialization was skipped, so do it here. */
+		apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+			(kvm_vcpu_is_reset_bsp(dst_vcpu) ?
+			 MSR_IA32_APICBASE_BSP : 0);
+		if (kvm_apic_set_base(dst_vcpu, apic_base, true))
+			return -EINVAL;
+
+		/* Copy lapic state. */
+		ret = kvm_apic_get_state(src_vcpu, &src_lapic_state);
+		if (ret)
+			return -EINVAL;
+
+		ret = kvm_apic_set_state(dst_vcpu, &src_lapic_state);
+		if (ret)
+			return -EINVAL;
+
+		/*
+		 * pi_desc stores the VM's posted-interrupt state, which the
+		 * pCPU processes during VM entry and at runtime. For
+		 * non-confidential VMs, this storage is synchronized into
+		 * vCPU state via the lapic state path (sync_pir_to_irr).
+		 *
+		 * For TDX VMs, KVM has no access to the virtual APIC page, so
+		 * preserve the interrupt state by copying the pi_desc
+		 * contents to the destination VM during copyless migration.
+		 */
+		dst_tdx_vcpu->vt = src_tdx_vcpu->vt;
+		for (vector = 0; vector < MAX_APIC_VECTOR; vector++) {
+			if (pi_test_pir(vector, &src_tdx_vcpu->vt.pi_desc)) {
+				__vmx_deliver_posted_interrupt(
+						dst_vcpu,
+						&dst_tdx_vcpu->vt.pi_desc,
+						vector);
+			}
+		}
+
+		/* Copy non-TDX vCPU state. */
+		memcpy(dst_vcpu->arch.regs, src_vcpu->arch.regs,
+		       NR_VCPU_REGS * sizeof(src_vcpu->arch.regs[0]));
+
+		dst_vcpu->arch.regs_avail = src_vcpu->arch.regs_avail;
+		dst_vcpu->arch.regs_dirty = src_vcpu->arch.regs_dirty;
+		dst_vcpu->arch.tsc_offset = dst_tdx->tsc_offset;
+		dst_vcpu->arch.guest_state_protected =
+			src_vcpu->arch.guest_state_protected;
+		dst_vcpu->arch.xfd_no_write_intercept =
+			src_vcpu->arch.xfd_no_write_intercept;
+
+		/* Copy TD structures. */
+		dst_tdx_vcpu->vp.tdvpr_page = src_tdx_vcpu->vp.tdvpr_page;
+		dst_tdx_vcpu->vp.tdcx_pages = src_tdx_vcpu->vp.tdcx_pages;
+
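+		/* Point the TD VMCS at the destination's own pi_desc. */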
+		td_vmcs_write64(dst_tdx_vcpu, POSTED_INTR_DESC_ADDR,
+				__pa(&dst_tdx_vcpu->vt.pi_desc));
+
+		/* Copy current vCPU status. */
+		dst_tdx_vcpu->ext_exit_qualification =
+			src_tdx_vcpu->ext_exit_qualification;
+		dst_tdx_vcpu->exit_gpa = src_tdx_vcpu->exit_gpa;
+		dst_tdx_vcpu->vp_enter_args = src_tdx_vcpu->vp_enter_args;
+		dst_tdx_vcpu->vp_enter_ret = src_tdx_vcpu->vp_enter_ret;
+		dst_tdx_vcpu->guest_entered = src_tdx_vcpu->guest_entered;
+		dst_tdx_vcpu->map_gpa_next = src_tdx_vcpu->map_gpa_next;
+		dst_tdx_vcpu->map_gpa_end = src_tdx_vcpu->map_gpa_end;
+
+		/* Copy mirror EPT tables. */
+		vcpu_load(dst_vcpu);
+		if (kvm_mmu_move_mirror_pages_from(dst_vcpu, src_vcpu)) {
+			vcpu_put(dst_vcpu);
+			return -EINVAL;
+		}
+		vcpu_put(dst_vcpu);
+
+		dst_vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+		dst_tdx_vcpu->state = VCPU_TD_STATE_INITIALIZED;
+
+		/*
+		 * NULL out the source vCPU's migrated structures so they are
+		 * not freed during source VM shutdown.
+		 */
+		src_tdx_vcpu->vp.tdvpr_page = NULL;
+		src_tdx_vcpu->vp.tdcx_pages = NULL;
+	}
+
+	return 0;
+}
+
 static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
 {
-	return -EINVAL;
+	struct kvm_tdx *src_tdx, *dst_tdx;
+	bool charged = false;
+	int ret;
+
+	src_tdx = to_kvm_tdx(src);
+	dst_tdx = to_kvm_tdx(dst);
+
+	ret = -EINVAL;
+
+	if (src_tdx->state != TD_STATE_RUNNABLE) {
+		pr_warn("Cannot migrate from a non-finalized VM\n");
+		goto abort;
+	}
+
+	/* Transfer the misc cgroup charge. */
+	dst_tdx->misc_cg = get_current_misc_cg();
+	if (dst_tdx->misc_cg != src_tdx->misc_cg) {
+		ret = misc_cg_try_charge(MISC_CG_RES_TDX, dst_tdx->misc_cg, 1);
+		if (ret)
+			goto abort_dst_cgroup;
+		charged = true;
+	}
+
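+	/* The destination TD takes over the source's memory encryption key ID. */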
+	dst_tdx->hkid = src_tdx->hkid;
+
+	/* Copy VM data. */
+	dst_tdx->attributes = src_tdx->attributes;
+	dst_tdx->xfam = src_tdx->xfam;
+	dst_tdx->tsc_offset = src_tdx->tsc_offset;
+	dst_tdx->tsc_multiplier = src_tdx->tsc_multiplier;
+	dst_tdx->nr_premapped = src_tdx->nr_premapped;
+	dst_tdx->wait_for_sept_zap = src_tdx->wait_for_sept_zap;
+	dst_tdx->kvm.arch.gfn_direct_bits = src_tdx->kvm.arch.gfn_direct_bits;
+
+	/* Copy TD structures. */
+	dst_tdx->td.tdcs_nr_pages = src_tdx->td.tdcs_nr_pages;
+	dst_tdx->td.tdcx_nr_pages = src_tdx->td.tdcx_nr_pages;
+	dst_tdx->td.tdr_page = src_tdx->td.tdr_page;
+	dst_tdx->td.tdcs_pages = src_tdx->td.tdcs_pages;
+
+	/* Copy per-vCPU state. */
+	ret = tdx_migrate_vcpus(dst, src);
+	if (ret)
+		goto late_abort;
+
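+	/* Hand the per-GFN memory attributes xarray over to the destination. */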
+	dst->mem_attr_array.xa_head = src->mem_attr_array.xa_head;
+	src->mem_attr_array.xa_head = NULL;
+
+	dst_tdx->state = TD_STATE_RUNNABLE;
+
+	/*
+	 * Invalidate the source's HKID and NULL out its migrated TD
+	 * structures so they are not freed during source VM shutdown.
+	 */
+	src_tdx->hkid = -1;
+	src_tdx->td.tdr_page = NULL;
+	src_tdx->td.tdcs_pages = NULL;
+
+	return 0;
+
+late_abort:
+	/*
+	 * If we aborted after the state transfer already started, the src VM
+	 * is no longer valid.
+	 */
+	kvm_vm_dead(src);
+
+abort_dst_cgroup:
+	if (charged)
+		misc_cg_uncharge(MISC_CG_RES_TDX, dst_tdx->misc_cg, 1);
+	put_misc_cg(dst_tdx->misc_cg);
+	dst_tdx->misc_cg = NULL;
+abort:
+	dst_tdx->hkid = -1;
+	dst_tdx->td.tdr_page = NULL;
+	return ret;
 }
 
 int tdx_vm_move_enc_context_from(struct kvm *kvm, struct kvm *src_kvm)
-- 
2.50.0.rc1.591.g9c95f17f64-goog

