[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240814030849.7yqx3db4oojsoh5k@yy-desk-7060>
Date: Wed, 14 Aug 2024 11:08:49 +0800
From: Yuan Yao <yuan.yao@...ux.intel.com>
To: Rick Edgecombe <rick.p.edgecombe@...el.com>
Cc: seanjc@...gle.com, pbonzini@...hat.com, kvm@...r.kernel.org,
kai.huang@...el.com, isaku.yamahata@...il.com,
tony.lindgren@...ux.intel.com, xiaoyao.li@...el.com,
linux-kernel@...r.kernel.org,
Isaku Yamahata <isaku.yamahata@...el.com>,
Sean Christopherson <sean.j.christopherson@...el.com>,
Yan Zhao <yan.y.zhao@...el.com>
Subject: Re: [PATCH 13/25] KVM: TDX: create/destroy VM structure
On Mon, Aug 12, 2024 at 03:48:08PM -0700, Rick Edgecombe wrote:
> From: Isaku Yamahata <isaku.yamahata@...el.com>
>
> Implement managing the TDX private KeyID to support creating, destroying
> and freeing a TDX guest.
>
> When creating a TDX guest, assign a TDX private KeyID for the TDX guest
> for memory encryption, and allocate pages for the guest. These are used
> for the Trust Domain Root (TDR) and Trust Domain Control Structure (TDCS).
>
> On destruction, free the allocated pages, and the KeyID.
>
> Before tearing down the private page tables, TDX requires the guest TD to
> be destroyed by reclaiming the KeyID. Do it at vm_destroy() kvm_x86_ops
> hook.
>
> Add a call for vm_free() at the end of kvm_arch_destroy_vm() because the
> per-VM TDR needs to be freed after the KeyID.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@...el.com>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@...el.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@...el.com>
> Co-developed-by: Kai Huang <kai.huang@...el.com>
> Signed-off-by: Kai Huang <kai.huang@...el.com>
> Co-developed-by: Yan Zhao <yan.y.zhao@...el.com>
> Signed-off-by: Yan Zhao <yan.y.zhao@...el.com>
> Co-developed-by: Rick Edgecombe <rick.p.edgecombe@...el.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@...el.com>
> ---
...
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> + bool packages_allocated, targets_allocated;
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + cpumask_var_t packages, targets;
> + u64 err;
> + int i;
> +
> + if (!is_hkid_assigned(kvm_tdx))
> + return;
> +
> + /* KeyID has been allocated but guest is not yet configured */
> + if (!is_td_created(kvm_tdx)) {
> + tdx_hkid_free(kvm_tdx);
> + return;
> + }
> +
> + packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> + targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> + cpus_read_lock();
> +
> + /*
> + * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
> + * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
> + * Multiple TDX guests can be destroyed simultaneously. Take the
> + * mutex to prevent it from getting an error.
> + */
> + mutex_lock(&tdx_lock);
> +
> + /*
> + * We need three SEAMCALLs, TDH.MNG.VPFLUSHDONE(), TDH.PHYMEM.CACHE.WB(),
> + * and TDH.MNG.KEY.FREEID() to free the HKID. When the HKID is assigned,
> + * we need to use TDH.MEM.SEPT.REMOVE() or TDH.MEM.PAGE.REMOVE(). When
> + * the HKID is free, we need to use TDH.PHYMEM.PAGE.RECLAIM(). Get lock
> + * to not present transient state of HKID.
> + */
> + write_lock(&kvm->mmu_lock);
> +
> + for_each_online_cpu(i) {
> + if (packages_allocated &&
> + cpumask_test_and_set_cpu(topology_physical_package_id(i),
> + packages))
> + continue;
> + if (targets_allocated)
> + cpumask_set_cpu(i, targets);
> + }
> + if (targets_allocated)
> + on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
> + else
> + on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
> + /*
> + * In the case of error in smp_func_do_phymem_cache_wb(), the following
> + * tdh_mng_key_freeid() will fail.
> + */
> + err = tdh_mng_key_freeid(kvm_tdx);
> + if (KVM_BUG_ON(err, kvm)) {
> + pr_tdx_error(TDH_MNG_KEY_FREEID, err);
> + pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
> + kvm_tdx->hkid);
> + } else {
> + tdx_hkid_free(kvm_tdx);
> + }
> +
> + write_unlock(&kvm->mmu_lock);
> + mutex_unlock(&tdx_lock);
> + cpus_read_unlock();
> + free_cpumask_var(targets);
> + free_cpumask_var(packages);
> +}
> +
> +static inline u8 tdx_sysinfo_nr_tdcs_pages(void)
> +{
> + return tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
> +}
> +
> +void tdx_vm_free(struct kvm *kvm)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + u64 err;
> + int i;
> +
> + /*
> + * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong
> + * heavily with TDX module. Give up freeing TD pages. As the function
> + * already warned, don't warn it again.
> + */
> + if (is_hkid_assigned(kvm_tdx))
> + return;
> +
> + if (kvm_tdx->tdcs_pa) {
> + for (i = 0; i < tdx_sysinfo_nr_tdcs_pages(); i++) {
> + if (!kvm_tdx->tdcs_pa[i])
> + continue;
> +
> + tdx_reclaim_control_page(kvm_tdx->tdcs_pa[i]);
> + }
> + kfree(kvm_tdx->tdcs_pa);
> + kvm_tdx->tdcs_pa = NULL;
> + }
> +
> + if (!kvm_tdx->tdr_pa)
> + return;
> +
> + if (__tdx_reclaim_page(kvm_tdx->tdr_pa))
> + return;
> +
> + /*
> + * Use a SEAMCALL to ask the TDX module to flush the cache based on the
> + * KeyID. TDX module may access TDR while operating on TD (Especially
> + * when it is reclaiming TDCS).
> + */
> + err = tdh_phymem_page_wbinvd(set_hkid_to_hpa(kvm_tdx->tdr_pa,
> + tdx_global_keyid));
> + if (KVM_BUG_ON(err, kvm)) {
> + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
> + return;
> + }
> + tdx_clear_page(kvm_tdx->tdr_pa);
> +
> + free_page((unsigned long)__va(kvm_tdx->tdr_pa));
> + kvm_tdx->tdr_pa = 0;
> +}
> +
...
> +static int __tdx_td_init(struct kvm *kvm)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + cpumask_var_t packages;
> + unsigned long *tdcs_pa = NULL;
> + unsigned long tdr_pa = 0;
> + unsigned long va;
> + int ret, i;
> + u64 err;
> +
> + ret = tdx_guest_keyid_alloc();
> + if (ret < 0)
> + return ret;
> + kvm_tdx->hkid = ret;
> +
> + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> + if (!va)
> + goto free_hkid;
> + tdr_pa = __pa(va);
> +
> + tdcs_pa = kcalloc(tdx_sysinfo_nr_tdcs_pages(), sizeof(*kvm_tdx->tdcs_pa),
> + GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> + if (!tdcs_pa)
> + goto free_tdr;
> +
> + for (i = 0; i < tdx_sysinfo_nr_tdcs_pages(); i++) {
> + va = __get_free_page(GFP_KERNEL_ACCOUNT);
> + if (!va)
> + goto free_tdcs;
> + tdcs_pa[i] = __pa(va);
> + }
> +
> + if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) {
> + ret = -ENOMEM;
> + goto free_tdcs;
> + }
> +
> + cpus_read_lock();
> +
> + /*
> + * Need at least one CPU of the package to be online in order to
> + * program all packages for host key id. Check it.
> + */
> + for_each_present_cpu(i)
> + cpumask_set_cpu(topology_physical_package_id(i), packages);
> + for_each_online_cpu(i)
> + cpumask_clear_cpu(topology_physical_package_id(i), packages);
> + if (!cpumask_empty(packages)) {
> + ret = -EIO;
> + /*
> + * Because it's hard for human operator to figure out the
> + * reason, warn it.
> + */
> +#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
> + pr_warn_ratelimited(MSG_ALLPKG);
> + goto free_packages;
> + }
> +
> + /*
> + * TDH.MNG.CREATE tries to grab the global TDX module and fails
> + * with TDX_OPERAND_BUSY when it fails to grab. Take the global
> + * lock to prevent it from failure.
> + */
> + mutex_lock(&tdx_lock);
> + kvm_tdx->tdr_pa = tdr_pa;
> + err = tdh_mng_create(kvm_tdx, kvm_tdx->hkid);
> + mutex_unlock(&tdx_lock);
> +
> + if (err == TDX_RND_NO_ENTROPY) {
> + kvm_tdx->tdr_pa = 0;
The code path after 'free_packages' sets it to 0, so this can be removed.
> + ret = -EAGAIN;
> + goto free_packages;
> + }
> +
> + if (WARN_ON_ONCE(err)) {
> + kvm_tdx->tdr_pa = 0;
Ditto.
> + pr_tdx_error(TDH_MNG_CREATE, err);
> + ret = -EIO;
> + goto free_packages;
> + }
> +
> + for_each_online_cpu(i) {
> + int pkg = topology_physical_package_id(i);
> +
> + if (cpumask_test_and_set_cpu(pkg, packages))
> + continue;
> +
> + /*
> + * Program the memory controller in the package with an
> + * encryption key associated to a TDX private host key id
> + * assigned to this TDR. Concurrent operations on same memory
> + * controller results in TDX_OPERAND_BUSY. No locking needed
> + * beyond the cpus_read_lock() above as it serializes against
> + * hotplug and the first online CPU of the package is always
> + * used. We never have two CPUs in the same socket trying to
> + * program the key.
> + */
> + ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
> + kvm_tdx, true);
> + if (ret)
> + break;
> + }
> + cpus_read_unlock();
> + free_cpumask_var(packages);
> + if (ret) {
> + i = 0;
> + goto teardown;
> + }
> +
> + kvm_tdx->tdcs_pa = tdcs_pa;
> + for (i = 0; i < tdx_sysinfo_nr_tdcs_pages(); i++) {
> + err = tdh_mng_addcx(kvm_tdx, tdcs_pa[i]);
> + if (err == TDX_RND_NO_ENTROPY) {
> + /* Here it's hard to allow userspace to retry. */
> + ret = -EBUSY;
> + goto teardown;
> + }
> + if (WARN_ON_ONCE(err)) {
> + pr_tdx_error(TDH_MNG_ADDCX, err);
> + ret = -EIO;
> + goto teardown;
This and the 'goto teardown' above, inside the same for() loop, free the
partially added TDCS pages without taking ownership of them back; maybe a
'goto teardown_reclaim' (or any better name) below could handle this —
see my next comment on this patch.
> + }
> + }
> +
> + /*
> + * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires a dedicated
> + * ioctl() to define the configure CPUID values for the TD.
> + */
> + return 0;
> +
> + /*
> + * The sequence for freeing resources from a partially initialized TD
> + * varies based on where in the initialization flow failure occurred.
> + * Simply use the full teardown and destroy, which naturally play nice
> + * with partial initialization.
> + */
> +teardown:
> + for (; i < tdx_sysinfo_nr_tdcs_pages(); i++) {
> + if (tdcs_pa[i]) {
> + free_page((unsigned long)__va(tdcs_pa[i]));
> + tdcs_pa[i] = 0;
> + }
> + }
> + if (!kvm_tdx->tdcs_pa)
> + kfree(tdcs_pa);
Add 'teardown_reclaim:' here, to pair with my previous comment.
> + tdx_mmu_release_hkid(kvm);
> + tdx_vm_free(kvm);
> +
> + return ret;
> +
> +free_packages:
> + cpus_read_unlock();
> + free_cpumask_var(packages);
> +
> +free_tdcs:
> + for (i = 0; i < tdx_sysinfo_nr_tdcs_pages(); i++) {
> + if (tdcs_pa[i])
> + free_page((unsigned long)__va(tdcs_pa[i]));
> + }
> + kfree(tdcs_pa);
> + kvm_tdx->tdcs_pa = NULL;
> +
> +free_tdr:
> + if (tdr_pa)
> + free_page((unsigned long)__va(tdr_pa));
> + kvm_tdx->tdr_pa = 0;
> +
> +free_hkid:
> + tdx_hkid_free(kvm_tdx);
> +
> + return ret;
> +}
> +
> int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> {
> struct kvm_tdx_cmd tdx_cmd;
> @@ -274,6 +747,11 @@ static int __init __tdx_bringup(void)
> {
> int r;
>
> + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
> + pr_warn("MOVDIR64B is required for TDX\n");
> + return -EOPNOTSUPP;
> + }
> +
> if (!enable_ept) {
> pr_err("Cannot enable TDX with EPT disabled.\n");
> return -EINVAL;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 78f84c53a948..268959d0f74f 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -14,6 +14,9 @@ struct kvm_tdx {
> struct kvm kvm;
>
> unsigned long tdr_pa;
> + unsigned long *tdcs_pa;
> +
> + int hkid;
> };
>
> struct vcpu_tdx {
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index c1bdf7d8fee3..96c74880bd36 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -120,12 +120,18 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu);
>
> #ifdef CONFIG_INTEL_TDX_HOST
> int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap);
> +int tdx_vm_init(struct kvm *kvm);
> +void tdx_mmu_release_hkid(struct kvm *kvm);
> +void tdx_vm_free(struct kvm *kvm);
> int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
> #else
> static inline int tdx_vm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
> {
> return -EINVAL;
> };
> +static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
> +static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
> +static inline void tdx_vm_free(struct kvm *kvm) {}
> static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
> #endif
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 751b3841c48f..ce2ef63f30f2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12852,6 +12852,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
> kvm_page_track_cleanup(kvm);
> kvm_xen_destroy_vm(kvm);
> kvm_hv_destroy_vm(kvm);
> + static_call_cond(kvm_x86_vm_free)(kvm);
> }
>
> static void memslot_rmap_free(struct kvm_memory_slot *slot)
> --
> 2.34.1
>
>
Powered by blists - more mailing lists