linux-kernel - Re: [PATCH v2] x86/sev: Fix SNP guest kdump hang/softlockup/panic

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <90a031fb-b0dd-b5af-7013-7483ebad49ab@amd.com>
Date: Thu, 24 Apr 2025 13:22:37 -0500
From: Tom Lendacky <thomas.lendacky@....com>
To: Ashish Kalra <Ashish.Kalra@....com>, tglx@...utronix.de,
 mingo@...hat.com, dave.hansen@...ux.intel.com, x86@...nel.org, bp@...en8.de,
 hpa@...or.com
Cc: kees@...nel.org, michael.roth@....com, nikunj@....com, seanjc@...gle.com,
 ardb@...nel.org, gustavoars@...nel.org, sgarzare@...hat.com,
 stable@...r.kernel.org, linux-kernel@...r.kernel.org,
 linux-hardening@...r.kernel.org, kexec@...ts.infradead.org,
 linux-coco@...ts.linux.dev
Subject: Re: [PATCH v2] x86/sev: Fix SNP guest kdump hang/softlockup/panic

On 4/24/25 09:15, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra@....com>
> 
> When kdump is running makedumpfile to generate vmcore and dumping SNP
> guest memory it touches the VMSA page of the vCPU executing kdump which
> then results in unrecoverable #NPF/RMP faults as the VMSA page is
> marked busy/in-use when the vCPU is running.
> 
> This leads to guest softlockup/hang:
> 
> [  117.111097] watchdog: BUG: soft lockup - CPU#0 stuck for 27s! [cp:318]
> [  117.111165] CPU: 0 UID: 0 PID: 318 Comm: cp Not tainted 6.14.0-next-20250328-snp-host-f2a41ff576cc-dirty #414 VOLUNTARY
> [  117.111171] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022
> [  117.111176] RIP: 0010:rep_movs_alternative+0x5b/0x70
> [  117.111200] Call Trace:
> [  117.111204]  <TASK>
> [  117.111206]  ? _copy_to_iter+0xc1/0x720
> [  117.111216]  ? srso_return_thunk+0x5/0x5f
> [  117.111220]  ? _raw_spin_unlock+0x27/0x40
> [  117.111234]  ? srso_return_thunk+0x5/0x5f
> [  117.111236]  ? find_vmap_area+0xd6/0xf0
> [  117.111251]  ? srso_return_thunk+0x5/0x5f
> [  117.111253]  ? __check_object_size+0x18d/0x2e0
> [  117.111268]  __copy_oldmem_page.part.0+0x64/0xa0
> [  117.111281]  copy_oldmem_page_encrypted+0x1d/0x30
> [  117.111285]  read_from_oldmem.part.0+0xf4/0x200
> [  117.111306]  read_vmcore+0x206/0x3c0
> [  117.111309]  ? srso_return_thunk+0x5/0x5f
> [  117.111325]  proc_reg_read_iter+0x59/0x90
> [  117.111334]  vfs_read+0x26e/0x350
> 
> Additionally other APs may be halted in guest mode and their VMSA pages
> are marked busy and touching these VMSA pages during guest memory dump
> will also cause #NPF.
> 
> Issue AP_DESTROY GHCB calls on other APs to ensure they are kicked out
> of guest mode and then clear the VMSA bit on their VMSA pages.
> 
> If the vCPU running kdump is an AP, mark it's VMSA page as offline to
> ensure that makedumpfile excludes that page while dumping guest memory.
> 
> Cc: stable@...r.kernel.org
> Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec")
> Signed-off-by: Ashish Kalra <ashish.kalra@....com>
> ---
>  arch/x86/coco/sev/core.c | 129 ++++++++++++++++++++++++++++++---------
>  1 file changed, 101 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
> index dcfaa698d6cf..870f4994a13d 100644
> --- a/arch/x86/coco/sev/core.c
> +++ b/arch/x86/coco/sev/core.c
> @@ -113,6 +113,8 @@ DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
>  DEFINE_PER_CPU(struct svsm_ca *, svsm_caa);
>  DEFINE_PER_CPU(u64, svsm_caa_pa);
>  
> +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id);
> +
>  static __always_inline bool on_vc_stack(struct pt_regs *regs)
>  {
>  	unsigned long sp = regs->sp;
> @@ -877,6 +879,42 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
>  	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
>  }
>  
> +static int issue_vmgexit_ap_create_destroy(u64 event, struct sev_es_save_area *vmsa, u32 apic_id)
> +{
> +	struct ghcb_state state;
> +	unsigned long flags;
> +	struct ghcb *ghcb;
> +	int ret = 0;
> +
> +	local_irq_save(flags);
> +
> +	ghcb = __sev_get_ghcb(&state);
> +
> +	vc_ghcb_invalidate(ghcb);
> +	ghcb_set_rax(ghcb, vmsa->sev_features);

RAX should only be set on a SVM_VMGEXIT_AP_CREATE event.

> +	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
> +	ghcb_set_sw_exit_info_1(ghcb,
> +				((u64)apic_id << 32)	|
> +				((u64)snp_vmpl << 16)	|
> +				event);
> +	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
> +
> +	sev_es_wr_ghcb_msr(__pa(ghcb));
> +	VMGEXIT();
> +
> +	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
> +	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
> +		pr_err("SNP AP %s error\n", (event == SVM_VMGEXIT_AP_CREATE ? "CREATE" : "DESTROY"));
> +		ret = -EINVAL;
> +	}
> +
> +	__sev_put_ghcb(&state);
> +
> +	local_irq_restore(flags);
> +
> +	return ret;
> +}
> +
>  static void set_pte_enc(pte_t *kpte, int level, void *va)
>  {
>  	struct pte_enc_desc d = {
> @@ -973,6 +1011,66 @@ void snp_kexec_begin(void)
>  		pr_warn("Failed to stop shared<->private conversions\n");
>  }
>  
> +/*
> + * Shutdown all APs except the one handling kexec/kdump and clearing
> + * the VMSA tag on AP's VMSA pages as they are not being used as
> + * VMSA page anymore.
> + */
> +static void snp_shutdown_all_aps(void)
> +{
> +	struct sev_es_save_area *vmsa;
> +	int apic_id, cpu;
> +
> +	/*
> +	 * APs are already in HLT loop when kexec_finish() is invoked.
> +	 */
> +	for_each_present_cpu(cpu) {
> +		vmsa = per_cpu(sev_vmsa, cpu);
> +
> +		/*
> +		 * BSP does not have guest allocated VMSA, so it's in-use/busy
> +		 * VMSA cannot touch a guest page and there is no need to clear
> +		 * the VMSA tag for this page.
> +		 */
> +		if (!vmsa)
> +			continue;
> +
> +		/*
> +		 * Cannot clear the VMSA tag for the currently running vCPU.
> +		 */
> +		if (get_cpu() == cpu) {
> +			unsigned long pa;
> +			struct page *p;
> +
> +			pa = __pa(vmsa);
> +			p = pfn_to_online_page(pa >> PAGE_SHIFT);
> +			/*
> +			 * Mark the VMSA page of the running vCPU as Offline
> +			 * so that is excluded and not touched by makedumpfile
> +			 * while generating vmcore during kdump boot.
> +			 */
> +			if (p)
> +				__SetPageOffline(p);
> +			put_cpu();
> +			continue;
> +		}
> +		put_cpu();
> +
> +		apic_id = cpuid_to_apicid[cpu];
> +
> +		/*
> +		 * Issue AP destroy on all APs (to ensure they are kicked out
> +		 * of guest mode) to allow using RMPADJUST to remove the VMSA
> +		 * tag on VMSA pages especially for guests that allow HLT to
> +		 * not be intercepted.
> +		 */
> +

Remove this blank line.

> +		issue_vmgexit_ap_create_destroy(SVM_VMGEXIT_AP_DESTROY, vmsa, apic_id);
> +

And this one.

> +		snp_cleanup_vmsa(vmsa, apic_id);
> +	}
> +}
> +
>  void snp_kexec_finish(void)
>  {
>  	struct sev_es_runtime_data *data;
> @@ -987,6 +1085,8 @@ void snp_kexec_finish(void)
>  	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
>  		return;
>  
> +	snp_shutdown_all_aps();
> +
>  	unshare_all_memory();
>  
>  	/*
> @@ -1098,10 +1198,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa, int apic_id)
>  static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
>  {
>  	struct sev_es_save_area *cur_vmsa, *vmsa;
> -	struct ghcb_state state;
>  	struct svsm_ca *caa;
> -	unsigned long flags;
> -	struct ghcb *ghcb;
>  	u8 sipi_vector;
>  	int cpu, ret;
>  	u64 cr4;
> @@ -1215,31 +1312,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
>  	}
>  
>  	/* Issue VMGEXIT AP Creation NAE event */
> -	local_irq_save(flags);
> -
> -	ghcb = __sev_get_ghcb(&state);
> -
> -	vc_ghcb_invalidate(ghcb);
> -	ghcb_set_rax(ghcb, vmsa->sev_features);
> -	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
> -	ghcb_set_sw_exit_info_1(ghcb,
> -				((u64)apic_id << 32)	|
> -				((u64)snp_vmpl << 16)	|
> -				SVM_VMGEXIT_AP_CREATE);
> -	ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
> -
> -	sev_es_wr_ghcb_msr(__pa(ghcb));
> -	VMGEXIT();
> -
> -	if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
> -	    lower_32_bits(ghcb->save.sw_exit_info_1)) {
> -		pr_err("SNP AP Creation error\n");
> -		ret = -EINVAL;
> -	}
> -
> -	__sev_put_ghcb(&state);
> -
> -	local_irq_restore(flags);
> +	ret = issue_vmgexit_ap_create_destroy(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id);
>  
>  	/* Perform cleanup if there was an error */

You can remove the two lines above (the blank line and the comment) now
that the setting of ret is not a few lines before. That way you have

	/* Issue VMGEXIT AP Creation NAE event */
	ret = issue_vmgexit_ap_create_destroy(SVM_VMGEXIT_AP_CREATE, vmsa, apic_id);
	if (ret) {

and it's nicely grouped.

Thanks,
Tom

>  	if (ret) {