linux-kernel - RE: [PATCH hyperv-next v3 14/15] Drivers: hv: Support establishing the confidential VMBus connection

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID:
 <SN6PR02MB415749A79FD265F2CF8F4D77D472A@SN6PR02MB4157.namprd02.prod.outlook.com>
Date: Wed, 18 Jun 2025 16:19:39 +0000
From: Michael Kelley <mhklinux@...look.com>
To: Roman Kisel <romank@...ux.microsoft.com>, "alok.a.tiwari@...cle.com"
	<alok.a.tiwari@...cle.com>, "arnd@...db.de" <arnd@...db.de>, "bp@...en8.de"
	<bp@...en8.de>, "corbet@....net" <corbet@....net>,
	"dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
	"decui@...rosoft.com" <decui@...rosoft.com>, "haiyangz@...rosoft.com"
	<haiyangz@...rosoft.com>, "hpa@...or.com" <hpa@...or.com>,
	"kys@...rosoft.com" <kys@...rosoft.com>, "mingo@...hat.com"
	<mingo@...hat.com>, "tglx@...utronix.de" <tglx@...utronix.de>,
	"wei.liu@...nel.org" <wei.liu@...nel.org>, "linux-arch@...r.kernel.org"
	<linux-arch@...r.kernel.org>, "linux-doc@...r.kernel.org"
	<linux-doc@...r.kernel.org>, "linux-hyperv@...r.kernel.org"
	<linux-hyperv@...r.kernel.org>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>, "x86@...nel.org" <x86@...nel.org>
CC: "apais@...rosoft.com" <apais@...rosoft.com>, "benhill@...rosoft.com"
	<benhill@...rosoft.com>, "bperkins@...rosoft.com" <bperkins@...rosoft.com>,
	"sunilmut@...rosoft.com" <sunilmut@...rosoft.com>
Subject: RE: [PATCH hyperv-next v3 14/15] Drivers: hv: Support establishing
 the confidential VMBus connection

From: Roman Kisel <romank@...ux.microsoft.com> Sent: Tuesday, June 3, 2025 5:44 PM
> 
> To establish the confidential VMBus connection the CoCo VM guest
> first attempts to connect to the VMBus server run by the paravisor.
> If that fails, the guest falls back to the non-confidential VMBus.
> 
> Implement that in the VMBus driver initialization.
> 
> Signed-off-by: Roman Kisel <romank@...ux.microsoft.com>
> ---
>  drivers/hv/vmbus_drv.c | 169 +++++++++++++++++++++++++++--------------
>  1 file changed, 110 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index f7e82a4fe133..88701c3ad999 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -1057,12 +1057,9 @@ static void vmbus_onmessage_work(struct work_struct *work)
>  	kfree(ctx);
>  }
> 
> -void vmbus_on_msg_dpc(unsigned long data)
> +static void __vmbus_on_msg_dpc(void *message_page_addr)
>  {
> -	struct hv_per_cpu_context *hv_cpu = (void *)data;
> -	void *page_addr = hv_cpu->hyp_synic_message_page;
> -	struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
> -				  VMBUS_MESSAGE_SINT;
> +	struct hv_message msg_copy, *msg;
>  	struct vmbus_channel_message_header *hdr;
>  	enum vmbus_channel_message_type msgtype;
>  	const struct vmbus_channel_message_table_entry *entry;
> @@ -1070,6 +1067,10 @@ void vmbus_on_msg_dpc(unsigned long data)
>  	__u8 payload_size;
>  	u32 message_type;
> 
> +	if (!message_page_addr)
> +		return;
> +	msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
> +
>  	/*
>  	 * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
>  	 * it is being used in 'struct vmbus_channel_message_header' definition
> @@ -1195,6 +1196,14 @@ void vmbus_on_msg_dpc(unsigned long data)
>  	vmbus_signal_eom(msg, message_type);
>  }
> 
> +void vmbus_on_msg_dpc(unsigned long data)
> +{
> +	struct hv_per_cpu_context *hv_cpu = (void *)data;
> +
> +	__vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
> +	__vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
> +}
> +
>  #ifdef CONFIG_PM_SLEEP
>  /*
>   * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
> @@ -1233,21 +1242,19 @@ static void vmbus_force_channel_rescinded(struct
> vmbus_channel *channel)
>  #endif /* CONFIG_PM_SLEEP */
> 
>  /*
> - * Schedule all channels with events pending
> + * Schedule all channels with events pending.
> + * The event page can be directly checked to get the id of
> + * the channel that has the interrupt pending.
>   */
> -static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
> +static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu, void *event_page_addr)

The hv_cpu parameter to this function isn't used.  Couldn't it be removed? Perhaps there's a
case for making the function API more parallel to vmbus_message_sched(), but in my judgment
that's not enough value to warrant passing an unneeded parameter.

>  {
>  	unsigned long *recv_int_page;
>  	u32 maxbits, relid;
> +	union hv_synic_event_flags *event;
> 
> -	/*
> -	 * The event page can be directly checked to get the id of
> -	 * the channel that has the interrupt pending.
> -	 */
> -	void *page_addr = hv_cpu->hyp_synic_event_page;
> -	union hv_synic_event_flags *event
> -		= (union hv_synic_event_flags *)page_addr +
> -					 VMBUS_MESSAGE_SINT;
> +	if (!event_page_addr)
> +		return;
> +	event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;
> 
>  	maxbits = HV_EVENT_FLAGS_COUNT;
>  	recv_int_page = event->flags;
> @@ -1318,26 +1325,40 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
>  	}
>  }
> 
> -static void vmbus_isr(void)
> +static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
>  {
> -	struct hv_per_cpu_context *hv_cpu
> -		= this_cpu_ptr(hv_context.cpu_context);
> -	void *page_addr;
>  	struct hv_message *msg;
> 
> -	vmbus_chan_sched(hv_cpu);
> -
> -	page_addr = hv_cpu->hyp_synic_message_page;
> -	msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
> +	if (!message_page_addr)
> +		return;
> +	msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
> 
>  	/* Check if there are actual msgs to be processed */
>  	if (msg->header.message_type != HVMSG_NONE) {
>  		if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
>  			hv_stimer0_isr();
>  			vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
> -		} else
> +		} else {
>  			tasklet_schedule(&hv_cpu->msg_dpc);
> +		}
>  	}
> +}
> +
> +static void vmbus_isr(void)
> +{
> +	struct hv_per_cpu_context *hv_cpu
> +		= this_cpu_ptr(hv_context.cpu_context);
> +
> +	/*
> +	 * Suggested-by: Michael Kelley <mhklinux@...look.com>
> +	 * One possible optimization would be to keep track of the largest relID that's in use,
> +	 * and only scan up to that relID.
> +	 */

I'd put this comment in vmbus_chan_sched() just before the "for_each_set_bit()"
loop. That's the loop that is doing the scanning.

> +	vmbus_chan_sched(hv_cpu, hv_cpu->hyp_synic_event_page);
> +	vmbus_chan_sched(hv_cpu, hv_cpu->para_synic_event_page);
> +
> +	vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
> +	vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);
> 
>  	add_interrupt_randomness(vmbus_interrupt);
>  }
> @@ -1355,6 +1376,60 @@ static void vmbus_percpu_work(struct work_struct *work)
>  	hv_synic_init(cpu);
>  }
> 
> +static int vmbus_alloc_synic_and_connect(void)
> +{
> +	int ret, cpu;
> +	struct work_struct __percpu *works;
> +	int hyperv_cpuhp_online;
> +
> +	ret = hv_synic_alloc();
> +	if (ret < 0)
> +		goto err_alloc;
> +

Extra blank line here.

> +
> +	works = alloc_percpu(struct work_struct);
> +	if (!works) {
> +		ret = -ENOMEM;
> +		goto err_alloc;
> +	}
> +
> +	/*
> +	 * Initialize the per-cpu interrupt state and stimer state.
> +	 * Then connect to the host.
> +	 */
> +	cpus_read_lock();
> +	for_each_online_cpu(cpu) {
> +		struct work_struct *work = per_cpu_ptr(works, cpu);
> +
> +		INIT_WORK(work, vmbus_percpu_work);
> +		schedule_work_on(cpu, work);
> +	}
> +
> +	for_each_online_cpu(cpu)
> +		flush_work(per_cpu_ptr(works, cpu));
> +
> +	/* Register the callbacks for possible CPU online/offline'ing */
> +	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
> +						   hv_synic_init, hv_synic_cleanup);
> +	cpus_read_unlock();

There's a subtle problem here. As soon as the lock is released, there could be
vCPUs that start coming online (started by the udev daemon), and running
hv_synic_init(). My assumption is that hv_synic_init() might fail if this invocation
of vmbus_alloc_synic_and_connect() is trying to determine if Confidential VMBus
is supported. A failure will abort the new CPU coming online. That suggests that
hv_synic_init() should not return an error in the case where initializing the paravisor
SynIC doesn't work.

> +	free_percpu(works);
> +	if (ret < 0)
> +		goto err_alloc;
> +	hyperv_cpuhp_online = ret;
> +
> +	ret = vmbus_connect();

When doing vmbus_alloc_synic_and_connect() the first time with "is_confidential"
set to "true", where exactly does the failure occur if the paravisor doesn't support
Confidential VMBus? Since your patches add machinery to detect MSR accesses
against the paravisor that fail, I'm presuming hv_synic_init() will fail for all vCPUs.
But the error return from hv_synic_init() isn't checked (except as I mentioned above
for cpuhp_setup_state). So evidently the failure is detected in vmbus_connect() when
vmbus_negotiate_version() sends the INITIATE_CONTACT message. And presumably
sending the message fails because there's no way to wait for a failure response since
the SynICs didn't initialize. So is it the HV_POST_MESSAGE hypercall that is the
exact point of failure, and if so, what error status is returned? It looks like
vmbus_post_msg() would output an error message. Or is there some other failure
point that I'm missing?

I ask because there's a lot of work done before the failure is detected. Each
vCPU must try to initialize their paravisor SynIC and fail. Then each CPU does
hv_synic_cleanup() as part of cpuhp_remove_state(). Workqueues are created
in vmbus_connect(), and memory is allocated, all of which must be cleaned
up. Then everything is retried with "is_confidential" set to "false". Do you have
any sense of how much elapsed time it takes to get the initial failure and then
cleanup? Consider the case with a large number of vCPUs, all of which must
run hv_synic_init() and then hv_synic_cleanup(). See commit 87c9741a38c4
where this elapsed time was a concern in large VMs.

Can the failure be detected earlier by doing a simple MSR read against
the paravisor for an MSR that's only available if Confidential VMBus is
implemented? Then "is_confidential" could be set correctly before
all the work is done, and the work would only be done once even
if Confidential VMBus isn't supported.

> +	if (ret)
> +		goto err_connect;
> +	return 0;
> +
> +err_connect:
> +	cpuhp_remove_state(hyperv_cpuhp_online);
> +	return -ENODEV;
> +err_alloc:
> +	hv_synic_free();
> +	return -ENOMEM;
> +}
> +
>  /*
>   * vmbus_bus_init -Main vmbus driver initialization routine.
>   *
> @@ -1365,8 +1440,7 @@ static void vmbus_percpu_work(struct work_struct *work)
>   */
>  static int vmbus_bus_init(void)
>  {
> -	int ret, cpu;
> -	struct work_struct __percpu *works;
> +	int ret;
> 
>  	ret = hv_init();
>  	if (ret != 0) {
> @@ -1401,41 +1475,21 @@ static int vmbus_bus_init(void)
>  		}
>  	}
> 
> -	ret = hv_synic_alloc();
> -	if (ret)
> -		goto err_alloc;
> -
> -	works = alloc_percpu(struct work_struct);
> -	if (!works) {
> -		ret = -ENOMEM;
> -		goto err_alloc;
> -	}
> -
>  	/*
> -	 * Initialize the per-cpu interrupt state and stimer state.
> -	 * Then connect to the host.
> +	 * Attempt to establish the confidential VMBus connection first if this VM is
> +	 * a hardware confidential VM, and the paravisor is present.
>  	 */
> -	cpus_read_lock();
> -	for_each_online_cpu(cpu) {
> -		struct work_struct *work = per_cpu_ptr(works, cpu);
> +	ret = -ENODEV;
> +	if (ms_hyperv.paravisor_present && (hv_isolation_type_tdx() || hv_isolation_type_snp())) {
> +		is_confidential = true;
> +		ret = vmbus_alloc_synic_and_connect();
> +		is_confidential = !ret;
> 
> -		INIT_WORK(work, vmbus_percpu_work);
> -		schedule_work_on(cpu, work);
> +		pr_info("VMBus is confidential: %d\n", is_confidential);
>  	}
> 
> -	for_each_online_cpu(cpu)
> -		flush_work(per_cpu_ptr(works, cpu));
> -
> -	/* Register the callbacks for possible CPU online/offline'ing */
> -	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
> -						   hv_synic_init, hv_synic_cleanup);
> -	cpus_read_unlock();
> -	free_percpu(works);
> -	if (ret < 0)
> -		goto err_alloc;
> -	hyperv_cpuhp_online = ret;
> -
> -	ret = vmbus_connect();
> +	if (!is_confidential)
> +		ret = vmbus_alloc_synic_and_connect();
>  	if (ret)
>  		goto err_connect;
> 
> @@ -1451,9 +1505,6 @@ static int vmbus_bus_init(void)
>  	return 0;
> 
>  err_connect:
> -	cpuhp_remove_state(hyperv_cpuhp_online);
> -err_alloc:
> -	hv_synic_free();
>  	if (vmbus_irq == -1) {
>  		hv_remove_vmbus_handler();
>  	} else {
> --
> 2.43.0