Message-ID:
 <KUZP153MB1444BE7FD66EA9CA9B4B9A97BE88A@KUZP153MB1444.APCP153.PROD.OUTLOOK.COM>
Date: Wed, 7 May 2025 10:19:30 +0000
From: Saurabh Singh Sengar <ssengar@...rosoft.com>
To: Naman Jain <namjain@...ux.microsoft.com>, KY Srinivasan
	<kys@...rosoft.com>, Haiyang Zhang <haiyangz@...rosoft.com>, Wei Liu
	<wei.liu@...nel.org>, Dexuan Cui <decui@...rosoft.com>
CC: Roman Kisel <romank@...ux.microsoft.com>, Anirudh Rayabharam
	<anrayabh@...ux.microsoft.com>, Saurabh Sengar <ssengar@...ux.microsoft.com>,
	Stanislav Kinsburskii <skinsburskii@...ux.microsoft.com>, Nuno Das Neves
	<nunodasneves@...ux.microsoft.com>, "linux-kernel@...r.kernel.org"
	<linux-kernel@...r.kernel.org>, "linux-hyperv@...r.kernel.org"
	<linux-hyperv@...r.kernel.org>
Subject: RE: [PATCH] Drivers: hv: Introduce mshv_vtl driver

> Provide an interface for a Virtual Machine Monitor such as OpenVMM, in its
> use as the OpenHCL paravisor, to control VTL0 (Virtual Trust Level 0).
> Expose devices and support IOCTLs for features such as VTL creation,
> VTL0 memory management, context switching, making hypercalls,
> mapping VTL0 address space into VTL2 userspace, and getting new VMBus
> messages and channel events in VTL2.
> 
> Co-developed-by: Roman Kisel <romank@...ux.microsoft.com>
> Signed-off-by: Roman Kisel <romank@...ux.microsoft.com>
> Co-developed-by: Saurabh Sengar <ssengar@...ux.microsoft.com>
> Signed-off-by: Saurabh Sengar <ssengar@...ux.microsoft.com>
> Signed-off-by: Naman Jain <namjain@...ux.microsoft.com>
> ---
> 
> OpenVMM : https://openvmm.dev/guide/
> 
> ---
>  drivers/hv/Kconfig          |   20 +
>  drivers/hv/Makefile         |    3 +
>  drivers/hv/hv.c             |    2 +
>  drivers/hv/hyperv_vmbus.h   |    1 +
>  drivers/hv/mshv_vtl.h       |   52 ++
>  drivers/hv/mshv_vtl_main.c  | 1749 +++++++++++++++++++++++++++++++++++
>  drivers/hv/vmbus_drv.c      |    3 +-
>  include/hyperv/hvgdk_mini.h |   81 ++
>  include/hyperv/hvhdk.h      |    1 +
>  include/uapi/linux/mshv.h   |   83 ++
>  10 files changed, 1994 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/hv/mshv_vtl.h
>  create mode 100644 drivers/hv/mshv_vtl_main.c
> 
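To make the interface described above concrete, here is a minimal userspace sketch (not part of the patch; it assumes the ioctl requests and capability constants come from the include/uapi/linux/mshv.h hunk in the diffstat, and error handling is reduced to perror):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/mshv.h>		/* ioctl numbers and caps from the uapi hunk */

int main(void)
{
	unsigned int cap = MSHV_CAP_CORE_API_STABLE;
	int mshv, vtl;

	/* Misc device registered by the driver ("mshv", mode 0600). */
	mshv = open("/dev/mshv", O_RDWR | O_CLOEXEC);
	if (mshv < 0) {
		perror("open /dev/mshv");
		return 1;
	}

	/* Probe a capability; mirrors mshv_ioctl_check_extension(). */
	if (ioctl(mshv, MSHV_CHECK_EXTENSION, &cap) < 0)
		perror("MSHV_CHECK_EXTENSION");

	/* Get the VTL fd; the run/register pages and VTL ioctls hang off it. */
	vtl = ioctl(mshv, MSHV_CREATE_VTL, 0);
	if (vtl < 0) {
		perror("MSHV_CREATE_VTL");
		return 1;
	}

	/* Enter VTL0 on the current CPU; returns on an intercept or a signal. */
	if (ioctl(vtl, MSHV_VTL_RETURN_TO_LOWER_VTL, 0) < 0)
		perror("MSHV_VTL_RETURN_TO_LOWER_VTL");

	close(vtl);
	close(mshv);
	return 0;
}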
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 6c1416167bd2..57dcfcb69b88 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -72,4 +72,24 @@ config MSHV_ROOT
> 
>  	  If unsure, say N.
> 
> +config MSHV_VTL
> +	bool "Microsoft Hyper-V VTL driver"
> +	depends on HYPERV && X86_64
> +	depends on TRANSPARENT_HUGEPAGE
> +	depends on OF
> +	# MTRRs are not per-VTL and are controlled by VTL0, so don't look at or mutate them.
> +	depends on !MTRR
> +	select CPUMASK_OFFSTACK
> +	select HYPERV_VTL_MODE
> +	default n
> +	help
> +	  Select this option to enable Hyper-V VTL driver support.
> +	  This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2
> +	  userspace to create VTLs and partitions, setup and manage VTL0 memory and
> +	  allow userspace to make direct hypercalls. This also allows to map VTL0's address
> +	  space to a usermode process in VTL2 and supports getting new VMBus messages and channel
> +	  events in VTL2.
> +
> +	  If unsure, say N.
> +
>  endmenu
> diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
> index 976189c725dc..5e785dae08cc 100644
> --- a/drivers/hv/Makefile
> +++ b/drivers/hv/Makefile
> @@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV)		+= hv_vmbus.o
>  obj-$(CONFIG_HYPERV_UTILS)	+= hv_utils.o
>  obj-$(CONFIG_HYPERV_BALLOON)	+= hv_balloon.o
>  obj-$(CONFIG_MSHV_ROOT)		+= mshv_root.o
> +obj-$(CONFIG_MSHV_VTL)          += mshv_vtl.o
> 
>  CFLAGS_hv_trace.o = -I$(src)
>  CFLAGS_hv_balloon.o = -I$(src)
> @@ -18,3 +19,5 @@ mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
>  # Code that must be built-in
>  obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o
>  obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
> +
> +mshv_vtl-y := mshv_vtl_main.o mshv_common.o
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index 308c8f279df8..11e8096fe840 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -25,6 +25,7 @@
> 
>  /* The one and only */
>  struct hv_context hv_context;
> +EXPORT_SYMBOL_GPL(hv_context);
> 
>  /*
>   * hv_init - Main initialization routine.
> @@ -93,6 +94,7 @@ int hv_post_message(union hv_connection_id connection_id,
> 
>  	return hv_result(status);
>  }
> +EXPORT_SYMBOL_GPL(hv_post_message);
> 
>  int hv_synic_alloc(void)
>  {
> diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
> index 0b450e53161e..b61f01fc1960 100644
> --- a/drivers/hv/hyperv_vmbus.h
> +++ b/drivers/hv/hyperv_vmbus.h
> @@ -32,6 +32,7 @@
>   */
>  #define HV_UTIL_NEGO_TIMEOUT 55
> 
> +void vmbus_isr(void);
> 
>  /* Definitions for the monitored notification facility */
>  union hv_monitor_trigger_group {
> diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h
> new file mode 100644
> index 000000000000..f350e4650d7b
> --- /dev/null
> +++ b/drivers/hv/mshv_vtl.h
> @@ -0,0 +1,52 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _MSHV_VTL_H
> +#define _MSHV_VTL_H
> +
> +#include <linux/mshv.h>
> +#include <linux/types.h>
> +#include <asm/fpu/types.h>
> +
> +struct mshv_vtl_cpu_context {
> +	union {
> +		struct {
> +			u64 rax;
> +			u64 rcx;
> +			u64 rdx;
> +			u64 rbx;
> +			u64 cr2;
> +			u64 rbp;
> +			u64 rsi;
> +			u64 rdi;
> +			u64 r8;
> +			u64 r9;
> +			u64 r10;
> +			u64 r11;
> +			u64 r12;
> +			u64 r13;
> +			u64 r14;
> +			u64 r15;
> +		};
> +		u64 gp_regs[16];
> +	};
> +
> +	struct fxregs_state fx_state;
> +};
> +
> +struct mshv_vtl_run {
> +	u32 cancel;
> +	u32 vtl_ret_action_size;
> +	u32 pad[2];
> +	char exit_message[MSHV_MAX_RUN_MSG_SIZE];
> +	union {
> +		struct mshv_vtl_cpu_context cpu_context;
> +
> +		/*
> +		 * Reserving room for the cpu context to grow and be
> +		 * able to maintain compat with user mode.
> +		 */
> +		char reserved[1024];
> +	};
> +	char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE];
> +};
> +
> +#endif /* _MSHV_VTL_H */
> diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
> new file mode 100644
> index 000000000000..95db29472fc8
> --- /dev/null
> +++ b/drivers/hv/mshv_vtl_main.c
> @@ -0,0 +1,1749 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2023, Microsoft Corporation.
> + *
> + * Author:
> + *   Roman Kisel <romank@...ux.microsoft.com>
> + *   Saurabh Sengar <ssengar@...ux.microsoft.com>
> + *   Naman Jain <namjain@...ux.microsoft.com>
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/miscdevice.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/pfn_t.h>
> +#include <linux/cpuhotplug.h>
> +#include <linux/count_zeros.h>
> +#include <linux/eventfd.h>
> +#include <linux/poll.h>
> +#include <linux/file.h>
> +#include <linux/vmalloc.h>
> +#include <asm/debugreg.h>
> +#include <asm/mshyperv.h>
> +#include <trace/events/ipi.h>
> +#include <uapi/asm/mtrr.h>
> +#include <uapi/linux/mshv.h>
> +#include <hyperv/hvhdk.h>
> +
> +#include "../../kernel/fpu/legacy.h"
> +#include "mshv.h"
> +
> +#include "mshv_vtl.h"
> +#include "hyperv_vmbus.h"
> +
> +MODULE_AUTHOR("Microsoft");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver");
> +
> +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL     0x1
> +#define MSHV_ENTRY_REASON_INTERRUPT          0x2
> +#define MSHV_ENTRY_REASON_INTERCEPT          0x3
> +
> +#define MAX_GUEST_MEM_SIZE	BIT_ULL(40)
> +#define MSHV_PG_OFF_CPU_MASK	0xFFFF
> +#define MSHV_REAL_OFF_SHIFT	16
> +#define MSHV_RUN_PAGE_OFFSET	0
> +#define MSHV_REG_PAGE_OFFSET	1
> +#define VTL2_VMBUS_SINT_INDEX	7
> +
> +static struct device *mem_dev;
> +
> +static struct tasklet_struct msg_dpc;
> +static wait_queue_head_t fd_wait_queue;
> +static bool has_message;
> +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT];
> +static DEFINE_MUTEX(flag_lock);
> +static bool __read_mostly mshv_has_reg_page;
> +
> +struct mshv_vtl_hvcall_fd {
> +	u64 allow_bitmap[2 * PAGE_SIZE];
> +	bool allow_map_intialized;
> +	/*
> +	 * Used to protect hvcall setup in IOCTLs
> +	 */
> +	struct mutex init_mutex;
> +	struct miscdevice *dev;
> +};
> +
> +struct mshv_vtl_poll_file {
> +	struct file *file;
> +	wait_queue_entry_t wait;
> +	wait_queue_head_t *wqh;
> +	poll_table pt;
> +	int cpu;
> +};
> +
> +struct mshv_vtl {
> +	struct device *module_dev;
> +	u64 id;
> +	refcount_t ref_count;
> +};
> +
> +union mshv_synic_overlay_page_msr {
> +	u64 as_u64;
> +	struct {
> +		u64 enabled: 1;
> +		u64 reserved: 11;
> +		u64 pfn: 52;
> +	};
> +};
> +
> +union hv_register_vsm_capabilities {
> +	u64 as_uint64;
> +	struct {
> +		u64 dr6_shared: 1;
> +		u64 mbec_vtl_mask: 16;
> +		u64 deny_lower_vtl_startup: 1;
> +		u64 supervisor_shadow_stack: 1;
> +		u64 hardware_hvpt_available: 1;
> +		u64 software_hvpt_available: 1;
> +		u64 hardware_hvpt_range_bits: 6;
> +		u64 intercept_page_available: 1;
> +		u64 return_action_available: 1;
> +		u64 reserved: 35;
> +	} __packed;
> +};
> +
> +union hv_register_vsm_page_offsets {
> +	struct {
> +		u64 vtl_call_offset : 12;
> +		u64 vtl_return_offset : 12;
> +		u64 reserved_mbz : 40;
> +	};
> +	u64 as_uint64;
> +} __packed;
> +
> +struct mshv_vtl_per_cpu {
> +	struct mshv_vtl_run *run;
> +	struct page *reg_page;
> +};
> +
> +static struct mutex mshv_vtl_poll_file_lock;
> +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets;
> +static union hv_register_vsm_capabilities mshv_vsm_capabilities;
> +
> +static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file);
> +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions);
> +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu);
> +
> +static const struct file_operations mshv_vtl_fops;
> +
> +static long
> +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev)
> +{
> +	struct mshv_vtl *vtl;
> +	struct file *file;
> +	int fd;
> +
> +	vtl = kzalloc(sizeof(*vtl), GFP_KERNEL);
> +	if (!vtl)
> +		return -ENOMEM;
> +
> +	fd = get_unused_fd_flags(O_CLOEXEC);
> +	if (fd < 0)
> +		return fd;
> +	file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops,
> +				  vtl, O_RDWR);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +	refcount_set(&vtl->ref_count, 1);
> +	vtl->module_dev = module_dev;
> +
> +	fd_install(fd, file);
> +
> +	return fd;
> +}
> +
> +static long
> +mshv_ioctl_check_extension(void __user *user_arg)
> +{
> +	u32 arg;
> +
> +	if (copy_from_user(&arg, user_arg, sizeof(arg)))
> +		return -EFAULT;
> +
> +	switch (arg) {
> +	case MSHV_CAP_CORE_API_STABLE:
> +		return 0;
> +	case MSHV_CAP_REGISTER_PAGE:
> +		return mshv_has_reg_page;
> +	case MSHV_CAP_VTL_RETURN_ACTION:
> +		return mshv_vsm_capabilities.return_action_available;
> +	case MSHV_CAP_DR6_SHARED:
> +		return mshv_vsm_capabilities.dr6_shared;
> +	}
> +
> +	return -EOPNOTSUPP;
> +}
> +
> +static long
> +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> +{
> +	struct miscdevice *misc = filp->private_data;
> +
> +	switch (ioctl) {
> +	case MSHV_CHECK_EXTENSION:
> +		return mshv_ioctl_check_extension((void __user *)arg);
> +	case MSHV_CREATE_VTL:
> +		return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device);
> +	}
> +
> +	return -ENOTTY;
> +}
> +
> +static const struct file_operations mshv_dev_fops = {
> +	.owner		= THIS_MODULE,
> +	.unlocked_ioctl	= mshv_dev_ioctl,
> +	.llseek		= noop_llseek,
> +};
> +
> +static struct miscdevice mshv_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "mshv",
> +	.fops = &mshv_dev_fops,
> +	.mode = 0600,
> +};
> +
> +static struct mshv_vtl_run *mshv_vtl_this_run(void)
> +{
> +	return *this_cpu_ptr(&mshv_vtl_per_cpu.run);
> +}
> +
> +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu)
> +{
> +	return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu);
> +}
> +
> +static struct page *mshv_vtl_cpu_reg_page(int cpu)
> +{
> +	return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu);
> +}
> +
> +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu)
> +{
> +	struct hv_register_assoc reg_assoc = {};
> +	union mshv_synic_overlay_page_msr overlay = {};
> +	struct page *reg_page;
> +	union hv_input_vtl vtl = { .as_uint8 = 0 };
> +
> +	reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL);
> +	if (!reg_page) {
> +		WARN(1, "failed to allocate register page\n");
> +		return;
> +	}
> +
> +	overlay.enabled = 1;
> +	overlay.pfn = page_to_phys(reg_page) >> HV_HYP_PAGE_SHIFT;
> +	reg_assoc.name = HV_X64_REGISTER_REG_PAGE;
> +	reg_assoc.value.reg64 = overlay.as_u64;
> +
> +	if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +				     1, vtl, &reg_assoc)) {
> +		WARN(1, "failed to setup register page\n");
> +		__free_page(reg_page);
> +		return;
> +	}
> +
> +	per_cpu->reg_page = reg_page;
> +	mshv_has_reg_page = true;
> +}
> +
> +static void mshv_vtl_synic_enable_regs(unsigned int cpu)
> +{
> +	union hv_synic_sint sint;
> +
> +	sint.as_uint64 = 0;
> +	sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> +	sint.masked = false;
> +	sint.auto_eoi = hv_recommend_using_aeoi();
> +
> +	/* Enable intercepts */
> +	if (!mshv_vsm_capabilities.intercept_page_available)
> +		hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
> +			   sint.as_uint64);
> +
> +	/* VTL2 Host VSP SINT is (un)masked when the user mode requests that */
> +}
> +
> +static int mshv_vtl_get_vsm_regs(void)
> +{
> +	struct hv_register_assoc registers[2];
> +	union hv_input_vtl input_vtl;
> +	int ret, count = 2;
> +
> +	input_vtl.as_uint8 = 0;
> +	registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS;
> +	registers[1].name = HV_REGISTER_VSM_CAPABILITIES;
> +
> +	ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +				       count, input_vtl, registers);
> +	if (ret)
> +		return ret;
> +
> +	mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64;
> +	mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64;
> +
> +	return ret;
> +}
> +
> +static int mshv_vtl_configure_vsm_partition(struct device *dev)
> +{
> +	union hv_register_vsm_partition_config config;
> +	struct hv_register_assoc reg_assoc;
> +	union hv_input_vtl input_vtl;
> +
> +	config.as_u64 = 0;
> +	config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK;
> +	config.enable_vtl_protection = 1;
> +	config.zero_memory_on_reset = 1;
> +	config.intercept_vp_startup = 1;
> +	config.intercept_cpuid_unimplemented = 1;
> +
> +	if (mshv_vsm_capabilities.intercept_page_available) {
> +		dev_dbg(dev, "using intercept page\n");
> +		config.intercept_page = 1;
> +	}
> +
> +	reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG;
> +	reg_assoc.value.reg64 = config.as_u64;
> +	input_vtl.as_uint8 = 0;
> +
> +	return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +				       1, input_vtl, &reg_assoc);
> +}
> +
> +static void mshv_vtl_vmbus_isr(void)
> +{
> +	struct hv_per_cpu_context *per_cpu;
> +	struct hv_message *msg;
> +	u32 message_type;
> +	union hv_synic_event_flags *event_flags;
> +	unsigned long word;
> +	int i, j;
> +	struct eventfd_ctx *eventfd;
> +
> +	per_cpu = this_cpu_ptr(hv_context.cpu_context);
> +	if (smp_processor_id() == 0) {
> +		msg = (struct hv_message *)per_cpu->synic_message_page + VTL2_VMBUS_SINT_INDEX;
> +		message_type = READ_ONCE(msg->header.message_type);
> +		if (message_type != HVMSG_NONE)
> +			tasklet_schedule(&msg_dpc);
> +	}
> +
> +	event_flags = (union hv_synic_event_flags *)per_cpu->synic_event_page +
> +			VTL2_VMBUS_SINT_INDEX;
> +	for (i = 0; i < HV_EVENT_FLAGS_LONG_COUNT; i++) {
> +		if (READ_ONCE(event_flags->flags[i])) {
> +			word = xchg(&event_flags->flags[i], 0);
> +			for_each_set_bit(j, &word, BITS_PER_LONG) {
> +				rcu_read_lock();
> +				eventfd = READ_ONCE(flag_eventfds[i * BITS_PER_LONG + j]);
> +				if (eventfd)
> +					eventfd_signal(eventfd);
> +				rcu_read_unlock();
> +			}
> +		}
> +	}
> +
> +	vmbus_isr();
> +}
> +
> +static int mshv_vtl_alloc_context(unsigned int cpu)
> +{
> +	struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu);
> +	struct page *run_page;
> +
> +	run_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +	if (!run_page)
> +		return -ENOMEM;
> +
> +	per_cpu->run = page_address(run_page);
> +	if (mshv_vsm_capabilities.intercept_page_available)
> +		mshv_vtl_configure_reg_page(per_cpu);
> +
> +	mshv_vtl_synic_enable_regs(cpu);
> +
> +	return 0;
> +}
> +
> +static int mshv_vtl_cpuhp_online;
> +
> +static int hv_vtl_setup_synic(void)
> +{
> +	int ret;
> +
> +	/* Use our isr to first filter out packets destined for userspace */
> +	hv_setup_vmbus_handler(mshv_vtl_vmbus_isr);
> +
> +	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online",
> +				mshv_vtl_alloc_context, NULL);
> +	if (ret < 0) {
> +		hv_remove_vmbus_handler();
> +		return ret;
> +	}
> +
> +	mshv_vtl_cpuhp_online = ret;
> +	return 0;
> +}
> +
> +static void hv_vtl_remove_synic(void)
> +{
> +	hv_remove_vmbus_handler();
> +	cpuhp_remove_state(mshv_vtl_cpuhp_online);
> +}
> +
> +static int vtl_get_vp_registers(u16 count,
> +				struct hv_register_assoc *registers)
> +{
> +	union hv_input_vtl input_vtl;
> +
> +	input_vtl.as_uint8 = 0;
> +	input_vtl.use_target_vtl = 1;
> +	return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +					count, input_vtl, registers);
> +}
> +
> +static int vtl_set_vp_registers(u16 count,
> +				struct hv_register_assoc *registers)
> +{
> +	union hv_input_vtl input_vtl;
> +
> +	input_vtl.as_uint8 = 0;
> +	input_vtl.use_target_vtl = 1;
> +	return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
> +					count, input_vtl, registers);
> +}
> +
> +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
> +{
> +	struct mshv_vtl_ram_disposition vtl0_mem;
> +	struct dev_pagemap *pgmap;
> +	void *addr;
> +
> +	if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem)))
> +		return -EFAULT;
> +
> +	if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) {
> +		dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n",
> +			vtl0_mem.start_pfn, vtl0_mem.last_pfn);
> +		return -EFAULT;
> +	}
> +
> +	pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
> +	if (!pgmap)
> +		return -ENOMEM;
> +
> +	pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn);
> +	pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
> +	pgmap->nr_range = 1;
> +	pgmap->type = MEMORY_DEVICE_GENERIC;
> +
> +	/*
> +	 * Determine the highest page order that can be used for the range.
> +	 * This works best when the range is aligned; i.e. start and length.
> +	 */
> +	pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn);
> +	dev_dbg(vtl->module_dev,
> +		"Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n",
> +		vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift);
> +
> +	addr = devm_memremap_pages(mem_dev, pgmap);
> +	if (IS_ERR(addr)) {
> +		dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr));
> +		kfree(pgmap);
> +		return -EFAULT;
> +	}
> +
> +	/* Don't free pgmap, since it has to stick around until the memory
> +	 * is unmapped, which will never happen as there is no scenario
> +	 * where VTL0 can be released/shutdown without bringing down VTL2.
> +	 */
> +	return 0;
> +}
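As a worked example of the alignment math in the comment above (the numbers are made up): with start_pfn = 0x180000 (6 GiB / 4 KiB) and last_pfn = 0x200000 (8 GiB / 4 KiB), start_pfn | last_pfn = 0x380000 has 19 trailing zero bits, so vmemmap_shift becomes 19 and the page metadata is populated in 2^19-page (2 GiB) units; an unaligned start such as 0x180001 would drop the shift to 0 and fall back to 4 KiB granularity.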
> +
> +static void mshv_vtl_cancel(int cpu)
> +{
> +	int here = get_cpu();
> +
> +	if (here != cpu) {
> +		if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1))
> +			smp_send_reschedule(cpu);
> +	} else {
> +		WRITE_ONCE(mshv_vtl_this_run()->cancel, 1);
> +	}
> +	put_cpu();
> +}
> +
> +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
> +{
> +	struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait);
> +
> +	mshv_vtl_cancel(poll_file->cpu);
> +	return 0;
> +}
> +
> +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
> +{
> +	struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt);
> +
> +	WARN_ON(poll_file->wqh);
> +	poll_file->wqh = wqh;
> +	add_wait_queue(wqh, &poll_file->wait);
> +}
> +
> +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input)
> +{
> +	struct file *file, *old_file;
> +	struct mshv_vtl_poll_file *poll_file;
> +	struct mshv_vtl_set_poll_file input;
> +
> +	if (copy_from_user(&input, user_input, sizeof(input)))
> +		return -EFAULT;
> +
> +	if (!cpu_online(input.cpu))
> +		return -EINVAL;
> +
> +	file = NULL;
> +	if (input.fd >= 0) {
> +		file = fget(input.fd);
> +		if (!file)
> +			return -EBADFD;
> +	}
> +
> +	poll_file = per_cpu_ptr(&mshv_vtl_poll_file, input.cpu);
> +
> +	mutex_lock(&mshv_vtl_poll_file_lock);
> +
> +	if (poll_file->wqh)
> +		remove_wait_queue(poll_file->wqh, &poll_file->wait);
> +	poll_file->wqh = NULL;
> +
> +	old_file = poll_file->file;
> +	poll_file->file = file;
> +	poll_file->cpu = input.cpu;
> +
> +	if (file) {
> +		init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake);
> +		init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc);
> +		vfs_poll(file, &poll_file->pt);
> +	}
> +
> +	mutex_unlock(&mshv_vtl_poll_file_lock);
> +
> +	if (old_file)
> +		fput(old_file);
> +
> +	return 0;
> +}
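For illustration, the poll-file plumbing above can be driven from userspace roughly as follows (a sketch only; the field names mirror the handler above, and the exact struct layout and ioctl number come from the uapi header):

#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mshv.h>

/*
 * Sketch: arm an eventfd as the poll file for one CPU, so that signalling
 * the eventfd from another thread kicks that CPU out of VTL0 (the wakeup
 * lands in mshv_vtl_poll_file_wake() and ends up in mshv_vtl_cancel()).
 * 'vtl_fd' is the fd returned by MSHV_CREATE_VTL.
 */
static int arm_cancel_eventfd(int vtl_fd, unsigned int cpu)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;

	struct mshv_vtl_set_poll_file args = { .cpu = cpu, .fd = efd };

	if (ioctl(vtl_fd, MSHV_VTL_SET_POLL_FILE, &args) < 0) {
		close(efd);
		return -1;
	}

	return efd;	/* write(efd, ...) here cancels a pending VTL0 run */
}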
> +
> +static int mshv_vtl_set_reg(struct hv_register_assoc *regs)
> +{
> +	u64 reg64;
> +	enum hv_register_name gpr_name;
> +
> +	gpr_name = regs->name;
> +	reg64 = regs->value.reg64;
> +
> +	switch (gpr_name) {
> +	case HV_X64_REGISTER_DR0:
> +		native_set_debugreg(0, reg64);
> +		break;
> +	case HV_X64_REGISTER_DR1:
> +		native_set_debugreg(1, reg64);
> +		break;
> +	case HV_X64_REGISTER_DR2:
> +		native_set_debugreg(2, reg64);
> +		break;
> +	case HV_X64_REGISTER_DR3:
> +		native_set_debugreg(3, reg64);
> +		break;
> +	case HV_X64_REGISTER_DR6:
> +		if (!mshv_vsm_capabilities.dr6_shared)
> +			goto hypercall;
> +		native_set_debugreg(6, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_CAP:
> +		wrmsrl(MSR_MTRRcap, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_DEF_TYPE:
> +		wrmsrl(MSR_MTRRdefType, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0:
> +		wrmsrl(MTRRphysBase_MSR(0), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1:
> +		wrmsrl(MTRRphysBase_MSR(1), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2:
> +		wrmsrl(MTRRphysBase_MSR(2), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3:
> +		wrmsrl(MTRRphysBase_MSR(3), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4:
> +		wrmsrl(MTRRphysBase_MSR(4), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5:
> +		wrmsrl(MTRRphysBase_MSR(5), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6:
> +		wrmsrl(MTRRphysBase_MSR(6), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7:
> +		wrmsrl(MTRRphysBase_MSR(7), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8:
> +		wrmsrl(MTRRphysBase_MSR(8), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9:
> +		wrmsrl(MTRRphysBase_MSR(9), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA:
> +		wrmsrl(MTRRphysBase_MSR(0xa), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB:
> +		wrmsrl(MTRRphysBase_MSR(0xb), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC:
> +		wrmsrl(MTRRphysBase_MSR(0xc), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASED:
> +		wrmsrl(MTRRphysBase_MSR(0xd), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE:
> +		wrmsrl(MTRRphysBase_MSR(0xe), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF:
> +		wrmsrl(MTRRphysBase_MSR(0xf), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0:
> +		wrmsrl(MTRRphysMask_MSR(0), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1:
> +		wrmsrl(MTRRphysMask_MSR(1), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2:
> +		wrmsrl(MTRRphysMask_MSR(2), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3:
> +		wrmsrl(MTRRphysMask_MSR(3), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4:
> +		wrmsrl(MTRRphysMask_MSR(4), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5:
> +		wrmsrl(MTRRphysMask_MSR(5), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6:
> +		wrmsrl(MTRRphysMask_MSR(6), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7:
> +		wrmsrl(MTRRphysMask_MSR(7), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8:
> +		wrmsrl(MTRRphysMask_MSR(8), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9:
> +		wrmsrl(MTRRphysMask_MSR(9), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA:
> +		wrmsrl(MTRRphysMask_MSR(0xa), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB:
> +		wrmsrl(MTRRphysMask_MSR(0xb), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC:
> +		wrmsrl(MTRRphysMask_MSR(0xc), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD:
> +		wrmsrl(MTRRphysMask_MSR(0xd), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE:
> +		wrmsrl(MTRRphysMask_MSR(0xe), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF:
> +		wrmsrl(MTRRphysMask_MSR(0xf), reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX64K00000:
> +		wrmsrl(MSR_MTRRfix64K_00000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX16K80000:
> +		wrmsrl(MSR_MTRRfix16K_80000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX16KA0000:
> +		wrmsrl(MSR_MTRRfix16K_A0000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KC0000:
> +		wrmsrl(MSR_MTRRfix4K_C0000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KC8000:
> +		wrmsrl(MSR_MTRRfix4K_C8000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KD0000:
> +		wrmsrl(MSR_MTRRfix4K_D0000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KD8000:
> +		wrmsrl(MSR_MTRRfix4K_D8000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KE0000:
> +		wrmsrl(MSR_MTRRfix4K_E0000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KE8000:
> +		wrmsrl(MSR_MTRRfix4K_E8000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KF0000:
> +		wrmsrl(MSR_MTRRfix4K_F0000, reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KF8000:
> +		wrmsrl(MSR_MTRRfix4K_F8000, reg64);
> +		break;
> +
> +	default:
> +		goto hypercall;
> +	}
> +
> +	return 0;
> +
> +hypercall:
> +	return 1;
> +}
> +
> +static int mshv_vtl_get_reg(struct hv_register_assoc *regs)
> +{
> +	u64 *reg64;
> +	enum hv_register_name gpr_name;
> +
> +	gpr_name = regs->name;
> +	reg64 = (u64 *)&regs->value.reg64;
> +
> +	switch (gpr_name) {
> +	case HV_X64_REGISTER_DR0:
> +		*reg64 = native_get_debugreg(0);
> +		break;
> +	case HV_X64_REGISTER_DR1:
> +		*reg64 = native_get_debugreg(1);
> +		break;
> +	case HV_X64_REGISTER_DR2:
> +		*reg64 = native_get_debugreg(2);
> +		break;
> +	case HV_X64_REGISTER_DR3:
> +		*reg64 = native_get_debugreg(3);
> +		break;
> +	case HV_X64_REGISTER_DR6:
> +		if (!mshv_vsm_capabilities.dr6_shared)
> +			goto hypercall;
> +		*reg64 = native_get_debugreg(6);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_CAP:
> +		rdmsrl(MSR_MTRRcap, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_DEF_TYPE:
> +		rdmsrl(MSR_MTRRdefType, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0:
> +		rdmsrl(MTRRphysBase_MSR(0), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1:
> +		rdmsrl(MTRRphysBase_MSR(1), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2:
> +		rdmsrl(MTRRphysBase_MSR(2), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3:
> +		rdmsrl(MTRRphysBase_MSR(3), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4:
> +		rdmsrl(MTRRphysBase_MSR(4), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5:
> +		rdmsrl(MTRRphysBase_MSR(5), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6:
> +		rdmsrl(MTRRphysBase_MSR(6), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7:
> +		rdmsrl(MTRRphysBase_MSR(7), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8:
> +		rdmsrl(MTRRphysBase_MSR(8), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9:
> +		rdmsrl(MTRRphysBase_MSR(9), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA:
> +		rdmsrl(MTRRphysBase_MSR(0xa), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB:
> +		rdmsrl(MTRRphysBase_MSR(0xb), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC:
> +		rdmsrl(MTRRphysBase_MSR(0xc), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASED:
> +		rdmsrl(MTRRphysBase_MSR(0xd), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE:
> +		rdmsrl(MTRRphysBase_MSR(0xe), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF:
> +		rdmsrl(MTRRphysBase_MSR(0xf), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0:
> +		rdmsrl(MTRRphysMask_MSR(0), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1:
> +		rdmsrl(MTRRphysMask_MSR(1), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2:
> +		rdmsrl(MTRRphysMask_MSR(2), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3:
> +		rdmsrl(MTRRphysMask_MSR(3), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4:
> +		rdmsrl(MTRRphysMask_MSR(4), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5:
> +		rdmsrl(MTRRphysMask_MSR(5), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6:
> +		rdmsrl(MTRRphysMask_MSR(6), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7:
> +		rdmsrl(MTRRphysMask_MSR(7), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8:
> +		rdmsrl(MTRRphysMask_MSR(8), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9:
> +		rdmsrl(MTRRphysMask_MSR(9), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA:
> +		rdmsrl(MTRRphysMask_MSR(0xa), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB:
> +		rdmsrl(MTRRphysMask_MSR(0xb), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC:
> +		rdmsrl(MTRRphysMask_MSR(0xc), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD:
> +		rdmsrl(MTRRphysMask_MSR(0xd), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE:
> +		rdmsrl(MTRRphysMask_MSR(0xe), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF:
> +		rdmsrl(MTRRphysMask_MSR(0xf), *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX64K00000:
> +		rdmsrl(MSR_MTRRfix64K_00000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX16K80000:
> +		rdmsrl(MSR_MTRRfix16K_80000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX16KA0000:
> +		rdmsrl(MSR_MTRRfix16K_A0000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KC0000:
> +		rdmsrl(MSR_MTRRfix4K_C0000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KC8000:
> +		rdmsrl(MSR_MTRRfix4K_C8000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KD0000:
> +		rdmsrl(MSR_MTRRfix4K_D0000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KD8000:
> +		rdmsrl(MSR_MTRRfix4K_D8000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KE0000:
> +		rdmsrl(MSR_MTRRfix4K_E0000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KE8000:
> +		rdmsrl(MSR_MTRRfix4K_E8000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KF0000:
> +		rdmsrl(MSR_MTRRfix4K_F0000, *reg64);
> +		break;
> +	case HV_X64_REGISTER_MSR_MTRR_FIX4KF8000:
> +		rdmsrl(MSR_MTRRfix4K_F8000, *reg64);
> +		break;
> +
> +	default:
> +		goto hypercall;
> +	}
> +
> +	return 0;
> +
> +hypercall:
> +	return 1;
> +}
> +
> +static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
> +{
> +	struct hv_vp_assist_page *hvp;
> +	u64 hypercall_addr;
> +
> +	register u64 r8 asm("r8");
> +	register u64 r9 asm("r9");
> +	register u64 r10 asm("r10");
> +	register u64 r11 asm("r11");
> +	register u64 r12 asm("r12");
> +	register u64 r13 asm("r13");
> +	register u64 r14 asm("r14");
> +	register u64 r15 asm("r15");
> +
> +	hvp = hv_vp_assist_page[smp_processor_id()];
> +
> +	/*
> +	 * Process signal event direct set in the run page, if any.
> +	 */
> +	if (mshv_vsm_capabilities.return_action_available) {
> +		u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);
> +
> +		WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);
> +
> +		/*
> +		 * Hypervisor will take care of clearing out the actions
> +		 * set in the assist page.
> +		 */
> +		memcpy(hvp->vtl_ret_actions,
> +		       mshv_vtl_this_run()->vtl_ret_actions,
> +		       min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
> +	}
> +
> +	hvp->vtl_ret_x64rax = vtl0->rax;
> +	hvp->vtl_ret_x64rcx = vtl0->rcx;
> +
> +	hypercall_addr = (u64)((u8 *)hv_hypercall_pg + mshv_vsm_page_offsets.vtl_return_offset);
> +
> +	kernel_fpu_begin_mask(0);
> +	fxrstor(&vtl0->fx_state);
> +	native_write_cr2(vtl0->cr2);
> +	r8 = vtl0->r8;
> +	r9 = vtl0->r9;
> +	r10 = vtl0->r10;
> +	r11 = vtl0->r11;
> +	r12 = vtl0->r12;
> +	r13 = vtl0->r13;
> +	r14 = vtl0->r14;
> +	r15 = vtl0->r15;
> +
> +	asm __volatile__ (	\
> +	/* Save rbp pointer to the lower VTL, keep the stack 16-byte aligned */
> +		"pushq	%%rbp\n"
> +		"pushq	%%rcx\n"
> +	/* Restore the lower VTL's rbp */
> +		"movq	(%%rcx), %%rbp\n"
> +	/* Load return kind into rcx (HV_VTL_RETURN_INPUT_NORMAL_RETURN == 0) */
> +		"xorl	%%ecx, %%ecx\n"
> +	/* Transition to the lower VTL */
> +		CALL_NOSPEC
> +	/* Save VTL0's rax and rcx temporarily on 16-byte aligned stack */
> +		"pushq	%%rax\n"
> +		"pushq	%%rcx\n"
> +	/* Restore pointer to lower VTL rbp */
> +		"movq	16(%%rsp), %%rax\n"
> +	/* Save the lower VTL's rbp */
> +		"movq	%%rbp, (%%rax)\n"
> +	/* Restore saved registers */
> +		"movq	8(%%rsp), %%rax\n"
> +		"movq	24(%%rsp), %%rbp\n"
> +		"addq	$32, %%rsp\n"
> +
> +		: "=a"(vtl0->rax), "=c"(vtl0->rcx),
> +		  "+d"(vtl0->rdx), "+b"(vtl0->rbx), "+S"(vtl0->rsi), "+D"(vtl0->rdi),
> +		  "+r"(r8), "+r"(r9), "+r"(r10), "+r"(r11),
> +		  "+r"(r12), "+r"(r13), "+r"(r14), "+r"(r15)
> +		: THUNK_TARGET(hypercall_addr), "c"(&vtl0->rbp)
> +		: "cc", "memory");
> +
> +	vtl0->r8 = r8;
> +	vtl0->r9 = r9;
> +	vtl0->r10 = r10;
> +	vtl0->r11 = r11;
> +	vtl0->r12 = r12;
> +	vtl0->r13 = r13;
> +	vtl0->r14 = r14;
> +	vtl0->r15 = r15;
> +	vtl0->cr2 = native_read_cr2();
> +
> +	fxsave(&vtl0->fx_state);
> +	kernel_fpu_end();
> +}
> +
> +/*
> + * Returning to a lower VTL treats the base pointer register
> + * as a general purpose one. Without adding this, objtool produces
> + * a warning.
> + */
> +STACK_FRAME_NON_STANDARD(mshv_vtl_return);
> +
> +static bool mshv_vtl_process_intercept(void)
> +{
> +	struct hv_per_cpu_context *mshv_cpu;
> +	void *synic_message_page;
> +	struct hv_message *msg;
> +	u32 message_type;
> +
> +	mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
> +	synic_message_page = mshv_cpu->synic_message_page;
> +	if (unlikely(!synic_message_page))
> +		return true;
> +
> +	msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
> +	message_type = READ_ONCE(msg->header.message_type);
> +	if (message_type == HVMSG_NONE)
> +		return true;
> +
> +	memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
> +	vmbus_signal_eom(msg, message_type);
> +	return false;
> +}
> +
> +static int mshv_vtl_ioctl_return_to_lower_vtl(void)
> +{
> +	preempt_disable();
> +	for (;;) {
> +		const unsigned long VTL0_WORK = _TIF_SIGPENDING | _TIF_NEED_RESCHED |
> +						_TIF_NOTIFY_RESUME | _TIF_NOTIFY_SIGNAL;
> +		unsigned long ti_work;
> +		u32 cancel;
> +		unsigned long irq_flags;
> +		struct hv_vp_assist_page *hvp;
> +		int ret;
> +
> +		local_irq_save(irq_flags);
> +		ti_work = READ_ONCE(current_thread_info()->flags);
> +		cancel = READ_ONCE(mshv_vtl_this_run()->cancel);
> +		if (unlikely((ti_work & VTL0_WORK) || cancel)) {
> +			local_irq_restore(irq_flags);
> +			preempt_enable();
> +			if (cancel)
> +				ti_work |= _TIF_SIGPENDING;
> +			ret = mshv_do_pre_guest_mode_work(ti_work);
> +			if (ret)
> +				return ret;
> +			preempt_disable();
> +			continue;
> +		}
> +
> +		mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
> +		local_irq_restore(irq_flags);
> +
> +		hvp = hv_vp_assist_page[smp_processor_id()];
> +		this_cpu_inc(num_vtl0_transitions);
> +		switch (hvp->vtl_entry_reason) {
> +		case MSHV_ENTRY_REASON_INTERRUPT:
> +			if (!mshv_vsm_capabilities.intercept_page_available &&
> +			    likely(!mshv_vtl_process_intercept()))
> +				goto done;
> +			break;
> +
> +		case MSHV_ENTRY_REASON_INTERCEPT:
> +			WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
> +			memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
> +			       sizeof(hvp->intercept_message));
> +			goto done;
> +
> +		default:
> +			panic("unknown entry reason: %d", hvp->vtl_entry_reason);
> +		}
> +	}
> +
> +done:
> +	preempt_enable();
> +	return 0;
> +}
> +
> +static long
> +mshv_vtl_ioctl_get_set_regs(void __user *user_args, bool set)
> +{
> +	struct mshv_vp_registers args;
> +	struct hv_register_assoc *registers;
> +	long ret;
> +
> +	if (copy_from_user(&args, user_args, sizeof(args)))
> +		return -EFAULT;
> +
> +	if (args.count == 0 || args.count > MSHV_VP_MAX_REGISTERS)
> +		return -EINVAL;
> +
> +	registers = kmalloc_array(args.count,
> +				  sizeof(*registers),
> +				  GFP_KERNEL);
> +	if (!registers)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(registers, (void __user *)args.regs_ptr,
> +			   sizeof(*registers) * args.count)) {
> +		ret = -EFAULT;
> +		goto free_return;
> +	}
> +
> +	if (set) {
> +		ret = mshv_vtl_set_reg(registers);
> +		if (!ret)
> +			goto free_return; /* No need of hypercall */
> +		ret = vtl_set_vp_registers(args.count, registers);
> +
> +	} else {
> +		ret = mshv_vtl_get_reg(registers);
> +		if (!ret)
> +			goto copy_args; /* No need of hypercall */
> +		ret = vtl_get_vp_registers(args.count, registers);
> +		if (ret)
> +			goto free_return;
> +
> +copy_args:
> +		if (copy_to_user((void __user *)args.regs_ptr, registers,
> +				 sizeof(*registers) * args.count))
> +			ret = -EFAULT;
> +	}
> +
> +free_return:
> +	kfree(registers);
> +	return ret;
> +}
> +
> +static inline long
> +mshv_vtl_ioctl_set_regs(void __user *user_args)
> +{
> +	return mshv_vtl_ioctl_get_set_regs(user_args, true);
> +}
> +
> +static inline long
> +mshv_vtl_ioctl_get_regs(void __user *user_args)
> +{
> +	return mshv_vtl_ioctl_get_set_regs(user_args, false);
> +}
> +
> +static long
> +mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> +{
> +	long ret;
> +	struct mshv_vtl *vtl = filp->private_data;
> +
> +	switch (ioctl) {
> +	case MSHV_VTL_SET_POLL_FILE:
> +		ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file *)arg);
> +		break;
> +	case MSHV_GET_VP_REGISTERS:
> +		ret = mshv_vtl_ioctl_get_regs((void __user *)arg);
> +		break;
> +	case MSHV_SET_VP_REGISTERS:
> +		ret = mshv_vtl_ioctl_set_regs((void __user *)arg);
> +		break;
> +	case MSHV_VTL_RETURN_TO_LOWER_VTL:
> +		ret = mshv_vtl_ioctl_return_to_lower_vtl();
> +		break;
> +	case MSHV_VTL_ADD_VTL0_MEMORY:
> +		ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg);
> +		break;
> +	default:
> +		dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
> +		ret = -ENOTTY;
> +	}
> +
> +	return ret;
> +}
> +
> +static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
> +{
> +	struct page *page;
> +	int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
> +	int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;
> +
> +	if (!cpu_online(cpu))
> +		return VM_FAULT_SIGBUS;
> +
> +	if (real_off == MSHV_RUN_PAGE_OFFSET) {
> +		page = virt_to_page(mshv_vtl_cpu_run(cpu));
> +	} else if (real_off == MSHV_REG_PAGE_OFFSET) {
> +		if (!mshv_has_reg_page)
> +			return VM_FAULT_SIGBUS;
> +		page = mshv_vtl_cpu_reg_page(cpu);
> +	} else {
> +		return VM_FAULT_NOPAGE;
> +	}
> +
> +	get_page(page);
> +	vmf->page = page;
> +
> +	return 0;
> +}
> +
> +static const struct vm_operations_struct mshv_vtl_vm_ops = {
> +	.fault = mshv_vtl_fault,
> +};
> +
> +static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +	vma->vm_ops = &mshv_vtl_vm_ops;
> +	return 0;
> +}
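A sketch of how userspace might map a CPU's run page, following the pgoff encoding in mshv_vtl_fault() above (illustrative only; the 16-bit CPU field and the run/register page offsets are taken from the defines at the top of this file):

#include <sys/mman.h>
#include <unistd.h>

/*
 * Sketch: map the run page of one CPU from the fd returned by
 * MSHV_CREATE_VTL.  mshv_vtl_fault() decodes vmf->pgoff as
 * (real_off << MSHV_REAL_OFF_SHIFT) | cpu, with real_off 0 for the run
 * page and 1 for the register page, so userspace picks the byte offset
 * accordingly.  The mapped page holds struct mshv_vtl_run (cancel flag,
 * exit_message, cpu_context, vtl_ret_actions) from mshv_vtl.h above.
 */
static void *map_run_page(int vtl_fd, unsigned int cpu)
{
	long psz = sysconf(_SC_PAGESIZE);
	off_t off = (off_t)((0UL << 16) | (cpu & 0xFFFF)) * psz;	/* run page */

	/* For the register page: off = (off_t)((1UL << 16) | cpu) * psz */
	return mmap(NULL, psz, PROT_READ | PROT_WRITE, MAP_SHARED, vtl_fd, off);
}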
> +
> +static int mshv_vtl_release(struct inode *inode, struct file *filp)
> +{
> +	struct mshv_vtl *vtl = filp->private_data;
> +
> +	kfree(vtl);
> +
> +	return 0;
> +}
> +
> +static const struct file_operations mshv_vtl_fops = {
> +	.owner = THIS_MODULE,
> +	.unlocked_ioctl = mshv_vtl_ioctl,
> +	.release = mshv_vtl_release,
> +	.mmap = mshv_vtl_mmap,
> +};
> +
> +static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask)
> +{
> +	union hv_synic_sint sint;
> +
> +	sint.as_uint64 = 0;
> +	sint.vector = HYPERVISOR_CALLBACK_VECTOR;
> +	sint.masked = (*mask != 0);
> +	sint.auto_eoi = hv_recommend_using_aeoi();
> +
> +	hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX,
> +		   sint.as_uint64);
> +
> +	if (!sint.masked)
> +		pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
> +	else
> +		pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
> +}
> +
> +static void mshv_vtl_read_remote(void *buffer)
> +{
> +	struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
> +	struct hv_message *msg = (struct hv_message *)mshv_cpu->synic_message_page +
> +					VTL2_VMBUS_SINT_INDEX;
> +	u32 message_type = READ_ONCE(msg->header.message_type);
> +
> +	WRITE_ONCE(has_message, false);
> +	if (message_type == HVMSG_NONE)
> +		return;
> +
> +	memcpy(buffer, msg, sizeof(*msg));
> +	vmbus_signal_eom(msg, message_type);
> +}
> +
> +static bool vtl_synic_mask_vmbus_sint_masked = true;
> +
> +static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size,
> +				  loff_t *offset)
> +{
> +	struct hv_message msg = {};
> +	int ret;
> +
> +	if (size < sizeof(msg))
> +		return -EINVAL;
> +
> +	for (;;) {
> +		smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true);
> +		if (msg.header.message_type != HVMSG_NONE)
> +			break;
> +
> +		if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
> +			return 0; /* EOF */
> +
> +		if (filp->f_flags & O_NONBLOCK)
> +			return -EAGAIN;
> +
> +		ret = wait_event_interruptible(fd_wait_queue,
> +					       READ_ONCE(has_message) ||
> +					       READ_ONCE(vtl_synic_mask_vmbus_sint_masked));
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (copy_to_user(arg, &msg, sizeof(msg)))
> +		return -EFAULT;
> +
> +	return sizeof(msg);
> +}
> +
> +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait)
> +{
> +	__poll_t mask = 0;
> +
> +	poll_wait(filp, &fd_wait_queue, wait);
> +	if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
> +		mask |= EPOLLIN | EPOLLRDNORM;
> +
> +	return mask;
> +}
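A sketch of the consumer side of this read/poll interface (illustrative only; how the SINT fd itself is obtained is outside the quoted excerpt, and the 256-byte message size is the hypervisor's struct hv_message):

#include <poll.h>
#include <unistd.h>

/*
 * Sketch: drain VMBus messages relayed through the SINT fd.  Each
 * successful read() returns one 256-byte message, 0 means the SINT was
 * masked (treated as EOF), and -1/EAGAIN means nothing is pending on a
 * non-blocking fd.
 */
static int drain_sint_messages(int sint_fd)
{
	unsigned char msg[256];

	for (;;) {
		struct pollfd pfd = { .fd = sint_fd, .events = POLLIN };
		ssize_t n;

		if (poll(&pfd, 1, -1) < 0)
			return -1;

		n = read(sint_fd, msg, sizeof(msg));
		if (n <= 0)
			return (int)n;	/* 0: masked/EOF, -1: error or EAGAIN */

		/* hand the message off to the VMM's VMBus relay here */
	}
}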
> +
> +static void mshv_vtl_sint_on_msg_dpc(unsigned long data)
> +{
> +	WRITE_ONCE(has_message, true);
> +	wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
> +}
> +
> +static int mshv_vtl_sint_ioctl_post_message(struct mshv_vtl_sint_post_msg __user *arg)
> +{
> +	struct mshv_vtl_sint_post_msg message;
> +	u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT];
> +
> +	if (copy_from_user(&message, arg, sizeof(message)))
> +		return -EFAULT;
> +	if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
> +		return -EINVAL;
> +	if (copy_from_user(payload, (void __user *)message.payload_ptr,
> +			   message.payload_size))
> +		return -EFAULT;
> +
> +	return hv_post_message((union

This function is defined in a separate file which can be built as an independent module; that will cause a
problem at link time. Try building with CONFIG_HYPERV=m and check.
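
To spell out the concern a bit: MSHV_VTL is a bool, so mshv_vtl.o is always linked into vmlinux, while hv_post_message() lives in hv.c, which is part of hv_vmbus and may be built as a module; with CONFIG_HYPERV=m the built-in object references a symbol that only exists in the module. One conceivable way to express the constraint in Kconfig (a sketch only, not a concrete proposal) would be:

	config MSHV_VTL
		bool "Microsoft Hyper-V VTL driver"
		# keep the VMBus core built-in so its symbols are in vmlinux
		depends on HYPERV=y && X86_64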

- Saurabh
