lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <MW2PR2101MB105279C034494D505E47EC15D7F61@MW2PR2101MB1052.namprd21.prod.outlook.com>
Date:   Sun, 29 Nov 2020 18:29:55 +0000
From:   Michael Kelley <mikelley@...rosoft.com>
To:     "Andrea Parri (Microsoft)" <parri.andrea@...il.com>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "linux-hyperv@...r.kernel.org" <linux-hyperv@...r.kernel.org>
CC:     KY Srinivasan <kys@...rosoft.com>,
        Haiyang Zhang <haiyangz@...rosoft.com>,
        Stephen Hemminger <sthemmin@...rosoft.com>,
        Wei Liu <wei.liu@...nel.org>
Subject: RE: [PATCH] Drivers: hv: vmbus: Introduce the
 CHANNELMSG_MODIFYCHANNEL_RESPONSE message type

From: Andrea Parri (Microsoft) <parri.andrea@...il.com> Sent: Thursday, November 26, 2020 11:12 AM
> 
> Quoting from commit 7527810573436f ("Drivers: hv: vmbus: Introduce
> the CHANNELMSG_MODIFYCHANNEL message type"),
> 
>   "[...] Hyper-V can *not* currently ACK CHANNELMSG_MODIFYCHANNEL
>    messages with the promise that (after the ACK is sent) the
>    channel won't send any more interrupts to the "old" CPU.
> 
>    The peculiarity of the CHANNELMSG_MODIFYCHANNEL messages is
>    problematic if the user want to take a CPU offline, since we
>    don't want to take a CPU offline (and, potentially, "lose"
>    channel interrupts on such CPU) if the host is still processing
>    a CHANNELMSG_MODIFYCHANNEL message associated to that CPU."
> 
> Introduce the CHANNELMSG_MODIFYCHANNEL_RESPONSE(24) message type,
> which embodies the type of the CHANNELMSG_MODIFYCHANNEL ACK.

I feel like the commit message needs a bit more detail about the new
functionality that is added, including introducing the new VMbus protocol
version 5.3, and then waiting for the response message when running
with that protocol version of later.   Also, when taking a CPU offline, new
functionality ensures that there are no pending channel interrupts for
that CPU.

Could/should this patch be broken into multiple patches?

1)  Introduce and negotiate VMbus protocol version 5.3.   Note that just
negotiating version 5.3 should be safe because any ACK messages will
be cleanly ignored by the old code.
2)  Check for pending channel interrupts before taking a CPU offline.
Wouldn't this check be valid even with the existing code where there is no
ACK message?  The old code is, in effect, assuming that enough time has
passed such that the modify channel message has been processed.
3)  Code to wait for an ACK message, and code to receive and process an
ACK message.


> 
> Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@...il.com>
> ---
>  drivers/hv/channel.c      | 108 ++++++++++++++++++++++++++++++++------
>  drivers/hv/channel_mgmt.c |  42 +++++++++++++++
>  drivers/hv/connection.c   |   3 +-
>  drivers/hv/hv.c           |  52 ++++++++++++++++++
>  drivers/hv/hv_trace.h     |  15 ++++++
>  drivers/hv/vmbus_drv.c    |   4 +-
>  include/linux/hyperv.h    |  13 ++++-
>  7 files changed, 218 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
> index fbdda9938039a..6801d89a20051 100644
> --- a/drivers/hv/channel.c
> +++ b/drivers/hv/channel.c
> @@ -209,31 +209,107 @@ int vmbus_send_tl_connect_request(const guid_t
> *shv_guest_servie_id,
>  }
>  EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
> 
> +static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp)
> +{
> +	struct vmbus_channel_modifychannel msg;
> +	int ret;
> +
> +	memset(&msg, 0, sizeof(msg));
> +	msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
> +	msg.child_relid = channel->offermsg.child_relid;
> +	msg.target_vp = target_vp;
> +
> +	ret = vmbus_post_msg(&msg, sizeof(msg), true);
> +	trace_vmbus_send_modifychannel(&msg, ret);
> +
> +	return ret;
> +}
> +
> +static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp)
> +{
> +	struct vmbus_channel_modifychannel *msg;
> +	struct vmbus_channel_msginfo *info;
> +	unsigned long flags;
> +	int ret;
> +
> +	info = kzalloc(sizeof(struct vmbus_channel_msginfo) +
> +				sizeof(struct vmbus_channel_modifychannel),
> +		       GFP_KERNEL);
> +	if (!info)
> +		return -ENOMEM;
> +
> +	init_completion(&info->waitevent);
> +	info->waiting_channel = channel;
> +
> +	msg = (struct vmbus_channel_modifychannel *)info->msg;
> +	msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL;
> +	msg->child_relid = channel->offermsg.child_relid;
> +	msg->target_vp = target_vp;
> +
> +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
> +	list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list);
> +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
> +
> +	if (channel->rescind) {
> +		ret = -ENODEV;
> +		goto free_info;
> +	}

Does the above check really do anything?  Such checks are sprinkled throughout
the VMbus code, and I've always questioned their efficacy.  The rescind flag can
be set without holding the channel_mutex, so as soon as the above code tests
the flag, it seems like it could change.

> +
> +	ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_modifychannel), true);

Use "sizeof(*msg)" instead?

> +	trace_vmbus_send_modifychannel(msg, ret);
> +	if (ret != 0)
> +		goto clean_msglist;
> +
> +	/*
> +	 * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on
> +	 * the mutex and be unable to signal the completion.
> +	 */
> +	mutex_unlock(&vmbus_connection.channel_mutex);
> +	wait_for_completion(&info->waitevent);
> +	mutex_lock(&vmbus_connection.channel_mutex);

The calling function, target_cpu_store(), obtains the mutex to synchronize against
channel closure.  Does releasing the mutex here still ensure the needed synchronization?
If so, some comments explaining why could be helpful.  I didn't try to reason through it
all, so I'm not sure.

> +
> +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
> +	list_del(&info->msglistentry);
> +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
> +
> +	if (channel->rescind) {
> +		ret = -ENODEV;
> +		goto free_info;
> +	}

Again, I'm wondering if the above is actually useful.

> +
> +	if (info->response.modify_response.status) {

I'm thinking that current Hyper-V never sends a non-zero status.  But it's good
to check in case of future changes.  The implication is that a non-zero status means
that the request to change the target CPU was not performed, correct?

> +		kfree(info);
> +		return -EAGAIN;
> +	}
> +
> +	kfree(info);
> +	return 0;
> +
> +clean_msglist:
> +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
> +	list_del(&info->msglistentry);
> +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);

The error handling seems more complex than needed.  The "clean_msglist" label
is only called from one place, so the above code could go inline at that place.

> +free_info:
> +	kfree(info);
> +	return ret;

More error handling:  The kfree(info) call is done in three different places.  With
consistent use of the 'ret' local variable, only one place would be needed.

> +}
> +
>  /*
>   * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
>   *
>   * CHANNELMSG_MODIFYCHANNEL messages are aynchronous.  Also, Hyper-V does not
> - * ACK such messages.  IOW we can't know when the host will stop interrupting
> - * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
> + * ACK such messages before VERSION_WIN10_V5_3.  Without ACK, we can not know
> + * when the host will stop interrupting the "old" vCPU and start interrupting
> + * the "new" vCPU for the given channel.

In the interest of clarity, make the above comment slightly more explicit:  When VMbus
version 5.3 or later is negotiated, Hyper-V always sends an ACK in response to
CHANNELMSG_MODIFYCHANNEL.  For VMbus version 5.2 and earlier, it never sends an ACK.

>   *
>   * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
>   * VERSION_WIN10_V4_1.
>   */
> -int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
> +int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
>  {
> -	struct vmbus_channel_modifychannel conn_msg;
> -	int ret;
> -
> -	memset(&conn_msg, 0, sizeof(conn_msg));
> -	conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
> -	conn_msg.child_relid = child_relid;
> -	conn_msg.target_vp = target_vp;
> -
> -	ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true);
> -
> -	trace_vmbus_send_modifychannel(&conn_msg, ret);
> -
> -	return ret;
> +	if (vmbus_proto_version >= VERSION_WIN10_V5_3)
> +		return send_modifychannel_with_ack(channel, target_vp);
> +	return send_modifychannel_without_ack(channel, target_vp);
>  }
>  EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
> 
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index 1d44bb635bb84..8fcb66d623ba4 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -1200,6 +1200,46 @@ static void vmbus_onopen_result(struct
> vmbus_channel_message_header *hdr)
>  	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
>  }
> 
> +/*
> + * vmbus_onmodifychannel_response - Modify Channel response handler.
> + *
> + * This is invoked when we received a response to our channel modify request.
> + * Find the matching request, copy the response and signal the requesting thread.
> + */
> +static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr)
> +{
> +	struct vmbus_channel_modifychannel_response *response;
> +	struct vmbus_channel_msginfo *msginfo;
> +	unsigned long flags;
> +
> +	response = (struct vmbus_channel_modifychannel_response *)hdr;
> +
> +	trace_vmbus_onmodifychannel_response(response);
> +
> +	/*
> +	 * Find the modify msg, copy the response and signal/unblock the wait event.
> +	 */
> +	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
> +
> +	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) {
> +		struct vmbus_channel_message_header *responseheader =
> +				(struct vmbus_channel_message_header *)msginfo->msg;
> +
> +		if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) {
> +			struct vmbus_channel_modifychannel *modifymsg;
> +
> +			modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg;
> +			if (modifymsg->child_relid == response->child_relid) {
> +				memcpy(&msginfo->response.modify_response, response,
> +				       sizeof(struct vmbus_channel_modifychannel_response));

Use "sizeof(*response)" ?

> +				complete(&msginfo->waitevent);
> +				break;
> +			}
> +		}
> +	}
> +	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
> +}
> +
>  /*
>   * vmbus_ongpadl_created - GPADL created handler.
>   *
> @@ -1366,6 +1406,8 @@ channel_message_table[CHANNELMSG_COUNT] = {
>  	{ CHANNELMSG_TL_CONNECT_REQUEST,	0, NULL, 0},
>  	{ CHANNELMSG_MODIFYCHANNEL,		0, NULL, 0},
>  	{ CHANNELMSG_TL_CONNECT_RESULT,		0, NULL, 0},
> +	{ CHANNELMSG_MODIFYCHANNEL_RESPONSE,	1,
> vmbus_onmodifychannel_response,
> +		sizeof(struct vmbus_channel_modifychannel_response)},
>  };
> 
>  /*
> diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
> index 11170d9a2e1a5..cdf41c504d914 100644
> --- a/drivers/hv/connection.c
> +++ b/drivers/hv/connection.c
> @@ -45,6 +45,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
>   * Table of VMBus versions listed from newest to oldest.
>   */
>  static __u32 vmbus_versions[] = {
> +	VERSION_WIN10_V5_3,
>  	VERSION_WIN10_V5_2,
>  	VERSION_WIN10_V5_1,
>  	VERSION_WIN10_V5,
> @@ -60,7 +61,7 @@ static __u32 vmbus_versions[] = {
>   * Maximal VMBus protocol version guests can negotiate.  Useful to cap the
>   * VMBus version for testing and debugging purpose.
>   */
> -static uint max_version = VERSION_WIN10_V5_2;
> +static uint max_version = VERSION_WIN10_V5_3;
> 
>  module_param(max_version, uint, S_IRUGO);
>  MODULE_PARM_DESC(max_version,
> diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
> index 0cde10fe0e71f..35f240f4c833e 100644
> --- a/drivers/hv/hv.c
> +++ b/drivers/hv/hv.c
> @@ -16,6 +16,7 @@
>  #include <linux/version.h>
>  #include <linux/random.h>
>  #include <linux/clockchips.h>
> +#include <linux/delay.h>
>  #include <clocksource/hyperv_timer.h>
>  #include <asm/mshyperv.h>
>  #include "hyperv_vmbus.h"
> @@ -237,6 +238,40 @@ void hv_synic_disable_regs(unsigned int cpu)
>  	hv_set_synic_state(sctrl.as_uint64);
>  }
> 
> +#define HV_MAX_TRIES 3
> +/*
> + * Scan the event flags page of 'this' CPU looking for any bit that is set.  If we find one
> + * bit set, then wait for a few milliseconds.  Repeat these steps for a maximum of 3 times.

A bit more comment here might be warranted.  If a bit is set, that means there is a
pending channel interrupt.  The expectation is that the normal interrupt handling
mechanism will find and process the channel interrupt "very soon", and in the process
clear the bit.   Since Hyper-V won't be setting any additional channel interrupt bits, we
should very soon reach a state where no bits are set.

> + * Return 'true', if there is still any set bit after this operation; 'false', otherwise.
> + */
> +static bool hv_synic_event_pending(void)
> +{
> +	struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
> +	union hv_synic_event_flags *event =
> +		(union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
> +	unsigned long *recv_int_page = event->flags;

I think a comment is warranted here.  The similar vmbus_chan_sched() code has two ways
to get the recv_int_page, depending on the negotiated VMbus protocol version.  This code
assumes the "new" way to get the recv_int_page, which makes sense given that it is invoked
only for newer VMbus protocol versions.   But a note about that assumption would be a good
warning for future readers/coders.

> +	bool pending;
> +	u32 relid;
> +	int tries = 0;
> +
> +retry:
> +	pending = false;
> +	for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
> +		/* Special case - VMBus channel protocol messages */
> +		if (relid == 0)
> +			continue;
> +		if (sync_test_bit(relid, recv_int_page)) {
> +			pending = true;
> +			break;

I'm not clear on why the above test is needed.  If the bit was found to be set
by for_each_set_bit(), that seems good enough to set pending=true.

> +		}
> +	}
> +	if (pending && tries++ < HV_MAX_TRIES) {
> +		usleep_range(10000, 20000);
> +		goto retry;
> +	}
> +	return pending;
> +}
> +
>  int hv_synic_cleanup(unsigned int cpu)
>  {
>  	struct vmbus_channel *channel, *sc;
> @@ -276,6 +311,23 @@ int hv_synic_cleanup(unsigned int cpu)
>  	if (channel_found && vmbus_connection.conn_state == CONNECTED)
>  		return -EBUSY;
> 
> +	if (vmbus_proto_version >= VERSION_WIN10_V5_3) {
> +		/*
> +		 * channel_found == false means that any channels that were previously
> +		 * assigned to the CPU have been reassigned elsewhere.  Since we have
> +		 * received a ModifyChannel ACK from Hyper-V for all such reassignments,
> +		 * we know that Hyper-V won't set any new bits in the event flags page.
> +		 * However, there may be existing bits set in this page that have not
> +		 * been processed by vmbus_chan_sched().  We scan the event flags page
> +		 * looking for any bits that are set and waiting (with a timeout) for
> +		 * vmbus_chan_sched() to process such bits.  If bits are still set after
> +		 * this operation (and VMBus is connected), we fail the CPU offlining
> +		 * operation.
> +		 */
> +		if (hv_synic_event_pending() && vmbus_connection.conn_state ==
> CONNECTED)
> +			return -EBUSY;
> +	}
> +
>  	hv_stimer_legacy_cleanup(cpu);
> 
>  	hv_synic_disable_regs(cpu);
> diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h
> index 6063bb21bb137..3e83c24856dbe 100644
> --- a/drivers/hv/hv_trace.h
> +++ b/drivers/hv/hv_trace.h
> @@ -86,6 +86,21 @@ TRACE_EVENT(vmbus_onopen_result,
>  		    )
>  	);
> 
> +TRACE_EVENT(vmbus_onmodifychannel_response,
> +	    TP_PROTO(const struct vmbus_channel_modifychannel_response *response),
> +	    TP_ARGS(response),
> +	    TP_STRUCT__entry(
> +		    __field(u32, child_relid)
> +		    __field(u32, status)
> +		    ),
> +	    TP_fast_assign(__entry->child_relid = response->child_relid;
> +			   __entry->status = response->status;
> +		    ),
> +	    TP_printk("child_relid 0x%x, status %d",
> +		      __entry->child_relid,  __entry->status
> +		    )
> +	);
> +
>  TRACE_EVENT(vmbus_ongpadl_created,
>  	    TP_PROTO(const struct vmbus_channel_gpadl_created *gpadlcreated),
>  	    TP_ARGS(gpadlcreated),
> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 4fad3e6745e53..3e1cd5e8f769e 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -1767,13 +1767,15 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
>  	if (target_cpu == origin_cpu)
>  		goto cpu_store_unlock;
> 
> -	if (vmbus_send_modifychannel(channel->offermsg.child_relid,
> +	if (vmbus_send_modifychannel(channel,
>  				     hv_cpu_number_to_vp_number(target_cpu))) {
>  		ret = -EIO;
>  		goto cpu_store_unlock;
>  	}
> 
>  	/*
> +	 * For version before VERSION_WIN10_V5_3, the following warning holds:
> +	 *
>  	 * Warning.  At this point, there is *no* guarantee that the host will
>  	 * have successfully processed the vmbus_send_modifychannel() request.
>  	 * See the header comment of vmbus_send_modifychannel() for more info.
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index 1ce131f29f3b4..808acf4c3fe61 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -234,6 +234,7 @@ static inline u32 hv_get_avail_to_write_percent(
>   * 5 . 0  (Newer Windows 10)
>   * 5 . 1  (Windows 10 RS4)
>   * 5 . 2  (Windows Server 2019, RS5)
> + * 5 . 3  (Windows Server 2021) // FIXME: use proper version number/name
>   */
> 
>  #define VERSION_WS2008  ((0 << 16) | (13))
> @@ -245,6 +246,7 @@ static inline u32 hv_get_avail_to_write_percent(
>  #define VERSION_WIN10_V5 ((5 << 16) | (0))
>  #define VERSION_WIN10_V5_1 ((5 << 16) | (1))
>  #define VERSION_WIN10_V5_2 ((5 << 16) | (2))
> +#define VERSION_WIN10_V5_3 ((5 << 16) | (3))
> 
>  /* Make maximum size of pipe payload of 16K */
>  #define MAX_PIPE_DATA_PAYLOAD		(sizeof(u8) * 16384)
> @@ -475,6 +477,7 @@ enum vmbus_channel_message_type {
>  	CHANNELMSG_TL_CONNECT_REQUEST		= 21,
>  	CHANNELMSG_MODIFYCHANNEL		= 22,
>  	CHANNELMSG_TL_CONNECT_RESULT		= 23,
> +	CHANNELMSG_MODIFYCHANNEL_RESPONSE	= 24,
>  	CHANNELMSG_COUNT
>  };
> 
> @@ -588,6 +591,13 @@ struct vmbus_channel_open_result {
>  	u32 status;
>  } __packed;
> 
> +/* Modify Channel Result parameters */
> +struct vmbus_channel_modifychannel_response {
> +	struct vmbus_channel_message_header header;
> +	u32 child_relid;
> +	u32 status;
> +} __packed;
> +
>  /* Close channel parameters; */
>  struct vmbus_channel_close_channel {
>  	struct vmbus_channel_message_header header;
> @@ -720,6 +730,7 @@ struct vmbus_channel_msginfo {
>  		struct vmbus_channel_gpadl_torndown gpadl_torndown;
>  		struct vmbus_channel_gpadl_created gpadl_created;
>  		struct vmbus_channel_version_response version_response;
> +		struct vmbus_channel_modifychannel_response modify_response;
>  	} response;
> 
>  	u32 msgsize;
> @@ -1562,7 +1573,7 @@ extern __u32 vmbus_proto_version;
> 
>  int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
>  				  const guid_t *shv_host_servie_id);
> -int vmbus_send_modifychannel(u32 child_relid, u32 target_vp);
> +int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp);
>  void vmbus_set_event(struct vmbus_channel *channel);
> 
>  /* Get the start of the ring buffer. */
> --
> 2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ