netdev - RE: [PATCH V3 12/13] HV/Netvsc: Add Isolation VM support for netvsc driver

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <MWHPR21MB15936FE72E65A62FBA3EF4F2D7C09@MWHPR21MB1593.namprd21.prod.outlook.com>
Date:   Thu, 19 Aug 2021 18:14:51 +0000
From:   Michael Kelley <mikelley@...rosoft.com>
To:     Tianyu Lan <ltykernel@...il.com>,
        KY Srinivasan <kys@...rosoft.com>,
        Haiyang Zhang <haiyangz@...rosoft.com>,
        Stephen Hemminger <sthemmin@...rosoft.com>,
        "wei.liu@...nel.org" <wei.liu@...nel.org>,
        Dexuan Cui <decui@...rosoft.com>,
        "tglx@...utronix.de" <tglx@...utronix.de>,
        "mingo@...hat.com" <mingo@...hat.com>,
        "bp@...en8.de" <bp@...en8.de>, "x86@...nel.org" <x86@...nel.org>,
        "hpa@...or.com" <hpa@...or.com>,
        "dave.hansen@...ux.intel.com" <dave.hansen@...ux.intel.com>,
        "luto@...nel.org" <luto@...nel.org>,
        "peterz@...radead.org" <peterz@...radead.org>,
        "konrad.wilk@...cle.com" <konrad.wilk@...cle.com>,
        "boris.ostrovsky@...cle.com" <boris.ostrovsky@...cle.com>,
        "jgross@...e.com" <jgross@...e.com>,
        "sstabellini@...nel.org" <sstabellini@...nel.org>,
        "joro@...tes.org" <joro@...tes.org>,
        "will@...nel.org" <will@...nel.org>,
        "davem@...emloft.net" <davem@...emloft.net>,
        "kuba@...nel.org" <kuba@...nel.org>,
        "jejb@...ux.ibm.com" <jejb@...ux.ibm.com>,
        "martin.petersen@...cle.com" <martin.petersen@...cle.com>,
        "arnd@...db.de" <arnd@...db.de>, "hch@....de" <hch@....de>,
        "m.szyprowski@...sung.com" <m.szyprowski@...sung.com>,
        "robin.murphy@....com" <robin.murphy@....com>,
        "thomas.lendacky@....com" <thomas.lendacky@....com>,
        "brijesh.singh@....com" <brijesh.singh@....com>,
        "ardb@...nel.org" <ardb@...nel.org>,
        Tianyu Lan <Tianyu.Lan@...rosoft.com>,
        "pgonda@...gle.com" <pgonda@...gle.com>,
        "martin.b.radev@...il.com" <martin.b.radev@...il.com>,
        "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
        "kirill.shutemov@...ux.intel.com" <kirill.shutemov@...ux.intel.com>,
        "rppt@...nel.org" <rppt@...nel.org>,
        "sfr@...b.auug.org.au" <sfr@...b.auug.org.au>,
        "saravanand@...com" <saravanand@...com>,
        "krish.sadhukhan@...cle.com" <krish.sadhukhan@...cle.com>,
        "aneesh.kumar@...ux.ibm.com" <aneesh.kumar@...ux.ibm.com>,
        "xen-devel@...ts.xenproject.org" <xen-devel@...ts.xenproject.org>,
        "rientjes@...gle.com" <rientjes@...gle.com>,
        "hannes@...xchg.org" <hannes@...xchg.org>,
        "tj@...nel.org" <tj@...nel.org>
CC:     "iommu@...ts.linux-foundation.org" <iommu@...ts.linux-foundation.org>,
        "linux-arch@...r.kernel.org" <linux-arch@...r.kernel.org>,
        "linux-hyperv@...r.kernel.org" <linux-hyperv@...r.kernel.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "linux-scsi@...r.kernel.org" <linux-scsi@...r.kernel.org>,
        "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        vkuznets <vkuznets@...hat.com>,
        "parri.andrea@...il.com" <parri.andrea@...il.com>,
        "dave.hansen@...el.com" <dave.hansen@...el.com>
Subject: RE: [PATCH V3 12/13] HV/Netvsc: Add Isolation VM support for netvsc
 driver

From: Tianyu Lan <ltykernel@...il.com> Sent: Monday, August 9, 2021 10:56 AM
> 

The Subject line tag should be "hv_netvsc:".

> In Isolation VM, all shared memory with host needs to mark visible
> to host via hvcall. vmbus_establish_gpadl() has already done it for
> netvsc rx/tx ring buffer. The page buffer used by vmbus_sendpacket_
> pagebuffer() still need to handle. Use DMA API to map/umap these
> memory during sending/receiving packet and Hyper-V DMA ops callback
> will use swiotlb function to allocate bounce buffer and copy data
> from/to bounce buffer.
> 
> Signed-off-by: Tianyu Lan <Tianyu.Lan@...rosoft.com>
> ---
>  drivers/net/hyperv/hyperv_net.h   |   6 ++
>  drivers/net/hyperv/netvsc.c       | 144 +++++++++++++++++++++++++++++-
>  drivers/net/hyperv/rndis_filter.c |   2 +
>  include/linux/hyperv.h            |   5 ++
>  4 files changed, 154 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index bc48855dff10..862419912bfb 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -164,6 +164,7 @@ struct hv_netvsc_packet {
>  	u32 total_bytes;
>  	u32 send_buf_index;
>  	u32 total_data_buflen;
> +	struct hv_dma_range *dma_range;
>  };
> 
>  #define NETVSC_HASH_KEYLEN 40
> @@ -1074,6 +1075,7 @@ struct netvsc_device {
> 
>  	/* Receive buffer allocated by us but manages by NetVSP */
>  	void *recv_buf;
> +	void *recv_original_buf;
>  	u32 recv_buf_size; /* allocated bytes */
>  	u32 recv_buf_gpadl_handle;
>  	u32 recv_section_cnt;
> @@ -1082,6 +1084,8 @@ struct netvsc_device {
> 
>  	/* Send buffer allocated by us */
>  	void *send_buf;
> +	void *send_original_buf;
> +	u32 send_buf_size;
>  	u32 send_buf_gpadl_handle;
>  	u32 send_section_cnt;
>  	u32 send_section_size;
> @@ -1730,4 +1734,6 @@ struct rndis_message {
>  #define RETRY_US_HI	10000
>  #define RETRY_MAX	2000	/* >10 sec */
> 
> +void netvsc_dma_unmap(struct hv_device *hv_dev,
> +		      struct hv_netvsc_packet *packet);
>  #endif /* _HYPERV_NET_H */
> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index 7bd935412853..fc312e5db4d5 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -153,8 +153,21 @@ static void free_netvsc_device(struct rcu_head *head)
>  	int i;
> 
>  	kfree(nvdev->extension);
> -	vfree(nvdev->recv_buf);
> -	vfree(nvdev->send_buf);
> +
> +	if (nvdev->recv_original_buf) {
> +		vunmap(nvdev->recv_buf);
> +		vfree(nvdev->recv_original_buf);
> +	} else {
> +		vfree(nvdev->recv_buf);
> +	}
> +
> +	if (nvdev->send_original_buf) {
> +		vunmap(nvdev->send_buf);
> +		vfree(nvdev->send_original_buf);
> +	} else {
> +		vfree(nvdev->send_buf);
> +	}
> +
>  	kfree(nvdev->send_section_map);
> 
>  	for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
> @@ -330,6 +343,27 @@ int netvsc_alloc_recv_comp_ring(struct netvsc_device *net_device, u32 q_idx)
>  	return nvchan->mrc.slots ? 0 : -ENOMEM;
>  }
> 
> +static void *netvsc_remap_buf(void *buf, unsigned long size)
> +{
> +	unsigned long *pfns;
> +	void *vaddr;
> +	int i;
> +
> +	pfns = kcalloc(size / HV_HYP_PAGE_SIZE, sizeof(unsigned long),
> +		       GFP_KERNEL);

This assumes that the "size" argument is a multiple of PAGE_SIZE.  I think
that's true in all the use cases, but it would be safer to check.

> +	if (!pfns)
> +		return NULL;
> +
> +	for (i = 0; i < size / HV_HYP_PAGE_SIZE; i++)
> +		pfns[i] = virt_to_hvpfn(buf + i * HV_HYP_PAGE_SIZE)
> +			+ (ms_hyperv.shared_gpa_boundary >> HV_HYP_PAGE_SHIFT);
> +
> +	vaddr = vmap_pfn(pfns, size / HV_HYP_PAGE_SIZE, PAGE_KERNEL_IO);
> +	kfree(pfns);
> +
> +	return vaddr;
> +}

This function appears to be a duplicate of hv_map_memory() in Patch 11 of this
series.  Is it possible to structure things so there is only one implementation?  In
any case, see the comment in hv_map_memory() about PAGE_SIZE vs
HV_HYP_PAGE_SIZE and similar.

> +
>  static int netvsc_init_buf(struct hv_device *device,
>  			   struct netvsc_device *net_device,
>  			   const struct netvsc_device_info *device_info)
> @@ -340,6 +374,7 @@ static int netvsc_init_buf(struct hv_device *device,
>  	unsigned int buf_size;
>  	size_t map_words;
>  	int i, ret = 0;
> +	void *vaddr;
> 
>  	/* Get receive buffer area. */
>  	buf_size = device_info->recv_sections * device_info->recv_section_size;
> @@ -375,6 +410,15 @@ static int netvsc_init_buf(struct hv_device *device,
>  		goto cleanup;
>  	}
> 
> +	if (hv_isolation_type_snp()) {
> +		vaddr = netvsc_remap_buf(net_device->recv_buf, buf_size);
> +		if (!vaddr)
> +			goto cleanup;
> +
> +		net_device->recv_original_buf = net_device->recv_buf;
> +		net_device->recv_buf = vaddr;
> +	}
> +
>  	/* Notify the NetVsp of the gpadl handle */
>  	init_packet = &net_device->channel_init_pkt;
>  	memset(init_packet, 0, sizeof(struct nvsp_message));
> @@ -477,6 +521,15 @@ static int netvsc_init_buf(struct hv_device *device,
>  		goto cleanup;
>  	}
> 
> +	if (hv_isolation_type_snp()) {
> +		vaddr = netvsc_remap_buf(net_device->send_buf, buf_size);
> +		if (!vaddr)
> +			goto cleanup;

I don't think this error case is handled correctly.  Doesn't the remapping
of the recv buf need to be undone?

> +
> +		net_device->send_original_buf = net_device->send_buf;
> +		net_device->send_buf = vaddr;
> +	}
> +
>  	/* Notify the NetVsp of the gpadl handle */
>  	init_packet = &net_device->channel_init_pkt;
>  	memset(init_packet, 0, sizeof(struct nvsp_message));
> @@ -767,7 +820,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
> 
>  	/* Notify the layer above us */
>  	if (likely(skb)) {
> -		const struct hv_netvsc_packet *packet
> +		struct hv_netvsc_packet *packet
>  			= (struct hv_netvsc_packet *)skb->cb;
>  		u32 send_index = packet->send_buf_index;
>  		struct netvsc_stats *tx_stats;
> @@ -783,6 +836,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
>  		tx_stats->bytes += packet->total_bytes;
>  		u64_stats_update_end(&tx_stats->syncp);
> 
> +		netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
>  		napi_consume_skb(skb, budget);
>  	}
> 
> @@ -947,6 +1001,82 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device,
>  		memset(dest, 0, padding);
>  }
> 
> +void netvsc_dma_unmap(struct hv_device *hv_dev,
> +		      struct hv_netvsc_packet *packet)
> +{
> +	u32 page_count = packet->cp_partial ?
> +		packet->page_buf_cnt - packet->rmsg_pgcnt :
> +		packet->page_buf_cnt;
> +	int i;
> +
> +	if (!hv_is_isolation_supported())
> +		return;
> +
> +	if (!packet->dma_range)
> +		return;
> +
> +	for (i = 0; i < page_count; i++)
> +		dma_unmap_single(&hv_dev->device, packet->dma_range[i].dma,
> +				 packet->dma_range[i].mapping_size,
> +				 DMA_TO_DEVICE);
> +
> +	kfree(packet->dma_range);
> +}
> +
> +/* netvsc_dma_map - Map swiotlb bounce buffer with data page of
> + * packet sent by vmbus_sendpacket_pagebuffer() in the Isolation
> + * VM.
> + *
> + * In isolation VM, netvsc send buffer has been marked visible to
> + * host and so the data copied to send buffer doesn't need to use
> + * bounce buffer. The data pages handled by vmbus_sendpacket_pagebuffer()
> + * may not be copied to send buffer and so these pages need to be
> + * mapped with swiotlb bounce buffer. netvsc_dma_map() is to do
> + * that. The pfns in the struct hv_page_buffer need to be converted
> + * to bounce buffer's pfn. The loop here is necessary and so not
> + * use dma_map_sg() here.

I think I understand why the loop is necessary, but it would be
nice to add a bit more comment text to explain.  The reason is
that the entries in the page buffer array are not necessarily full
pages of data.  Each entry in the array has a separate offset and
len that may be non-zero, even for entries in the middle of the
array.   And the entries are not physically contiguous.  So each
entry must be individually mapped rather than as a contiguous unit.

> + */
> +int netvsc_dma_map(struct hv_device *hv_dev,
> +		   struct hv_netvsc_packet *packet,
> +		   struct hv_page_buffer *pb)
> +{
> +	u32 page_count =  packet->cp_partial ?
> +		packet->page_buf_cnt - packet->rmsg_pgcnt :
> +		packet->page_buf_cnt;
> +	dma_addr_t dma;
> +	int i;
> +
> +	if (!hv_is_isolation_supported())
> +		return 0;
> +
> +	packet->dma_range = kcalloc(page_count,
> +				    sizeof(*packet->dma_range),
> +				    GFP_KERNEL);
> +	if (!packet->dma_range)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < page_count; i++) {
> +		char *src = phys_to_virt((pb[i].pfn << HV_HYP_PAGE_SHIFT)
> +					 + pb[i].offset);
> +		u32 len = pb[i].len;
> +
> +		dma = dma_map_single(&hv_dev->device, src, len,
> +				     DMA_TO_DEVICE);
> +		if (dma_mapping_error(&hv_dev->device, dma)) {
> +			kfree(packet->dma_range);
> +			return -ENOMEM;
> +		}
> +
> +		packet->dma_range[i].dma = dma;
> +		packet->dma_range[i].mapping_size = len;
> +		pb[i].pfn = dma >> HV_HYP_PAGE_SHIFT;
> +		pb[i].offset = offset_in_hvpage(dma);
> +		pb[i].len = len;
> +	}
> +
> +	return 0;
> +}
> +
>  static inline int netvsc_send_pkt(
>  	struct hv_device *device,
>  	struct hv_netvsc_packet *packet,
> @@ -987,14 +1117,22 @@ static inline int netvsc_send_pkt(
> 
>  	trace_nvsp_send_pkt(ndev, out_channel, rpkt);
> 
> +	packet->dma_range = NULL;
>  	if (packet->page_buf_cnt) {
>  		if (packet->cp_partial)
>  			pb += packet->rmsg_pgcnt;
> 
> +		ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb);
> +		if (ret)
> +			return ret;

I think this error case needs to set things up so sending the packet
can be retried at the higher levels.  The typical error is that
swiotlb is out of bounce buffer memory.  That's a transient
condition.  There's already code in this function to retry when
the vmbus_sendpacket functions fails because the ring buffer
is full, and running out of bounce buffer memory should probably
take the same path.

> +
>  		ret = vmbus_sendpacket_pagebuffer(out_channel,
>  						  pb, packet->page_buf_cnt,
>  						  &nvmsg, sizeof(nvmsg),
>  						  req_id);
> +
> +		if (ret)
> +			netvsc_dma_unmap(ndev_ctx->device_ctx, packet);
>  	} else {
>  		ret = vmbus_sendpacket(out_channel,
>  				       &nvmsg, sizeof(nvmsg),
> diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
> index f6c9c2a670f9..448fcc325ed7 100644
> --- a/drivers/net/hyperv/rndis_filter.c
> +++ b/drivers/net/hyperv/rndis_filter.c
> @@ -361,6 +361,8 @@ static void rndis_filter_receive_response(struct net_device *ndev,
>  			}
>  		}
> 
> +		netvsc_dma_unmap(((struct net_device_context *)
> +			netdev_priv(ndev))->device_ctx, &request->pkt);
>  		complete(&request->wait_event);
>  	} else {
>  		netdev_err(ndev,
> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
> index 83fa567ad594..2ea638101645 100644
> --- a/include/linux/hyperv.h
> +++ b/include/linux/hyperv.h
> @@ -1601,6 +1601,11 @@ struct hyperv_service_callback {
>  	void (*callback)(void *context);
>  };
> 
> +struct hv_dma_range {
> +	dma_addr_t dma;
> +	u32 mapping_size;
> +};
> +
>  #define MAX_SRV_VER	0x7ffffff
>  extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
>  				const int *fw_version, int fw_vercnt,
> --
> 2.25.1