linux-kernel - Re: [PATCH net-next 1/2] vhost: basic in order support

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f9cc1fe1-79cd-40ec-8fa9-88a14f1b2ca1@oracle.com>
Date: Wed, 9 Jul 2025 12:38:23 -0400
From: Jonah Palmer <jonah.palmer@...cle.com>
To: Jason Wang <jasowang@...hat.com>, mst@...hat.com, eperezma@...hat.com
Cc: kvm@...r.kernel.org, virtualization@...ts.linux.dev,
        netdev@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH net-next 1/2] vhost: basic in order support



On 7/8/25 2:48 AM, Jason Wang wrote:
> This patch adds basic in order support for vhost. Two optimizations
> are implemented in this patch:
> 
> 1) Since driver uses descriptor in order, vhost can deduce the next
>     avail ring head by counting the number of descriptors that has been
>     used in next_avail_head. This eliminate the need to access the
>     available ring in vhost.
> 
> 2) vhost_add_used_and_singal_n() is extended to accept the number of
>     batched buffers per used elem. While this increases the times of
>     usersapce memory access but it helps to reduce the chance of
>     used ring access of both the driver and vhost.
> 
> Vhost-net will be the first user for this.

Acked-by: Jonah Palmer <jonah.palmer@...cle.com>

> 
> Signed-off-by: Jason Wang <jasowang@...hat.com>
> ---
>   drivers/vhost/net.c   |   6 ++-
>   drivers/vhost/vhost.c | 121 +++++++++++++++++++++++++++++++++++-------
>   drivers/vhost/vhost.h |   8 ++-
>   3 files changed, 111 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 7cbfc7d718b3..4f9c67f17b49 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -374,7 +374,8 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
>   	while (j) {
>   		add = min(UIO_MAXIOV - nvq->done_idx, j);
>   		vhost_add_used_and_signal_n(vq->dev, vq,
> -					    &vq->heads[nvq->done_idx], add);
> +					    &vq->heads[nvq->done_idx],
> +					    NULL, add);
>   		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
>   		j -= add;
>   	}
> @@ -457,7 +458,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
>   	if (!nvq->done_idx)
>   		return;
>   
> -	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
> +	vhost_add_used_and_signal_n(dev, vq, vq->heads, NULL,
> +				    nvq->done_idx);
>   	nvq->done_idx = 0;
>   }
>   
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 3a5ebb973dba..c7ed069fc49e 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -364,6 +364,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>   	vq->avail = NULL;
>   	vq->used = NULL;
>   	vq->last_avail_idx = 0;
> +	vq->next_avail_head = 0;
>   	vq->avail_idx = 0;
>   	vq->last_used_idx = 0;
>   	vq->signalled_used = 0;
> @@ -455,6 +456,8 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
>   	vq->log = NULL;
>   	kfree(vq->heads);
>   	vq->heads = NULL;
> +	kfree(vq->nheads);
> +	vq->nheads = NULL;
>   }
>   
>   /* Helper to allocate iovec buffers for all vqs. */
> @@ -472,7 +475,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>   					GFP_KERNEL);
>   		vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
>   					  GFP_KERNEL);
> -		if (!vq->indirect || !vq->log || !vq->heads)
> +		vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads),
> +					   GFP_KERNEL);
> +		if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads)
>   			goto err_nomem;
>   	}
>   	return 0;
> @@ -1990,14 +1995,15 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
>   			break;
>   		}
>   		if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) {
> -			vq->last_avail_idx = s.num & 0xffff;
> +			vq->next_avail_head = vq->last_avail_idx =
> +					      s.num & 0xffff;
>   			vq->last_used_idx = (s.num >> 16) & 0xffff;
>   		} else {
>   			if (s.num > 0xffff) {
>   				r = -EINVAL;
>   				break;
>   			}
> -			vq->last_avail_idx = s.num;
> +			vq->next_avail_head = vq->last_avail_idx = s.num;
>   		}
>   		/* Forget the cached index value. */
>   		vq->avail_idx = vq->last_avail_idx;
> @@ -2590,11 +2596,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>   		      unsigned int *out_num, unsigned int *in_num,
>   		      struct vhost_log *log, unsigned int *log_num)
>   {
> +	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
>   	struct vring_desc desc;
>   	unsigned int i, head, found = 0;
>   	u16 last_avail_idx = vq->last_avail_idx;
>   	__virtio16 ring_head;
> -	int ret, access;
> +	int ret, access, c = 0;
>   
>   	if (vq->avail_idx == vq->last_avail_idx) {
>   		ret = vhost_get_avail_idx(vq);
> @@ -2605,17 +2612,21 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>   			return vq->num;
>   	}
>   
> -	/* Grab the next descriptor number they're advertising, and increment
> -	 * the index we've seen. */
> -	if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> -		vq_err(vq, "Failed to read head: idx %d address %p\n",
> -		       last_avail_idx,
> -		       &vq->avail->ring[last_avail_idx % vq->num]);
> -		return -EFAULT;
> +	if (in_order)
> +		head = vq->next_avail_head & (vq->num - 1);
> +	else {
> +		/* Grab the next descriptor number they're
> +		 * advertising, and increment the index we've seen. */
> +		if (unlikely(vhost_get_avail_head(vq, &ring_head,
> +						  last_avail_idx))) {
> +			vq_err(vq, "Failed to read head: idx %d address %p\n",
> +				last_avail_idx,
> +				&vq->avail->ring[last_avail_idx % vq->num]);
> +			return -EFAULT;
> +		}
> +		head = vhost16_to_cpu(vq, ring_head);
>   	}
>   
> -	head = vhost16_to_cpu(vq, ring_head);
> -
>   	/* If their number is silly, that's an error. */
>   	if (unlikely(head >= vq->num)) {
>   		vq_err(vq, "Guest says index %u > %u is available",
> @@ -2658,6 +2669,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>   						"in indirect descriptor at idx %d\n", i);
>   				return ret;
>   			}
> +			++c;
>   			continue;
>   		}
>   
> @@ -2693,10 +2705,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>   			}
>   			*out_num += ret;
>   		}
> +		++c;
>   	} while ((i = next_desc(vq, &desc)) != -1);
>   
>   	/* On success, increment avail index. */
>   	vq->last_avail_idx++;
> +	vq->next_avail_head += c;
>   
>   	/* Assume notifications from guest are disabled at this point,
>   	 * if they aren't we would need to update avail_event index. */
> @@ -2720,8 +2734,9 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
>   		cpu_to_vhost32(vq, head),
>   		cpu_to_vhost32(vq, len)
>   	};
> +	u16 nheads = 1;
>   
> -	return vhost_add_used_n(vq, &heads, 1);
> +	return vhost_add_used_n(vq, &heads, &nheads, 1);
>   }
>   EXPORT_SYMBOL_GPL(vhost_add_used);
>   
> @@ -2757,10 +2772,10 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
>   	return 0;
>   }
>   
> -/* After we've used one of their buffers, we tell them about it.  We'll then
> - * want to notify the guest, using eventfd. */
> -int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> -		     unsigned count)
> +static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq,
> +				struct vring_used_elem *heads,
> +				u16 *nheads,
> +				unsigned count)
>   {
>   	int start, n, r;
>   
> @@ -2775,6 +2790,70 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
>   	}
>   	r = __vhost_add_used_n(vq, heads, count);
>   
> +	return r;
> +}
> +
> +static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq,
> +				     struct vring_used_elem *heads,
> +				     u16 *nheads,
> +				     unsigned count)
> +{
> +	vring_used_elem_t __user *used;
> +	u16 old, new = vq->last_used_idx;
> +	int start, i;
> +
> +	if (!nheads)
> +		return -EINVAL;
> +
> +	start = vq->last_used_idx & (vq->num - 1);
> +	used = vq->used->ring + start;
> +
> +	for (i = 0; i < count; i++) {
> +		if (vhost_put_used(vq, &heads[i], start, 1)) {
> +			vq_err(vq, "Failed to write used");
> +			return -EFAULT;
> +		}
> +		start += nheads[i];
> +		new += nheads[i];
> +		if (start >= vq->num)
> +			start -= vq->num;
> +	}
> +
> +	if (unlikely(vq->log_used)) {
> +		/* Make sure data is seen before log. */
> +		smp_wmb();
> +		/* Log used ring entry write. */
> +		log_used(vq, ((void __user *)used - (void __user *)vq->used),
> +			 (vq->num - start) * sizeof *used);
> +		if (start + count > vq->num)
> +			log_used(vq, 0,
> +				 (start + count - vq->num) * sizeof *used);
> +	}
> +
> +	old = vq->last_used_idx;
> +	vq->last_used_idx = new;
> +	/* If the driver never bothers to signal in a very long while,
> +	 * used index might wrap around. If that happens, invalidate
> +	 * signalled_used index we stored. TODO: make sure driver
> +	 * signals at least once in 2^16 and remove this. */
> +	if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old)))
> +		vq->signalled_used_valid = false;
> +	return 0;
> +}
> +
> +/* After we've used one of their buffers, we tell them about it.  We'll then
> + * want to notify the guest, using eventfd. */
> +int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
> +		     u16 *nheads, unsigned count)
> +{
> +	bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
> +	int r;
> +
> +	if (!in_order || !nheads)
> +		r = vhost_add_used_n_ooo(vq, heads, nheads, count);
> +	else
> +		r = vhost_add_used_n_in_order(vq, heads, nheads, count);
> +
>   	/* Make sure buffer is written before we update index. */
>   	smp_wmb();
>   	if (vhost_put_used_idx(vq)) {
> @@ -2853,9 +2932,11 @@ EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
>   /* multi-buffer version of vhost_add_used_and_signal */
>   void vhost_add_used_and_signal_n(struct vhost_dev *dev,
>   				 struct vhost_virtqueue *vq,
> -				 struct vring_used_elem *heads, unsigned count)
> +				 struct vring_used_elem *heads,
> +				 u16 *nheads,
> +				 unsigned count)
>   {
> -	vhost_add_used_n(vq, heads, count);
> +	vhost_add_used_n(vq, heads, nheads, count);
>   	vhost_signal(dev, vq);
>   }
>   EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index bb75a292d50c..dca9f309d396 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -103,6 +103,8 @@ struct vhost_virtqueue {
>   	 * Values are limited to 0x7fff, and the high bit is used as
>   	 * a wrap counter when using VIRTIO_F_RING_PACKED. */
>   	u16 last_avail_idx;
> +	/* Next avail ring head when VIRTIO_F_IN_ORDER is neogitated */
> +	u16 next_avail_head;
>   
>   	/* Caches available index value from user. */
>   	u16 avail_idx;
> @@ -129,6 +131,7 @@ struct vhost_virtqueue {
>   	struct iovec iotlb_iov[64];
>   	struct iovec *indirect;
>   	struct vring_used_elem *heads;
> +	u16 *nheads;
>   	/* Protected by virtqueue mutex. */
>   	struct vhost_iotlb *umem;
>   	struct vhost_iotlb *iotlb;
> @@ -213,11 +216,12 @@ bool vhost_vq_is_setup(struct vhost_virtqueue *vq);
>   int vhost_vq_init_access(struct vhost_virtqueue *);
>   int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
>   int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> -		     unsigned count);
> +		     u16 *nheads, unsigned count);
>   void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
>   			       unsigned int id, int len);
>   void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
> -			       struct vring_used_elem *heads, unsigned count);
> +				 struct vring_used_elem *heads, u16 *nheads,
> +				 unsigned count);
>   void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
>   void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
>   bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);