[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20100407113550.GA12408@redhat.com>
Date: Wed, 7 Apr 2010 14:35:50 +0300
From: "Michael S. Tsirkin" <mst@...hat.com>
To: David L Stevens <dlstevens@...ibm.com>
Cc: kvm@...r.kernel.org, netdev@...r.kernel.org, rusty@...tcorp.com.au,
virtualization@...ts.osdl.org
Subject: Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
Some corrections:
On Wed, Apr 07, 2010 at 01:59:10PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 06, 2010 at 01:32:53PM -0700, David L Stevens wrote:
> >
> > This patch adds support for the Mergeable Receive Buffers feature to
> > vhost_net.
> >
> > +-DLS
> >
> > Changes from previous revision:
> > 1) renamed:
> > vhost_discard_vq_desc -> vhost_discard_desc
> > vhost_get_heads -> vhost_get_desc_n
> > vhost_get_vq_desc -> vhost_get_desc
> > 2) added heads as argument to vhost_get_desc_n
> > 3) changed "vq->heads" from iovec to vring_used_elem, removed casts
> > 4) changed vhost_add_used to do multiple elements in a single
> > copy_to_user,
> > or two when we wrap the ring.
> > 5) removed rxmaxheadcount and available buffer checks in favor of
> > running until
> > an allocation failure, but making sure we break the loop if we get
> > two in a row, indicating we have at least 1 buffer, but not enough
> > for the current receive packet
> > 6) restore non-vnet header handling
> >
> > Signed-Off-By: David L Stevens <dlstevens@...ibm.com>
>
> Thanks!
> There's some whitespace damage, are you sending with your new
> sendmail setup? It seems to have worked for qemu patches ...
>
> > diff -ruNp net-next-p0/drivers/vhost/net.c
> > net-next-v3/drivers/vhost/net.c
> > --- net-next-p0/drivers/vhost/net.c 2010-03-22 12:04:38.000000000 -0700
> > +++ net-next-v3/drivers/vhost/net.c 2010-04-06 12:54:56.000000000 -0700
> > @@ -130,9 +130,8 @@ static void handle_tx(struct vhost_net *
> > hdr_size = vq->hdr_size;
> >
> > for (;;) {
> > - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > - ARRAY_SIZE(vq->iov),
> > - &out, &in,
> > + head = vhost_get_desc(&net->dev, vq, vq->iov,
> > + ARRAY_SIZE(vq->iov), &out, &in,
> > NULL, NULL);
> > /* Nothing new? Wait for eventfd to tell us they refilled. */
> > if (head == vq->num) {
> > @@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
> > /* TODO: Check specific error and bomb out unless ENOBUFS? */
> > err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > if (unlikely(err < 0)) {
> > - vhost_discard_vq_desc(vq);
> > - tx_poll_start(net, sock);
> > + if (err == -EAGAIN) {
> > + vhost_discard_desc(vq, 1);
> > + tx_poll_start(net, sock);
> > + } else {
> > + vq_err(vq, "sendmsg: errno %d\n", -err);
> > + /* drop packet; do not discard/resend */
> > + vhost_add_used_and_signal(&net->dev, vq, head,
> > + 0);
>
> vhost does not currently have a consistent error handling strategy:
> if we drop packets, need to think which other errors should cause
> packet drops. I prefer to just call vq_err for now,
> and have us look at handling segfaults etc in a consistent way
> separately.
>
> > + }
> > break;
> > }
> > if (err != len)
> > @@ -186,12 +192,25 @@ static void handle_tx(struct vhost_net *
> > unuse_mm(net->dev.mm);
> > }
> >
> > +static int vhost_head_len(struct sock *sk)
> > +{
> > + struct sk_buff *head;
> > + int len = 0;
> > +
> > + lock_sock(sk);
> > + head = skb_peek(&sk->sk_receive_queue);
> > + if (head)
> > + len = head->len;
> > + release_sock(sk);
> > + return len;
> > +}
> > +
>
> I wonder whether it makes sense to check
> skb_queue_empty(&sk->sk_receive_queue)
> outside the lock, to reduce the cost of this call
> on an empty queue (we know that it happens at least once
> each time we exit the loop on rx)?
>
> > /* Expects to be always run from workqueue - which acts as
> > * read-size critical section for our kind of RCU. */
> > static void handle_rx(struct vhost_net *net)
> > {
> > struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> > - unsigned head, out, in, log, s;
> > + unsigned in, log, s;
> > struct vhost_log *vq_log;
> > struct msghdr msg = {
> > .msg_name = NULL,
> > @@ -202,13 +221,14 @@ static void handle_rx(struct vhost_net *
> > .msg_flags = MSG_DONTWAIT,
> > };
> >
> > - struct virtio_net_hdr hdr = {
> > - .flags = 0,
> > - .gso_type = VIRTIO_NET_HDR_GSO_NONE
> > + struct virtio_net_hdr_mrg_rxbuf hdr = {
> > + .hdr.flags = 0,
> > + .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
> > };
> >
> > + int retries = 0;
> > size_t len, total_len = 0;
> > - int err;
> > + int err, headcount, datalen;
> > size_t hdr_size;
> > struct socket *sock = rcu_dereference(vq->private_data);
> > if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > @@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
> > vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > vq->log : NULL;
> >
> > - for (;;) {
> > - head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > - ARRAY_SIZE(vq->iov),
> > - &out, &in,
> > - vq_log, &log);
> > + while ((datalen = vhost_head_len(sock->sk))) {
> > + headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
> > + vq_log, &log);
>
> This looks like a bug, I think we need to pass
> datalen + header size to vhost_get_desc_n.
> Not sure how we know the header size that backend will use though.
> Maybe just look at our features.
>
> > /* OK, now we need to know about added descriptors. */
> > - if (head == vq->num) {
> > - if (unlikely(vhost_enable_notify(vq))) {
> > + if (!headcount) {
> > + if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
> > /* They have slipped one in as we were
> > * doing that: check again. */
> > vhost_disable_notify(vq);
> > + retries++;
> > continue;
> > }
>
> Hmm. The reason we have the code at all, as the comment says, is because
> guest could have added more buffers between the time we read last index
> and the time we enabled notification. So if we just break like this
> the race still exists. We could remember the
> last head value we observed, and have vhost_enable_notify check
> against this value?
>
> Need to think about it.
>
> Another concern here is that on retries vhost_get_desc_n
> is doing extra work, rescanning the same descriptor
> again and again. Not sure how common this is, might be
> worthwhile to add a TODO to consider this at least.
>
> > + retries = 0;
> > /* Nothing new? Wait for eventfd to tell us
> > * they refilled. */
> > break;
> > }
> > /* We don't need to be notified again. */
> > - if (out) {
> > - vq_err(vq, "Unexpected descriptor format for RX: "
> > - "out %d, int %d\n",
> > - out, in);
> > - break;
> > - }
> > - /* Skip header. TODO: support TSO/mergeable rx buffers. */
> > + /* Skip header. TODO: support TSO. */
> > s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
> > msg.msg_iovlen = in;
> > len = iov_length(vq->iov, in);
> > @@ -261,14 +275,33 @@ static void handle_rx(struct vhost_net *
> > len, MSG_DONTWAIT | MSG_TRUNC);
> > /* TODO: Check specific error and bomb out unless EAGAIN? */
> > if (err < 0) {
>
> I think we need to compare err and datalen and drop packet on mismatch as well.
> The check err > len won't be needed then.
>
> > - vhost_discard_vq_desc(vq);
> > + vhost_discard_desc(vq, headcount);
> > break;
> > }
> > /* TODO: Should check and handle checksum. */
> > + if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> > + struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > + (struct virtio_net_hdr_mrg_rxbuf *)
> > + vq->iov[0].iov_base;
> > + /* add num_buffers */
> > + if (vhost_has_feature(&net->dev,
> > + VHOST_NET_F_VIRTIO_NET_HDR))
> > + hdr.num_buffers = headcount;
>
> Why is the above necessary?
>
> > + else if (vq->iov[0].iov_len < sizeof(*vhdr)) {
>
> I think length check is already done when we copy the header. No?
>
> > + vq_err(vq, "tiny buffers < %d unsupported",
> > + vq->iov[0].iov_len);
> > + vhost_discard_desc(vq, headcount);
> > + break;
>
> Problem here is that recvmsg might modify iov.
> This is why I think we need to use vq->hdr here.
>
> > + } else if (put_user(headcount, &vhdr->num_buffers)) {
>
> The above put_user writes out a 32 bit value, right?
> This seems wrong.
Sorry, put_user looks at the pointer type, so that's ok.
I still suggest memcpy_toiovecend to remove layout assumptions.
> How about using
> memcpy_toiovecend(vq->hdr, &headcount,
> offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
> sizeof headcount);
>
> this way we also do not make any assumptions about layout.
>
> > + vq_err(vq, "Failed num_buffers write");
> > + vhost_discard_desc(vq, headcount);
> > + break;
> > + }
> > + }
> > if (err > len) {
> > pr_err("Discarded truncated rx packet: "
> > " len %d > %zd\n", err, len);
> > - vhost_discard_vq_desc(vq);
> > + vhost_discard_desc(vq, headcount);
> > continue;
> > }
> > len = err;
> > @@ -279,7 +312,7 @@ static void handle_rx(struct vhost_net *
> > break;
> > }
> > len += hdr_size;
> > - vhost_add_used_and_signal(&net->dev, vq, head, len);
> > + vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, headcount);
> > if (unlikely(vq_log))
> > vhost_log_write(vq, vq_log, log, len);
> > total_len += len;
> > @@ -560,9 +593,14 @@ done:
> >
> > static int vhost_net_set_features(struct vhost_net *n, u64 features)
> > {
> > - size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> > - sizeof(struct virtio_net_hdr) : 0;
> > + size_t hdr_size = 0;
> > int i;
> > +
> > + if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> > + hdr_size = sizeof(struct virtio_net_hdr);
> > + if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> > + hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>
> My personal style for this would be:
> if (!(features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)))
> 	hdr_size = 0;
> else if (!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
> 	hdr_size = sizeof(struct virtio_net_hdr);
> else
> 	hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>
> which results in more symmetry and less nesting.
>
> > mutex_lock(&n->dev.mutex);
> > if ((features & (1 << VHOST_F_LOG_ALL)) &&
> > !vhost_log_access_ok(&n->dev)) {
> > diff -ruNp net-next-p0/drivers/vhost/vhost.c
> > net-next-v3/drivers/vhost/vhost.c
> > --- net-next-p0/drivers/vhost/vhost.c 2010-03-22 12:04:38.000000000
> > -0700
> > +++ net-next-v3/drivers/vhost/vhost.c 2010-04-06 12:57:51.000000000
> > -0700
> > @@ -856,6 +856,47 @@ static unsigned get_indirect(struct vhos
> > return 0;
> > }
> >
> > +/* This is a multi-buffer version of vhost_get_vq_desc
> > + * @vq - the relevant virtqueue
> > + * datalen - data length we'll be reading
> > + * @iovcount - returned count of io vectors we fill
> > + * @log - vhost log
> > + * @log_num - log offset
> > + * returns number of buffer heads allocated, 0 on error
>
> This is unusual. Let's return a negative error code on error.
>
> > + */
> > +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem
> > *heads,
> > + int datalen, int *iovcount, struct vhost_log *log,
> > + unsigned int *log_num)
> > +{
> > + int out, in;
> > + int seg = 0; /* iov index */
> > + int hc = 0; /* head count */
> > +
> > + while (datalen > 0) {
> > + if (hc >= VHOST_NET_MAX_SG)
> > + goto err;
> > + heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
> > + ARRAY_SIZE(vq->iov)-seg, &out,
> > + &in, log, log_num);
> > + if (heads[hc].id == vq->num)
> > + goto err;
> > + if (out || in <= 0) {
> > + vq_err(vq, "unexpected descriptor format for RX: "
> > + "out %d, in %d\n", out, in);
> > + goto err;
> > + }
> > + heads[hc].len = iov_length(vq->iov+seg, in);
> > + datalen -= heads[hc].len;
>
> This signed/unsigned mix makes me nervous.
> Let's make datalen unsigned, add unsigned total_len, and
> while (total_len < datalen).
>
> > + hc++;
> > + seg += in;
> > + }
> > + *iovcount = seg;
> > + return hc;
> > +err:
> > + vhost_discard_desc(vq, hc);
> > + return 0;
> > +}
> > +
> > /* This looks in the virtqueue and for the first available buffer, and
> > converts
> > * it to an iovec for convenient access. Since descriptors consist of
> > some
> > * number of output then some number of input descriptors, it's
> > actually two
> > @@ -863,7 +904,7 @@ static unsigned get_indirect(struct vhos
> > *
> > * This function returns the descriptor number found, or vq->num (which
> > * is never a valid descriptor number) if none was found. */
> > -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct
> > vhost_virtqueue *vq,
> > +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue
> > *vq,
> > struct iovec iov[], unsigned int iov_size,
> > unsigned int *out_num, unsigned int *in_num,
> > struct vhost_log *log, unsigned int *log_num)
> > @@ -981,31 +1022,42 @@ unsigned vhost_get_vq_desc(struct vhost_
> > }
> >
> > /* Reverse the effect of vhost_get_vq_desc. Useful for error handling.
> > */
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> > +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
> > {
> > - vq->last_avail_idx--;
> > + vq->last_avail_idx -= n;
> > }
> >
> > /* After we've used one of their buffers, we tell them about it. We'll
> > then
> > * want to notify the guest, using eventfd. */
> > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int
> > len)
> > +int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem
> > *heads,
> > + int count)
>
> I think we are better off with vhost_add_used and vhost_add_used_n:
> the version with _n has a lot of extra complexity, and tx always
> adds them 1 by one.
>
> > {
> > struct vring_used_elem *used;
> > + int start, n;
> > +
> > + if (count <= 0)
> > + return -EINVAL;
> >
> > - /* The virtqueue contains a ring of used buffers. Get a pointer to
> > the
> > - * next entry in that used ring. */
> > - used = &vq->used->ring[vq->last_used_idx % vq->num];
> > - if (put_user(head, &used->id)) {
> > - vq_err(vq, "Failed to write used id");
> > + start = vq->last_used_idx % vq->num;
> > + if (vq->num - start < count)
> > + n = vq->num - start;
> > + else
> > + n = count;
>
> use min?
>
> > + used = vq->used->ring + start;
> > + if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
> > + vq_err(vq, "Failed to write used");
> > return -EFAULT;
> > }
> > - if (put_user(len, &used->len)) {
> > - vq_err(vq, "Failed to write used len");
> > - return -EFAULT;
> > + if (n < count) { /* wrapped the ring */
> > + used = vq->used->ring;
> > + if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
> > + vq_err(vq, "Failed to write used");
> > + return -EFAULT;
> > + }
> > }
> > /* Make sure buffer is written before we update index. */
> > smp_wmb();
> > - if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> > + if (put_user(vq->last_used_idx+count, &vq->used->idx)) {
>
> I am a bit confused ... will this write a 32 or 16 bit value?
> count is 32 bit ... Maybe we are better off with
> u16 idx = vq->last_used_idx + count
> put_user(idx, &vq->used->idx)
> vq->last_used_idx = idx
The above's not necessary, put_user gets type from the pointer.
> > vq_err(vq, "Failed to increment used idx");
> > return -EFAULT;
> > }
> > @@ -1023,7 +1075,7 @@ int vhost_add_used(struct vhost_virtqueu
> > if (vq->log_ctx)
> > eventfd_signal(vq->log_ctx, 1);
> > }
> > - vq->last_used_idx++;
> > + vq->last_used_idx += count;
> > return 0;
> > }
> >
> > @@ -1049,10 +1101,23 @@ void vhost_signal(struct vhost_dev *dev,
> >
> > /* And here's the combo meal deal. Supersize me! */
> > void vhost_add_used_and_signal(struct vhost_dev *dev,
> > - struct vhost_virtqueue *vq,
> > - unsigned int head, int len)
> > + struct vhost_virtqueue *vq, unsigned int id,
> > + int len)
> > +{
> > + struct vring_used_elem head;
> > +
> > + head.id = id;
> > + head.len = len;
> > + vhost_add_used(vq, &head, 1);
> > + vhost_signal(dev, vq);
> > +}
> > +
> > +/* multi-buffer version of vhost_add_used_and_signal */
> > +void vhost_add_used_and_signal_n(struct vhost_dev *dev,
> > + struct vhost_virtqueue *vq,
> > + struct vring_used_elem *heads, int count)
> > {
> > - vhost_add_used(vq, head, len);
> > + vhost_add_used(vq, heads, count);
> > vhost_signal(dev, vq);
> > }
> >
> > diff -ruNp net-next-p0/drivers/vhost/vhost.h
> > net-next-v3/drivers/vhost/vhost.h
> > --- net-next-p0/drivers/vhost/vhost.h 2010-03-22 12:04:38.000000000
> > -0700
> > +++ net-next-v3/drivers/vhost/vhost.h 2010-04-05 20:33:57.000000000
> > -0700
> > @@ -85,6 +85,7 @@ struct vhost_virtqueue {
> > struct iovec iov[VHOST_NET_MAX_SG];
> > struct iovec hdr[VHOST_NET_MAX_SG];
> > size_t hdr_size;
> > + struct vring_used_elem heads[VHOST_NET_MAX_SG];
> > /* We use a kind of RCU to access private pointer.
> > * All readers access it from workqueue, which makes it possible to
> > * flush the workqueue instead of synchronize_rcu. Therefore readers
> > do
> > @@ -120,16 +121,22 @@ long vhost_dev_ioctl(struct vhost_dev *,
> > int vhost_vq_access_ok(struct vhost_virtqueue *vq);
> > int vhost_log_access_ok(struct vhost_dev *);
> >
> > -unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue
> > *,
> > +int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem
> > *heads,
> > + int datalen, int *iovcount, struct vhost_log *log,
> > + unsigned int *log_num);
> > +unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
> > struct iovec iov[], unsigned int iov_count,
> > unsigned int *out_num, unsigned int *in_num,
> > struct vhost_log *log, unsigned int *log_num);
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *);
> > +void vhost_discard_desc(struct vhost_virtqueue *, int);
> >
> > -int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int
> > len);
> > -void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> > +int vhost_add_used(struct vhost_virtqueue *, struct vring_used_elem
> > *heads,
> > + int count);
> > void vhost_add_used_and_signal(struct vhost_dev *, struct
> > vhost_virtqueue *,
> > - unsigned int head, int len);
> > + unsigned int id, int len);
> > +void vhost_add_used_and_signal_n(struct vhost_dev *, struct
> > vhost_virtqueue *,
> > + struct vring_used_elem *heads, int count);
> > +void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> > void vhost_disable_notify(struct vhost_virtqueue *);
> > bool vhost_enable_notify(struct vhost_virtqueue *);
> >
> > @@ -149,7 +156,8 @@ enum {
> > VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) |
> > (1 << VIRTIO_RING_F_INDIRECT_DESC) |
> > (1 << VHOST_F_LOG_ALL) |
> > - (1 << VHOST_NET_F_VIRTIO_NET_HDR),
> > + (1 << VHOST_NET_F_VIRTIO_NET_HDR) |
> > + (1 << VIRTIO_NET_F_MRG_RXBUF),
> > };
> >
> > static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
> >
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists