Message-ID: <1310431067.28930.7.camel@localhost.localdomain>
Date:	Mon, 11 Jul 2011 17:37:47 -0700
From:	Shirley Ma <mashirle@...ibm.com>
To:	"Michael S. Tsirkin" <mst@...hat.com>
Cc:	David Miller <davem@...emloft.net>, netdev@...r.kernel.org,
	kvm@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH RFC] vhost: address fixme in vhost TX zero-copy support

On Tue, 2011-07-12 at 01:04 +0300, Michael S. Tsirkin wrote:
> So the following should do it, on top of Shirley's patch, I think.
> I'm not quite sure about using vq->upend_idx - vq->done_idx to check
> the number of outstanding DMAs, Shirley, what do you think?

Yes, you can use vq->upend_idx - vq->done_idx to track the number of
outstanding DMAs.
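
For reference, the check in the quoted patch reads the gap between the two
ring indices directly. A minimal sketch of that count as a helper, assuming
both indices advance modulo UIO_MAXIOV as in the quoted code (the helper
name is illustrative only, not part of the patch):

	/* Sketch: number of zero-copy DMAs submitted but not yet signalled
	 * to the guest.  Taking the difference modulo UIO_MAXIOV keeps the
	 * count correct after upend_idx wraps around past done_idx.
	 */
	static inline int vhost_zerocopy_outstanding(struct vhost_virtqueue *vq)
	{
		return (vq->upend_idx + UIO_MAXIOV - vq->done_idx) % UIO_MAXIOV;
	}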

> Untested.
> 
> I'm also thinking about making the use of this conditional
> on a module parameter, off by default, to reduce
> stability risk while still enabling more people to
> test the feature.
> Thoughts?

Agreed.
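
For what it's worth, a minimal sketch of such an off-by-default switch in
drivers/vhost/net.c could be an ordinary module parameter; the name
"experimental_zcopytx" below is only illustrative, not part of the patch:

	/* Sketch: zero-copy TX stays disabled unless the module is loaded
	 * with the parameter set, e.g. experimental_zcopytx=1.
	 */
	static int experimental_zcopytx;
	module_param(experimental_zcopytx, int, 0444);
	MODULE_PARM_DESC(experimental_zcopytx,
			 "Enable experimental zero-copy TX (default: off)");

The zero-copy setup in handle_tx() would then only be taken when the
parameter is set, leaving the existing copy path as the default.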

> Signed-off-by: Michael S. Tsirkin <mst@...hat.com>
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 7de0c6e..cf8deb3 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -156,8 +156,7 @@ static void handle_tx(struct vhost_net *net)
> 
>         for (;;) {
>                 /* Release DMAs done buffers first */
> -               if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND)
> -                       vhost_zerocopy_signal_used(vq);
> +               vhost_zerocopy_signal_used(vq);
> 
>                 head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>                                          ARRAY_SIZE(vq->iov),
> @@ -175,7 +174,7 @@ static void handle_tx(struct vhost_net *net)
>                                 break;
>                         }
>                         /* If more outstanding DMAs, queue the work */
> -                       if (atomic_read(&vq->refcnt) > VHOST_MAX_PEND) {
> +                       if (vq->upend_idx - vq->done_idx > VHOST_MAX_PEND) {
>                                 tx_poll_start(net, sock);
>                                 set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>                                 break;
> @@ -214,12 +213,12 @@ static void handle_tx(struct vhost_net *net)
> 
>                                 vq->heads[vq->upend_idx].len = len;
>                                 ubuf->callback = vhost_zerocopy_callback;
> -                               ubuf->arg = vq;
> +                               ubuf->arg = vq->ubufs;
>                                 ubuf->desc = vq->upend_idx;
>                                 msg.msg_control = ubuf;
>                                 msg.msg_controllen = sizeof(ubuf);
> +                               kref_get(&vq->ubufs->kref);
>                         }
> -                       atomic_inc(&vq->refcnt);
>                         vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
>                 }
>                 /* TODO: Check specific error and bomb out unless ENOBUFS? */
> @@ -646,6 +645,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>  {
>         struct socket *sock, *oldsock;
>         struct vhost_virtqueue *vq;
> +       struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
>         int r;
> 
>         mutex_lock(&n->dev.mutex);
> @@ -675,6 +675,13 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>         oldsock = rcu_dereference_protected(vq->private_data,
>                                             lockdep_is_held(&vq->mutex));
>         if (sock != oldsock) {
> +               ubufs = vhost_ubuf_alloc(vq, sock);
> +               if (IS_ERR(ubufs)) {
> +                       r = PTR_ERR(ubufs);
> +                       goto err_ubufs;
> +               }
> +               oldubufs = vq->ubufs;
> +               vq->ubufs = ubufs;
>                 vhost_net_disable_vq(n, vq);
>                 rcu_assign_pointer(vq->private_data, sock);
>                 vhost_net_enable_vq(n, vq);
> @@ -682,6 +689,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> 
>         mutex_unlock(&vq->mutex);
> 
> +       if (oldubufs)
> +               vhost_ubuf_put_and_wait(oldubufs);
> +
>         if (oldsock) {
>                 vhost_net_flush_vq(n, index);
>                 fput(oldsock->file);
> @@ -690,6 +700,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
>         mutex_unlock(&n->dev.mutex);
>         return 0;
> 
> +err_ubufs:
> +       fput(sock);
>  err_vq:
>         mutex_unlock(&vq->mutex);
>  err:
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index db242b1..81b1dd7 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -181,7 +181,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>         vq->log_ctx = NULL;
>         vq->upend_idx = 0;
>         vq->done_idx = 0;
> -       atomic_set(&vq->refcnt, 0);
> +       vq->ubufs = NULL;
>  }
> 
>  static int vhost_worker(void *data)
> @@ -401,7 +401,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
>   * of used idx. Once lower device DMA done contiguously, we will signal KVM
>   * guest used idx.
>   */
> -void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
> +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
>  {
>         int i, j = 0;
> 
> @@ -414,10 +414,9 @@ void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq)
>                 } else
>                         break;
>         }
> -       if (j) {
> +       if (j)
>                 vq->done_idx = i;
> -               atomic_sub(j, &vq->refcnt);
> -       }
> +       return j;
>  }
> 
>  /* Caller should have device mutex */
> @@ -430,9 +429,13 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>                         vhost_poll_stop(&dev->vqs[i].poll);
>                         vhost_poll_flush(&dev->vqs[i].poll);
>                 }
> -               /* Wait for all lower device DMAs done (busywait FIXME) */
> -               while (atomic_read(&dev->vqs[i].refcnt))
> -                       vhost_zerocopy_signal_used(&dev->vqs[i]);
> +               /* Wait for all lower device DMAs done. */
> +               if (dev->vqs[i].ubufs)
> +                       vhost_ubuf_put_and_wait(dev->vqs[i].ubufs);
> +
> +               /* Signal guest as appropriate. */
> +               vhost_zerocopy_signal_used(&dev->vqs[i]);
> +
>                 if (dev->vqs[i].error_ctx)
>                         eventfd_ctx_put(dev->vqs[i].error_ctx);
>                 if (dev->vqs[i].error)
> @@ -645,11 +648,6 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
> 
>         mutex_lock(&vq->mutex);
> 
> -       /* clean up lower device outstanding DMAs, before setting ring
> -          busywait FIXME */
> -       while (atomic_read(&vq->refcnt))
> -               vhost_zerocopy_signal_used(vq);
> -

We still need to clear up outstanding DMAs here when setting the vring.
Otherwise, if the KVM guest removes and reloads the virtio_net module, the
vring would get out of sync with vhost.
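
As a rough sketch, the drain could be kept at vring setup using the new
index-based count instead of the removed refcnt; it is still a busy-wait,
which is exactly the trade-off in question:

	/* Sketch: before touching the ring, keep signalling completed
	 * zero-copy buffers until done_idx catches up with upend_idx,
	 * i.e. until no DMAs remain outstanding.
	 */
	while (vq->upend_idx != vq->done_idx)
		vhost_zerocopy_signal_used(vq);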

>         switch (ioctl) {
>         case VHOST_SET_VRING_NUM:
>                 /* Resizing ring with an active backend?
> @@ -1525,12 +1523,46 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
>         }
>  }
> 
> +static void vhost_zerocopy_done_signal(struct kref *kref)
> +{
> +       struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
> +                                                   kref);
> +       wake_up(&ubufs->wait);
> +}
> +
> +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
> +                                       void * private_data)
> +{
> +       struct vhost_ubuf_ref *ubufs;
> +       /* No backend? Nothing to count. */
> +       if (!private_data)
> +               return NULL;
> +       ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
> +       if (!ubufs)
> +               return ERR_PTR(-ENOMEM);
> +       kref_init(&ubufs->kref);
> +       kref_get(&ubufs->kref);
> +       init_waitqueue_head(&ubufs->wait);
> +       ubufs->vq = vq;
> +       return ubufs;
> +}
> +
> +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
> +{
> +       kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
> +       wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
> +       kfree(ubufs);
> +}
> +
>  void vhost_zerocopy_callback(void *arg)
>  {
>         struct ubuf_info *ubuf = (struct ubuf_info *)arg;
> +       struct vhost_ubuf_ref *ubufs;
>         struct vhost_virtqueue *vq;
> 
> -       vq = (struct vhost_virtqueue *)ubuf->arg;
> +       ubufs = ubuf->arg;
> +       vq = ubufs->vq;
>         /* set len = 1 to mark this desc buffers done DMA */
>         vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
> +       kref_put(&ubufs->kref, vhost_zerocopy_done_signal); 
>  }
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 883688c..b42b126 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -55,6 +55,17 @@ struct vhost_log {
>         u64 len;
>  };
> 
> +struct vhost_virtqueue;
> +
> +struct vhost_ubuf_ref {
> +       struct kref kref;
> +       wait_queue_t wait;
> +       struct vhost_virtqueue *vq;
> +};
> +
> +struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, void *);
> +void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
> +
>  /* The virtqueue structure describes a queue attached to a device. */
>  struct vhost_virtqueue {
>         struct vhost_dev *dev;
> @@ -127,6 +138,9 @@ struct vhost_virtqueue {
>         int done_idx;
>         /* an array of userspace buffers info */
>         struct ubuf_info *ubuf_info;
> +       /* Reference counting for outstanding ubufs.
> +        * Protected by vq mutex. Writers must also take device mutex. */
> +       struct vhost_ubuf_ref *ubufs;
>  };
> 
>  struct vhost_dev {
>  bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *);
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>                     unsigned int log_num, u64 len);
>  void vhost_zerocopy_callback(void *arg);
> -void vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
> +int vhost_zerocopy_signal_used(struct vhost_virtqueue *vq);
> 
>  #define vq_err(vq, fmt, ...) do {                                  \
>                 pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
> -- 

