[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CACGkMEv5M7ud359wvOTCFJdHw_zrOBnwf5i6GqDh7ZR3QE==Dw@mail.gmail.com>
Date: Mon, 22 Sep 2025 08:43:20 +0800
From: Jason Wang <jasowang@...hat.com>
To: "Michael S. Tsirkin" <mst@...hat.com>
Cc: xuanzhuo@...ux.alibaba.com, eperezma@...hat.com,
virtualization@...ts.linux.dev, linux-kernel@...r.kernel.org
Subject: Re: [PATCH V6 19/19] virtio_ring: add in order support
On Mon, Sep 22, 2025 at 1:40 AM Michael S. Tsirkin <mst@...hat.com> wrote:
>
> On Fri, Sep 19, 2025 at 03:31:54PM +0800, Jason Wang wrote:
> > This patch implements in order support for both split virtqueue and
> > packed virtqueue. Performance could be gained for the device where the
> > memory access could be expensive (e.g. vhost-net or a real PCI device):
> >
> > Benchmark with KVM guest:
> >
> > Vhost-net on the host: (pktgen + XDP_DROP):
> >
> > in_order=off | in_order=on | +%
> > TX: 5.20Mpps | 6.20Mpps | +19%
> > RX: 3.47Mpps | 3.61Mpps | + 4%
> >
> > Vhost-user(testpmd) on the host: (pktgen/XDP_DROP):
> >
> > For split virtqueue:
> >
> > in_order=off | in_order=on | +%
> > TX: 5.60Mpps | 5.60Mpps | +0.0%
> > RX: 9.16Mpps | 9.61Mpps | +4.9%
> >
> > For packed virtqueue:
> >
> > in_order=off | in_order=on | +%
> > TX: 5.60Mpps | 5.70Mpps | +1.7%
> > RX: 10.6Mpps | 10.8Mpps | +1.8%
> >
> > Benchmark also shows no performance impact for in_order=off for queue
> > size with 256 and 1024.
> >
> > Signed-off-by: Jason Wang <jasowang@...hat.com>
> > Signed-off-by: Michael S. Tsirkin <mst@...hat.com>
> > ---
> > drivers/virtio/virtio_ring.c | 421 +++++++++++++++++++++++++++++++++--
> > 1 file changed, 401 insertions(+), 20 deletions(-)
>
>
>
> Thanks for the patch! Yet something to improve:
>
>
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index b700aa3e56c3..c00b5e57f2fc 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -70,6 +70,8 @@
> > enum vq_layout {
> > SPLIT = 0,
> > PACKED,
> > + SPLIT_IN_ORDER,
> > + PACKED_IN_ORDER,
> > VQ_TYPE_MAX,
> > };
> >
> > @@ -80,6 +82,7 @@ struct vring_desc_state_split {
> > * allocated together. So we won't stress more to the memory allocator.
> > */
> > struct vring_desc *indir_desc;
> > + u32 total_len; /* Buffer Length */
> > };
> >
> > struct vring_desc_state_packed {
> > @@ -91,6 +94,7 @@ struct vring_desc_state_packed {
> > struct vring_packed_desc *indir_desc;
> > u16 num; /* Descriptor list length. */
> > u16 last; /* The last desc state in a list. */
> > + u32 total_len; /* Buffer Length */
> > };
> >
> > struct vring_desc_extra {
> > @@ -206,6 +210,17 @@ struct vring_virtqueue {
> >
> > /* Head of free buffer list. */
> > unsigned int free_head;
> > +
> > + /*
> > + * With IN_ORDER, devices write a single used ring entry with
> > + * the id corresponding to the head entry of the descriptor chain
> > + * describing the last buffer in the batch
> > + */
> > + struct used_entry {
> > + u32 id;
> > + u32 len;
> > + } batch_last;
> > +
> > /* Number we've added since last sync. */
> > unsigned int num_added;
> >
> > @@ -258,7 +273,12 @@ static void vring_free(struct virtqueue *_vq);
> >
> > static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq)
> > {
> > - return vq->layout == PACKED;
> > + return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER;
> > +}
> > +
> > +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq)
> > +{
> > + return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER;
> > }
> >
> > static bool virtqueue_use_indirect(const struct vring_virtqueue *vq,
> > @@ -575,6 +595,8 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> > struct scatterlist *sg;
> > struct vring_desc *desc;
> > unsigned int i, n, avail, descs_used, err_idx, c = 0;
> > + /* Total length for in-order */
> > + unsigned int total_len = 0;
> > int head;
> > bool indirect;
> >
> > @@ -646,6 +668,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> > ++c == total_sg ?
> > 0 : VRING_DESC_F_NEXT,
> > premapped);
> > + total_len += len;
> > }
> > }
> > for (; n < (out_sgs + in_sgs); n++) {
> > @@ -663,6 +686,7 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> > i, addr, len,
> > (++c == total_sg ? 0 : VRING_DESC_F_NEXT) |
> > VRING_DESC_F_WRITE, premapped);
> > + total_len += len;
> > }
> > }
> >
> > @@ -685,7 +709,12 @@ static inline int virtqueue_add_split(struct vring_virtqueue *vq,
> > vq->vq.num_free -= descs_used;
> >
> > /* Update free pointer */
> > - if (indirect)
> > + if (virtqueue_is_in_order(vq)) {
> > + vq->free_head += descs_used;
> > + if (vq->free_head >= vq->split.vring.num)
> > + vq->free_head -= vq->split.vring.num;
> > + vq->split.desc_state[head].total_len = total_len;;
>
> what is going on here?
I'm not sure I get the question. But total_len is needed for the
driver to get the used buffer length when the device is doing batched
used idx updates.
Or you may wonder why we have a check for in_order? It is because the
function is used for both in_order and !in_order, since the difference
is minimal. When we come up with more optimizations for in_order in the
future, we can consider using a separate function for in_order add.
>
> > + } else if (indirect)
> > vq->free_head = vq->split.desc_extra[head].next;
> > else
> > vq->free_head = i;
> > @@ -858,6 +887,14 @@ static bool more_used_split(const struct vring_virtqueue *vq)
> > return virtqueue_poll_split(vq, vq->last_used_idx);
> > }
> >
> > +static bool more_used_split_in_order(const struct vring_virtqueue *vq)
> > +{
> > + if (vq->batch_last.id != vq->packed.vring.num)
> > + return true;
> > +
> > + return virtqueue_poll_split(vq, vq->last_used_idx);
> > +}
> > +
> > static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq,
> > unsigned int *len,
> > void **ctx)
> > @@ -915,6 +952,73 @@ static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq,
> > return ret;
> > }
> >
> > +static void *virtqueue_get_buf_ctx_split_in_order(struct vring_virtqueue *vq,
> > + unsigned int *len,
> > + void **ctx)
> > +{
> > + void *ret;
> > + unsigned int num = vq->split.vring.num;
> > + u16 last_used;
> > +
> > + START_USE(vq);
> > +
> > + if (unlikely(vq->broken)) {
> > + END_USE(vq);
> > + return NULL;
> > + }
> > +
> > + last_used = (vq->last_used_idx & (vq->split.vring.num - 1));
> > +
> > + if (vq->batch_last.id == num) {
> > + if (!more_used_split(vq)) {
> > + pr_debug("No more buffers in queue\n");
> > + END_USE(vq);
> > + return NULL;
> > + }
> > +
> > + /* Only get used array entries after they have been
> > + * exposed by host. */
> > + virtio_rmb(vq->weak_barriers);
> > + vq->batch_last.id = virtio32_to_cpu(vq->vq.vdev,
> > + vq->split.vring.used->ring[last_used].id);
> > + vq->batch_last.len = virtio32_to_cpu(vq->vq.vdev,
> > + vq->split.vring.used->ring[last_used].len);
> > + }
> > +
> > + if (vq->batch_last.id == last_used) {
> > + vq->batch_last.id = num;
> > + *len = vq->batch_last.len;
> > + } else
> > + *len = vq->split.desc_state[last_used].total_len;
> > +
> > + if (unlikely(last_used >= num)) {
> > + BAD_RING(vq, "id %u out of range\n", last_used);
> > + return NULL;
> > + }
> > + if (unlikely(!vq->split.desc_state[last_used].data)) {
> > + BAD_RING(vq, "id %u is not a head!\n", last_used);
> > + return NULL;
> > + }
> > +
> > + /* detach_buf_split clears data, so grab it now. */
> > + ret = vq->split.desc_state[last_used].data;
> > + detach_buf_split_in_order(vq, last_used, ctx);
> > +
> > + vq->last_used_idx++;
> > + /* If we expect an interrupt for the next entry, tell host
> > + * by writing event index and flush out the write before
> > + * the read in the next get_buf call. */
> > + if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
> > + virtio_store_mb(vq->weak_barriers,
> > + &vring_used_event(&vq->split.vring),
> > + cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx));
> > +
> > + LAST_ADD_TIME_INVALID(vq);
> > +
> > + END_USE(vq);
> > + return ret;
> > +}
> > +
> > static void virtqueue_disable_cb_split(struct vring_virtqueue *vq)
> > {
> > if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> > @@ -1008,7 +1112,10 @@ static void *virtqueue_detach_unused_buf_split(struct vring_virtqueue *vq)
> > continue;
> > /* detach_buf_split clears data, so grab it now. */
> > buf = vq->split.desc_state[i].data;
> > - detach_buf_split(vq, i, NULL);
> > + if (virtqueue_is_in_order(vq))
> > + detach_buf_split_in_order(vq, i, NULL);
> > + else
> > + detach_buf_split(vq, i, NULL);
> > vq->split.avail_idx_shadow--;
> > vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev,
> > vq->split.avail_idx_shadow);
> > @@ -1071,6 +1178,7 @@ static void virtqueue_vring_attach_split(struct vring_virtqueue *vq,
> >
> > /* Put everything in free lists. */
> > vq->free_head = 0;
> > + vq->batch_last.id = vq->split.vring.num;
> > }
> >
> > static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split)
> > @@ -1182,7 +1290,6 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index,
> > if (!vq)
> > return NULL;
> >
> > - vq->layout = SPLIT;
> > vq->vq.callback = callback;
> > vq->vq.vdev = vdev;
> > vq->vq.name = name;
> > @@ -1202,6 +1309,8 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index,
> > vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> > !context;
> > vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> > + vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ?
> > + SPLIT_IN_ORDER : SPLIT;
> >
> > if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
> > vq->weak_barriers = false;
> > @@ -1359,13 +1468,14 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > unsigned int in_sgs,
> > void *data,
> > bool premapped,
> > - gfp_t gfp)
> > + gfp_t gfp,
> > + u16 id)
> > {
> > struct vring_desc_extra *extra;
> > struct vring_packed_desc *desc;
> > struct scatterlist *sg;
> > - unsigned int i, n, err_idx, len;
> > - u16 head, id;
> > + unsigned int i, n, err_idx, len, total_len = 0;
> > + u16 head;
> > dma_addr_t addr;
> >
> > head = vq->packed.next_avail_idx;
> > @@ -1383,8 +1493,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > }
> >
> > i = 0;
> > - id = vq->free_head;
> > - BUG_ON(id == vq->packed.vring.num);
> >
> > for (n = 0; n < out_sgs + in_sgs; n++) {
> > for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > @@ -1404,6 +1512,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE;
> > }
> >
> > + total_len += len;
> > i++;
> > }
> > }
> > @@ -1457,6 +1566,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
> > vq->packed.desc_state[id].data = data;
> > vq->packed.desc_state[id].indir_desc = desc;
> > vq->packed.desc_state[id].last = id;
> > + vq->packed.desc_state[id].total_len = total_len;
> >
> > vq->num_added += 1;
> >
> > @@ -1509,8 +1619,11 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq,
> > BUG_ON(total_sg == 0);
> >
> > if (virtqueue_use_indirect(vq, total_sg)) {
> > + id = vq->free_head;
> > + BUG_ON(id == vq->packed.vring.num);
> > err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
> > - in_sgs, data, premapped, gfp);
> > + in_sgs, data, premapped,
> > + gfp, id);
> > if (err != -ENOMEM) {
> > END_USE(vq);
> > return err;
> > @@ -1631,6 +1744,152 @@ static inline int virtqueue_add_packed(struct vring_virtqueue *vq,
> > return -EIO;
> > }
> >
> > +static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq,
> > + struct scatterlist *sgs[],
> > + unsigned int total_sg,
> > + unsigned int out_sgs,
> > + unsigned int in_sgs,
> > + void *data,
> > + void *ctx,
> > + bool premapped,
> > + gfp_t gfp)
> > +{
> > + struct vring_packed_desc *desc;
> > + struct scatterlist *sg;
> > + unsigned int i, n, c, err_idx, total_len = 0;
> > + __le16 head_flags, flags;
> > + u16 head, avail_used_flags;
> > + int err;
> > +
> > + START_USE(vq);
> > +
> > + BUG_ON(data == NULL);
> > + BUG_ON(ctx && vq->indirect);
> > +
> > + if (unlikely(vq->broken)) {
> > + END_USE(vq);
> > + return -EIO;
> > + }
> > +
> > + LAST_ADD_TIME_UPDATE(vq);
> > +
> > + BUG_ON(total_sg == 0);
> > +
> > + if (virtqueue_use_indirect(vq, total_sg)) {
> > + err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
> > + in_sgs, data, premapped, gfp,
> > + vq->packed.next_avail_idx);
> > + if (err != -ENOMEM) {
> > + END_USE(vq);
> > + return err;
> > + }
> > +
> > + /* fall back on direct */
> > + }
> > +
> > + head = vq->packed.next_avail_idx;
> > + avail_used_flags = vq->packed.avail_used_flags;
> > +
> > + WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect);
> > +
> > + desc = vq->packed.vring.desc;
> > + i = head;
> > +
> > + if (unlikely(vq->vq.num_free < total_sg)) {
> > + pr_debug("Can't add buf len %i - avail = %i\n",
> > + total_sg, vq->vq.num_free);
> > + END_USE(vq);
> > + return -ENOSPC;
> > + }
> > +
> > + c = 0;
> > + for (n = 0; n < out_sgs + in_sgs; n++) {
> > + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > + dma_addr_t addr;
> > + u32 len;
> > +
> > + if (vring_map_one_sg(vq, sg, n < out_sgs ?
> > + DMA_TO_DEVICE : DMA_FROM_DEVICE,
> > + &addr, &len, premapped))
> > + goto unmap_release;
> > +
> > + flags = cpu_to_le16(vq->packed.avail_used_flags |
> > + (++c == total_sg ? 0 : VRING_DESC_F_NEXT) |
> > + (n < out_sgs ? 0 : VRING_DESC_F_WRITE));
> > + if (i == head)
> > + head_flags = flags;
> > + else
> > + desc[i].flags = flags;
> > +
> > +
> > + desc[i].addr = cpu_to_le64(addr);
> > + desc[i].len = cpu_to_le32(len);
> > + desc[i].id = cpu_to_le16(head);
> > +
> > + if (unlikely(vq->use_map_api)) {
> > + vq->packed.desc_extra[i].addr = premapped ?
> > + DMA_MAPPING_ERROR: addr;
> > + vq->packed.desc_extra[i].len = len;
> > + vq->packed.desc_extra[i].flags =
> > + le16_to_cpu(flags);
> > + }
> > +
> > + if ((unlikely(++i >= vq->packed.vring.num))) {
> > + i = 0;
> > + vq->packed.avail_used_flags ^=
> > + 1 << VRING_PACKED_DESC_F_AVAIL |
> > + 1 << VRING_PACKED_DESC_F_USED;
> > + vq->packed.avail_wrap_counter ^= 1;
> > + }
> > +
> > + total_len += len;
> > + }
> > + }
> > +
> > + /* We're using some buffers from the free list. */
> > + vq->vq.num_free -= total_sg;
> > +
> > + /* Update free pointer */
> > + vq->packed.next_avail_idx = i;
> > +
> > + /* Store token. */
> > + vq->packed.desc_state[head].num = total_sg;
> > + vq->packed.desc_state[head].data = data;
> > + vq->packed.desc_state[head].indir_desc = ctx;
> > + vq->packed.desc_state[head].total_len = total_len;
> > +
> > + /*
> > + * A driver MUST NOT make the first descriptor in the list
> > + * available before all subsequent descriptors comprising
> > + * the list are made available.
> > + */
> > + virtio_wmb(vq->weak_barriers);
> > + vq->packed.vring.desc[head].flags = head_flags;
> > + vq->num_added += total_sg;
> > +
> > + pr_debug("Added buffer head %i to %p\n", head, vq);
> > + END_USE(vq);
> > +
> > + return 0;
> > +
> > +unmap_release:
> > + err_idx = i;
> > + i = head;
> > + vq->packed.avail_used_flags = avail_used_flags;
> > +
> > + for (n = 0; n < total_sg; n++) {
> > + if (i == err_idx)
> > + break;
> > + vring_unmap_extra_packed(vq, &vq->packed.desc_extra[i]);
> > + i++;
> > + if (i >= vq->packed.vring.num)
> > + i = 0;
> > + }
> > +
> > + END_USE(vq);
> > + return -EIO;
> > +}
> > +
> > static bool virtqueue_kick_prepare_packed(struct vring_virtqueue *vq)
> > {
> > u16 new, old, off_wrap, flags, wrap_counter, event_idx;
> > @@ -1791,10 +2050,81 @@ static void update_last_used_idx_packed(struct vring_virtqueue *vq,
> > cpu_to_le16(vq->last_used_idx));
> > }
> >
> > +static bool more_used_packed_in_order(const struct vring_virtqueue *vq)
> > +{
> > + if (vq->batch_last.id != vq->packed.vring.num)
> > + return true;
> > +
> > + return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx));
> > +}
> > +
> > +static void *virtqueue_get_buf_ctx_packed_in_order(struct vring_virtqueue *vq,
> > + unsigned int *len,
> > + void **ctx)
> > +{
> > + unsigned int num = vq->packed.vring.num;
> > + u16 last_used, last_used_idx;
> > + bool used_wrap_counter;
> > + void *ret;
> > +
> > + START_USE(vq);
> > +
> > + if (unlikely(vq->broken)) {
> > + END_USE(vq);
> > + return NULL;
> > + }
> > +
> > + last_used_idx = vq->last_used_idx;
> > + used_wrap_counter = packed_used_wrap_counter(last_used_idx);
> > + last_used = packed_last_used(last_used_idx);
> > +
> > + if (vq->batch_last.id == num) {
> > + if (!more_used_packed(vq)) {
> > + pr_debug("No more buffers in queue\n");
> > + END_USE(vq);
> > + return NULL;
> > + }
> > + /* Only get used elements after they have been exposed by host. */
> > + virtio_rmb(vq->weak_barriers);
> > + vq->batch_last.id =
> > + le16_to_cpu(vq->packed.vring.desc[last_used].id);
> > + vq->batch_last.len =
> > + le32_to_cpu(vq->packed.vring.desc[last_used].len);
> > + }
> > +
> > + if (vq->batch_last.id == last_used) {
> > + vq->batch_last.id = num;
> > + *len = vq->batch_last.len;
> > + } else
> > + *len = vq->packed.desc_state[last_used].total_len;
> > +
> > + if (unlikely(last_used >= num)) {
> > + BAD_RING(vq, "id %u out of range\n", last_used);
> > + return NULL;
> > + }
> > + if (unlikely(!vq->packed.desc_state[last_used].data)) {
> > + BAD_RING(vq, "id %u is not a head!\n", last_used);
> > + return NULL;
> > + }
> > +
> > + /* detach_buf_packed clears data, so grab it now. */
> > + ret = vq->packed.desc_state[last_used].data;
> > + detach_buf_packed_in_order(vq, last_used, ctx);
> > +
> > + update_last_used_idx_packed(vq, last_used, last_used,
> > + used_wrap_counter);
> > +
> > + LAST_ADD_TIME_INVALID(vq);
> > +
> > + END_USE(vq);
> > + return ret;
> > +}
> > +
> > static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
> > unsigned int *len,
> > void **ctx)
> > {
> > + unsigned int num = vq->packed.vring.num;
> > u16 last_used, id, last_used_idx;
> > bool used_wrap_counter;
> > void *ret;
> > @@ -1821,7 +2151,7 @@ static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq,
> > id = le16_to_cpu(vq->packed.vring.desc[last_used].id);
> > *len = le32_to_cpu(vq->packed.vring.desc[last_used].len);
> >
> > - if (unlikely(id >= vq->packed.vring.num)) {
> > + if (unlikely(id >= num)) {
> > BAD_RING(vq, "id %u out of range\n", id);
> > return NULL;
> > }
> > @@ -1962,7 +2292,7 @@ static void *virtqueue_detach_unused_buf_packed(struct vring_virtqueue *vq)
> > continue;
> > /* detach_buf clears data, so grab it now. */
> > buf = vq->packed.desc_state[i].data;
> > - detach_buf_packed(vq, i, NULL);
> > + detach_buf_packed_in_order(vq, i, NULL);
>
> Wait why is this in order unconditionally?
This is a bug, let me fix it.
Thanks
Powered by blists - more mailing lists