[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CACGkMEvf+bSjX47s6ENC4RkNq5=RNzcMCgftt5D4P=kDi+9JwA@mail.gmail.com>
Date: Mon, 19 May 2025 15:33:42 +0800
From: Jason Wang <jasowang@...hat.com>
To: "Michael S. Tsirkin" <mst@...hat.com>
Cc: xuanzhuo@...ux.alibaba.com, eperezma@...hat.com,
virtualization@...ts.linux.dev, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 13/19] virtio_ring: introduce virtqueue ops
On Fri, May 16, 2025 at 6:35 PM Michael S. Tsirkin <mst@...hat.com> wrote:
>
> On Fri, May 16, 2025 at 09:30:01AM +0800, Jason Wang wrote:
> > On Wed, May 14, 2025 at 10:24 PM Michael S. Tsirkin <mst@...hat.com> wrote:
> > >
> > > On Wed, May 14, 2025 at 10:19:05AM -0400, Michael S. Tsirkin wrote:
> > > > On Wed, Apr 09, 2025 at 12:06:03PM +0800, Jason Wang wrote:
> > > > > On Tue, Apr 8, 2025 at 7:37 PM Michael S. Tsirkin <mst@...hat.com> wrote:
> > > > > >
> > > > > > On Tue, Apr 08, 2025 at 03:02:35PM +0800, Jason Wang wrote:
> > > > > > > On Mon, Apr 7, 2025 at 4:20 PM Michael S. Tsirkin <mst@...hat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Mar 24, 2025 at 02:01:21PM +0800, Jason Wang wrote:
> > > > > > > > > This patch introduces virtqueue ops which is a set of the callbacks
> > > > > > > > > that will be called for different queue layout or features. This would
> > > > > > > > > help to avoid branches for split/packed and will ease the future
> > > > > > > > > implementation like in order.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Jason Wang <jasowang@...hat.com>
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > ---
> > > > > > > > > drivers/virtio/virtio_ring.c | 96 +++++++++++++++++++++++++-----------
> > > > > > > > > 1 file changed, 67 insertions(+), 29 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > index a2884eae14d9..ce1dc90ee89d 100644
> > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > @@ -159,9 +159,30 @@ struct vring_virtqueue_packed {
> > > > > > > > > size_t event_size_in_bytes;
> > > > > > > > > };
> > > > > > > > >
> > > > > > > > > +struct vring_virtqueue;
> > > > > > > > > +
> > > > > > > > > +struct virtqueue_ops {
> > > > > > > > > + int (*add)(struct vring_virtqueue *_vq, struct scatterlist *sgs[],
> > > > > > > > > + unsigned int total_sg, unsigned int out_sgs,
> > > > > > > > > + unsigned int in_sgs, void *data,
> > > > > > > > > + void *ctx, bool premapped, gfp_t gfp);
> > > > > > > > > + void *(*get)(struct vring_virtqueue *vq, unsigned int *len, void **ctx);
> > > > > > > > > + bool (*kick_prepare)(struct vring_virtqueue *vq);
> > > > > > > > > + void (*disable_cb)(struct vring_virtqueue *vq);
> > > > > > > > > + bool (*enable_cb_delayed)(struct vring_virtqueue *vq);
> > > > > > > > > + unsigned int (*enable_cb_prepare)(struct vring_virtqueue *vq);
> > > > > > > > > + bool (*poll)(const struct vring_virtqueue *vq, u16 last_used_idx);
> > > > > > > > > + void *(*detach_unused_buf)(struct vring_virtqueue *vq);
> > > > > > > > > + bool (*more_used)(const struct vring_virtqueue *vq);
> > > > > > > > > + int (*resize)(struct vring_virtqueue *vq, u32 num);
> > > > > > > > > + void (*reset)(struct vring_virtqueue *vq);
> > > > > > > > > +};
> > > > > > > >
> > > > > > > > I like it that it's organized but
> > > > > > > > I worry about the overhead of indirect calls here.
> > > > > > >
> > > > > > > We can switch to use INDIRECT_CALL_X() here
> > > > > >
> > > > > > If you think it's cleaner.. but INDIRECT_CALL is all chained
> > > > >
> > > > > Yes, and it would be problematic as the number of ops increased.
> > > > >
> > > > > > while a switch can do a binary search.
> > > > > >
> > > > >
> > > > > Do you mean a nested switch?
> > > >
> > > > Not sure what is nested. gcc does a decent job of optimizing
> > > > switches. You have 4 types of ops:
> > > > packed/packed in order/split/split in order
> > > >
> > > > So:
> > > >
> > > > enum {
> > > > VQ_SPLIT,
> > > > VQ_SPLIT_IN_ORDER,
> > > > VQ_PACKED,
> > > > VQ_PACKED_IN_ORDER,
> > > > }
> > > >
> > > >
> > > > I do not see how it is worse?
> > > >
> > > >
> > >
> > >
> > >
> > > Actually, here is an idea - create an array of ops:
> > >
> > >
> > >
> > > enum vqtype {
> > > SPLIT,
> > > SPLIT_IN_ORDER,
> > > PACKED,
> > > PACKED_IN_ORDER,
> > > MAX
> > > };
> > >
> > >
> > > struct ops {
> > > int (*add)(int bar);
> > > };
> > >
> > > extern int packed(int);
> > > extern int packedinorder(int);
> > > extern int split(int);
> > > extern int splitinorder(int);
> > >
> > > const struct ops allops[MAX] = { [SPLIT] = {split}, [SPLIT_IN_ORDER] = { splitinorder}, [PACKED] = { packed }, [PACKED_IN_ORDER] = {packedinorder}};
> > >
> > > int main(int argc, char **argv)
> > > {
> > > switch (argc) {
> > > case 0:
> > > return allops[PACKED].foo(argc);
> > > case 1:
> > > return allops[SPLIT].foo(argc);
> > > default:
> > > return allops[PACKED_IN_ORDER].foo(argc);
> >
> > This still looks like an indirection call as we don't call the symbol
> > directly but need to load the function address into a register.
>
> See below.
>
>
>
> > > }
> > > }
> > >
> > >
> > > I tested this and compiler is able to elide the indirect calls.
> >
> > I've tried the following:
> >
> > struct virtqueue_ops split_ops = {
> > .add = virtqueue_add_split,
> > .get = virtqueue_get_buf_ctx_split,
> > .kick_prepare = virtqueue_kick_prepare_split,
> > .disable_cb = virtqueue_disable_cb_split,
> > .enable_cb_delayed = virtqueue_enable_cb_delayed_split,
> > .enable_cb_prepare = virtqueue_enable_cb_prepare_split,
> > .poll = virtqueue_poll_split,
> > .detach_unused_buf = virtqueue_detach_unused_buf_split,
> > .more_used = more_used_split,
> > .resize = virtqueue_resize_split,
> > .reset = virtqueue_reset_split,
> > };
> >
> > struct virtqueue_ops packed_ops = {
> > .add = virtqueue_add_packed,
> > .get = virtqueue_get_buf_ctx_packed,
> > .kick_prepare = virtqueue_kick_prepare_packed,
> > .disable_cb = virtqueue_disable_cb_packed,
> > .enable_cb_delayed = virtqueue_enable_cb_delayed_packed,
> > .enable_cb_prepare = virtqueue_enable_cb_prepare_packed,
> > .poll = virtqueue_poll_packed,
> > .detach_unused_buf = virtqueue_detach_unused_buf_packed,
> > .more_used = more_used_packed,
> > .resize = virtqueue_resize_packed,
> > .reset = virtqueue_reset_packed,
> > };
> >
> > const struct virtqueue_ops *all_ops[VQ_TYPE_MAX] = { [SPLIT] = &split_ops,
> > [PACKED] = &packed_ops};
> >
> > unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> > {
> > struct vring_virtqueue *vq = to_vvq(_vq);
> >
> > if (vq->event_triggered)
> > vq->event_triggered = false;
> >
> > switch (vq->layout) {
> > case SPLIT:
> > return all_ops[SPLIT]->enable_cb_prepare(vq);
> > break;
> > case PACKED:
> > return all_ops[PACKED]->enable_cb_prepare(vq);
> > break;
> > default:
> > BUG();
> > break;
> > }
> >
> > return -EFAULT;
> > }
> > EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
> >
> > Compilers gives me (when RETPOLINE is enabled):
> >
> > ffffffff8193a870 <virtqueue_enable_cb_prepare>:
> > ffffffff8193a870: f3 0f 1e fa endbr64
> > ffffffff8193a874: e8 47 68 93 ff callq
> > ffffffff812710c0 <__fentry__>
> > ffffffff8193a879: 80 bf 8e 00 00 00 00 cmpb $0x0,0x8e(%rdi)
> > ffffffff8193a880: 74 07 je
> > ffffffff8193a889 <virtqueue_enable_cb_prepare+0x19>
> > ffffffff8193a882: c6 87 8e 00 00 00 00 movb $0x0,0x8e(%rdi)
> > ffffffff8193a889: 8b 87 80 00 00 00 mov 0x80(%rdi),%eax
> > ffffffff8193a88f: 85 c0 test %eax,%eax
> > ffffffff8193a891: 74 15 je
> > ffffffff8193a8a8 <virtqueue_enable_cb_prepare+0x38>
> > ffffffff8193a893: 83 f8 01 cmp $0x1,%eax
> > ffffffff8193a896: 75 20 jne
> > ffffffff8193a8b8 <virtqueue_enable_cb_prepare+0x48>
> > ffffffff8193a898: 48 8b 05 49 03 4a 01 mov
> > 0x14a0349(%rip),%rax # ffffffff82ddabe8 <all_ops+0x8>
> > ffffffff8193a89f: 48 8b 40 28 mov 0x28(%rax),%rax
> > ffffffff8193a8a3: e9 b8 d8 9b 00 jmpq
> > ffffffff822f8160 <__x86_indirect_thunk_array>
> > ffffffff8193a8a8: 48 8b 05 31 03 4a 01 mov
> > 0x14a0331(%rip),%rax # ffffffff82ddabe0 <all_ops>
> > ffffffff8193a8af: 48 8b 40 28 mov 0x28(%rax),%rax
> > ffffffff8193a8b3: e9 a8 d8 9b 00 jmpq
> > ffffffff822f8160 <__x86_indirect_thunk_array>
> > ffffffff8193a8b8: 0f 0b ud2
> > ffffffff8193a8ba: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
> >
> > indirection call is still being mitigated via thunk.
>
>
>
> This is because you put the pointers there, and the pointers
> are not const themselves.
>
>
> const struct virtqueue_ops * const all_ops[VQ_TYPE_MAX]
>
> should do the trick.
>
>
>
> Here is the example I wrote:
>
> enum vqtype {
> SPLIT,
> SPLIT_IN_ORDER,
> PACKED,
> PACKED_IN_ORDER,
> MAX
> };
>
>
> struct ops {
> int (*foo)(int bar);
> };
>
> extern int packed(int);
> extern int packedinorder(int);
> extern int split(int);
> extern int splitinorder(int);
>
> const struct ops splitops = { split };
> const struct ops packedops = { packed };
> const struct ops packedinorderops = { packedinorder };
> const struct ops splitinorderops = { splitinorder };
>
> const struct ops *const allopsp[MAX] = { [SPLIT] = &splitops, [SPLIT_IN_ORDER] = &splitinorderops, [PACKED] = &packedops, [PACKED_IN_ORDER] = &packedinorderops};
> const struct ops allops[MAX] = { [SPLIT] = {split}, [SPLIT_IN_ORDER] = { splitinorder}, [PACKED] = { packed }, [PACKED_IN_ORDER] = {packedinorder}};
>
> int foo(int argc)
> {
> switch (argc) {
> case 0:
> return allops[PACKED].foo(argc);
> case 1:
> return allops[SPLIT].foo(argc);
> case 2:
> return allops[PACKED_IN_ORDER].foo(argc);
> case 3:
> return allops[SPLIT_IN_ORDER].foo(argc);
> default:
> return 0;
> }
> }
> extern int foo(int argc);
> int bar(int argc)
> {
> switch (argc) {
> case 0:
> return allopsp[PACKED]->foo(argc);
> case 1:
> return allopsp[SPLIT]->foo(argc);
> case 2:
> return allopsp[PACKED_IN_ORDER]->foo(argc);
> case 3:
> return allopsp[SPLIT_IN_ORDER]->foo(argc);
> default:
> return 0;
> }
> }
> extern int bar(int argc);
> int main(int argc, char **argv)
> {
> foo(argc);
> bar(argc);
> return 0;
> }
>
>
>
> then:
> $ gcc -c -O2 -ggdb main.c
> $ objdump -r -ld main.o
>
> main.o: file format elf64-x86-64
> mst@...k:~$ objdump -r -ld main.o
>
> main.o: file format elf64-x86-64
>
>
> Disassembly of section .text:
>
> 0000000000000000 <foo>:
> foo():
> /home/mst/main.c:29
> 0: 83 ff 02 cmp $0x2,%edi
> 3: 74 2a je 2f <foo+0x2f>
> 5: 7f 12 jg 19 <foo+0x19>
> 7: 85 ff test %edi,%edi
> 9: 74 1d je 28 <foo+0x28>
> b: ff cf dec %edi
> d: 75 2a jne 39 <foo+0x39>
> /home/mst/main.c:33
> f: bf 01 00 00 00 mov $0x1,%edi
> 14: e9 00 00 00 00 jmp 19 <foo+0x19>
> 15: R_X86_64_PLT32 split-0x4
> /home/mst/main.c:29
> 19: 83 ff 03 cmp $0x3,%edi
> 1c: 75 1b jne 39 <foo+0x39>
> /home/mst/main.c:37
> 1e: bf 03 00 00 00 mov $0x3,%edi
> 23: e9 00 00 00 00 jmp 28 <foo+0x28>
> 24: R_X86_64_PLT32 splitinorder-0x4
> /home/mst/main.c:31
> 28: 31 ff xor %edi,%edi
> 2a: e9 00 00 00 00 jmp 2f <foo+0x2f>
> 2b: R_X86_64_PLT32 packed-0x4
> /home/mst/main.c:35
> 2f: bf 02 00 00 00 mov $0x2,%edi
> 34: e9 00 00 00 00 jmp 39 <foo+0x39>
> 35: R_X86_64_PLT32 packedinorder-0x4
> /home/mst/main.c:41
> 39: 31 c0 xor %eax,%eax
> 3b: c3 ret
>
> 000000000000003c <bar>:
> bar():
> /home/mst/main.c:43
> 3c: eb c2 jmp 0 <foo>
>
> Disassembly of section .text.startup:
>
> 0000000000000000 <main>:
> main():
> /home/mst/main.c:60
> 0: 48 83 ec 18 sub $0x18,%rsp
> /home/mst/main.c:61
> 4: 89 7c 24 0c mov %edi,0xc(%rsp)
> 8: e8 00 00 00 00 call d <main+0xd>
> 9: R_X86_64_PLT32 foo-0x4
> /home/mst/main.c:62
> d: 8b 7c 24 0c mov 0xc(%rsp),%edi
> 11: e8 00 00 00 00 call 16 <main+0x16>
> 12: R_X86_64_PLT32 foo-0x4
> /home/mst/main.c:64
> 16: 31 c0 xor %eax,%eax
> 18: 48 83 c4 18 add $0x18,%rsp
>
>
> compiler was able to figure out they are the same.
>
>
Exactly, it works.
Let me redo the benchmark and post a new version.
Thanks
Powered by blists - more mailing lists