[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <m5q3fqqvur4pcvkcxx36ivoqu77tsrjd4xna6zszmzq34dbqq5@6wfrhllk6tsq>
Date: Tue, 27 Jun 2023 09:50:50 +0200
From: Stefano Garzarella <sgarzare@...hat.com>
To: Arseniy Krasnov <avkrasnov@...rdevices.ru>
Cc: Stefan Hajnoczi <stefanha@...hat.com>,
"David S. Miller" <davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>,
Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
"Michael S. Tsirkin" <mst@...hat.com>, Jason Wang <jasowang@...hat.com>,
Bobby Eshleman <bobby.eshleman@...edance.com>, kvm@...r.kernel.org, virtualization@...ts.linux-foundation.org,
netdev@...r.kernel.org, linux-kernel@...r.kernel.org, kernel@...rdevices.ru,
oxffffaa@...il.com
Subject: Re: [RFC PATCH v4 05/17] vsock/virtio: MSG_ZEROCOPY flag support
On Tue, Jun 27, 2023 at 07:41:51AM +0300, Arseniy Krasnov wrote:
>
>
>On 26.06.2023 19:03, Stefano Garzarella wrote:
>> On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:
>>> This adds handling of MSG_ZEROCOPY flag on transmission path: if this
>>> flag is set and zerocopy transmission is possible, then non-linear skb
>>> will be created and filled with the pages of user's buffer. Pages of
>>> user's buffer are locked in memory by 'get_user_pages()'.
>>>
>>> Signed-off-by: Arseniy Krasnov <AVKrasnov@...rdevices.ru>
>>> ---
>>> net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
>>> 1 file changed, 208 insertions(+), 62 deletions(-)
>>>
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index 0de562c1dc4b..f1ec38c72db7 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
>>> return container_of(t, struct virtio_transport, transport);
>>> }
>>>
>>> -/* Returns a new packet on success, otherwise returns NULL.
>>> - *
>>> - * If NULL is returned, errp is set to a negative errno.
>>> - */
>>> -static struct sk_buff *
>>> -virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>> - size_t len,
>>> - u32 src_cid,
>>> - u32 src_port,
>>> - u32 dst_cid,
>>> - u32 dst_port)
>>> -{
>>> - const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
>>> - struct virtio_vsock_hdr *hdr;
>>> - struct sk_buff *skb;
>>> +static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
>>> + size_t max_to_send)
>>> +{
>>> + struct iov_iter *iov_iter;
>>> +
>>> + if (!info->msg)
>>> + return false;
>>> +
>>> + iov_iter = &info->msg->msg_iter;
>>> +
>>> + /* Data is simple buffer. */
>>> + if (iter_is_ubuf(iov_iter))
>>> + return true;
>>> +
>>> + if (!iter_is_iovec(iov_iter))
>>> + return false;
>>> +
>>> + if (iov_iter->iov_offset)
>>> + return false;
>>> +
>>> + /* We can't send whole iov. */
>>> + if (iov_iter->count > max_to_send)
>>> + return false;
>>> +
>>> + return true;
>>> +}
>>> +
>>> +static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
>>> + struct sk_buff *skb,
>>> + struct msghdr *msg,
>>> + bool zerocopy)
>>> +{
>>> + struct ubuf_info *uarg;
>>> +
>>> + if (msg->msg_ubuf) {
>>> + uarg = msg->msg_ubuf;
>>> + net_zcopy_get(uarg);
>>> + } else {
>>> + struct iov_iter *iter = &msg->msg_iter;
>>> + struct ubuf_info_msgzc *uarg_zc;
>>> + int len;
>>> +
>>> + /* Only ITER_IOVEC or ITER_UBUF are allowed and
>>> + * checked before.
>>> + */
>>> + if (iter_is_iovec(iter))
>>> + len = iov_length(iter->__iov, iter->nr_segs);
>>> + else
>>> + len = iter->count;
>>> +
>>> + uarg = msg_zerocopy_realloc(sk_vsock(vsk),
>>> + len,
>>> + NULL);
>>> +
>>> + if (!uarg)
>>> + return -1;
>>> +
>>> + uarg_zc = uarg_to_msgzc(uarg);
>>> + uarg_zc->zerocopy = zerocopy ? 1 : 0;
>>> + }
>>> +
>>> + skb_zcopy_init(skb, uarg);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
>>> + struct vsock_sock *vsk,
>>
>> `vsk` seems unused
>>
>>> + struct virtio_vsock_pkt_info *info,
>>> + size_t len)
>>> +{
>>> void *payload;
>>> int err;
>>>
>>> - skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>> - if (!skb)
>>> - return NULL;
>>> + payload = skb_put(skb, len);
>>> + err = memcpy_from_msg(payload, info->msg, len);
>>> + if (err)
>>> + return -1;
>>> +
>>> + if (msg_data_left(info->msg))
>>> + return 0;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void virtio_transport_init_hdr(struct sk_buff *skb,
>>> + struct virtio_vsock_pkt_info *info,
>>> + u32 src_cid,
>>> + u32 src_port,
>>> + u32 dst_cid,
>>> + u32 dst_port,
>>> + size_t len)
>>> +{
>>> + struct virtio_vsock_hdr *hdr;
>>>
>>> hdr = virtio_vsock_hdr(skb);
>>> hdr->type = cpu_to_le16(info->type);
>>> @@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>> hdr->dst_port = cpu_to_le32(dst_port);
>>> hdr->flags = cpu_to_le32(info->flags);
>>> hdr->len = cpu_to_le32(len);
>>> -
>>> - if (info->msg && len > 0) {
>>> - payload = skb_put(skb, len);
>>> - err = memcpy_from_msg(payload, info->msg, len);
>>> - if (err)
>>> - goto out;
>>> -
>>> - if (msg_data_left(info->msg) == 0 &&
>>> - info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>> - hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>> -
>>> - if (info->msg->msg_flags & MSG_EOR)
>>> - hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>> - }
>>> - }
>>> -
>>> - if (info->reply)
>>> - virtio_vsock_skb_set_reply(skb);
>>> -
>>> - trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>> - dst_cid, dst_port,
>>> - len,
>>> - info->type,
>>> - info->op,
>>> - info->flags);
>>> -
>>> - if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
>>> - WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
>>> - goto out;
>>> - }
>>> -
>>> - return skb;
>>> -
>>> -out:
>>> - kfree_skb(skb);
>>> - return NULL;
>>> }
>>>
>>> static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
>>> @@ -214,6 +251,85 @@ static u16 virtio_transport_get_type(struct sock *sk)
>>> return VIRTIO_VSOCK_TYPE_SEQPACKET;
>>> }
>>>
>>> +/* Returns a new packet on success, otherwise returns NULL.
>>> + *
>>> + * If NULL is returned, errp is set to a negative errno.
>>
>> I had noticed this in Bobby's patches, I think it's an old comment we
>> left around.
>>
>>> + */
>>> +static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
>>> + struct virtio_vsock_pkt_info *info,
>>> + size_t payload_len,
>>> + bool zcopy,
>>> + u32 dst_cid,
>>> + u32 dst_port,
>>> + u32 src_cid,
>>> + u32 src_port)
>>> +{
>>> + struct sk_buff *skb;
>>> + size_t skb_len;
>>> +
>>> + skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
>>> +
>>> + if (!zcopy)
>>> + skb_len += payload_len;
>>> +
>>> + skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>> + if (!skb)
>>> + return NULL;
>>> +
>>> + virtio_transport_init_hdr(skb, info, src_cid, src_port,
>>> + dst_cid, dst_port,
>>> + payload_len);
>>> +
>>> + /* Set owner here, because '__zerocopy_sg_from_iter()' uses
>>> + * owner of skb without check to update 'sk_wmem_alloc'.
>>> + */
>>> + if (vsk)
>>> + skb_set_owner_w(skb, sk_vsock(vsk));
>>
>> why we are moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
>>
>> We should mention this in the commit description.
>>
>>> +
>>> + if (info->msg && payload_len > 0) {
>>> + int err;
>>> +
>>> + if (zcopy) {
>>> + err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
>>> + &info->msg->msg_iter,
>>> + payload_len);
>>> + } else {
>>> + err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
>>> + }
>>> +
>>> + if (err)
>>> + goto out;
>>> +
>>> + VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
>>> +
>>> + if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>> + struct virtio_vsock_hdr *hdr;
>>> +
>>> + hdr = virtio_vsock_hdr(skb);
>>
>> Just `struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);` should be
>> fine.
>>
>>> +
>>> + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>> +
>>> + if (info->msg->msg_flags & MSG_EOR)
>>> + hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>> + }
>>> + }
>>> +
>>> + if (info->reply)
>>> + virtio_vsock_skb_set_reply(skb);
>>> +
>>> + trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>> + dst_cid, dst_port,
>>> + payload_len,
>>> + info->type,
>>> + info->op,
>>> + info->flags);
>>> +
>>> + return skb;
>>> +out:
>>> + kfree_skb(skb);
>>> + return NULL;
>>> +}
>>> +
>>> /* This function can only be used on connecting/connected sockets,
>>> * since a socket assigned to a transport is required.
>>> *
>>> @@ -226,6 +342,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>> const struct virtio_transport *t_ops;
>>> struct virtio_vsock_sock *vvs;
>>> u32 pkt_len = info->pkt_len;
>>> + bool can_zcopy = false;
>>> + u32 max_skb_cap;
>>> u32 rest_len;
>>> int ret;
>>>
>>> @@ -254,22 +372,49 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>> if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
>>> return pkt_len;
>>>
>>> + /* If zerocopy is not enabled by 'setsockopt()', we behave as
>>> + * there is no MSG_ZEROCOPY flag set.
>>> + */
>>> + if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
>>> + info->flags &= ~MSG_ZEROCOPY;
>>> +
>>> + if (info->flags & MSG_ZEROCOPY)
>>> + can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
>>> +
>>> + if (can_zcopy)
>>> + max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>>> + (MAX_SKB_FRAGS * PAGE_SIZE));
>>> + else
>>> + max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>>> +
>>
>> We use `len` very often, what about `max_skb_len`?
>>
>>> rest_len = pkt_len;
>>>
>>> do {
>>> struct sk_buff *skb;
>>> size_t skb_len;
>>>
>>> - skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
>>> + skb_len = min(max_skb_cap, rest_len);
>>>
>>> - skb = virtio_transport_alloc_skb(info, skb_len,
>>> - src_cid, src_port,
>>> - dst_cid, dst_port);
>>> + skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
>>> + dst_cid, dst_port,
>>> + src_cid, src_port);
>>> if (!skb) {
>>> ret = -ENOMEM;
>>> break;
>>> }
>>>
>>> + /* This is last skb to send this portion of data. */
>>> + if (skb_len == rest_len &&
>>> + info->flags & MSG_ZEROCOPY &&
>>> + info->op == VIRTIO_VSOCK_OP_RW) {
>>> + if (virtio_transport_init_zcopy_skb(vsk, skb,
>>> + info->msg,
>>> + can_zcopy)) {
>>> + ret = -ENOMEM;
>>> + break;
>>> + }
>>> + }
>>> +
>>> virtio_transport_inc_tx_pkt(vvs, skb);
>>>
>>> ret = t_ops->send_pkt(skb);
>>> @@ -884,6 +1029,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
>>> .msg = msg,
>>> .pkt_len = len,
>>> .vsk = vsk,
>>> + .flags = msg->msg_flags,
>>
>> These flags then get copied into the virtio_vsock_hdr, which I don't
>> think is a good idea.
>>
>> Why not using directly info->msg->msg_flags?
>
>Ops, yes, it's a bug, You're right, this is really wrong as there are two different
>sets of flags - MSG_XXX passed to syscall and flags in the header of packet.
Yep.
What about the move from skb_set_owner_sk_safe() to skb_set_owner_w()?
Was it intentional? If so, can you explain why?
Thanks,
Stefano
Powered by blists - more mailing lists