[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <b00b3064-4a08-b844-3d09-7124f5ab0143@antgroup.com>
Date: Wed, 15 Mar 2023 20:20:32 +0800
From: "沈安琪(凛玥)" <amy.saq@...group.com>
To: Willem de Bruijn <willemdebruijn.kernel@...il.com>,
netdev@...r.kernel.org, "Michael S. Tsirkin" <mst@...hat.com>
Cc: <davem@...emloft.net>, <jasowang@...hat.com>,
"谈鉴锋" <henry.tjf@...group.com>
Subject: Re: [PATCH v4] net/packet: support mergeable feature of virtio
在 2023/3/13 下午10:05, Willem de Bruijn 写道:
> 沈安琪(凛玥) wrote:
>> From: Jianfeng Tan <henry.tjf@...group.com>
>>
>> Packet sockets, like tap, can be used as the backend for kernel vhost.
>> In packet sockets, virtio net header size is currently hardcoded to be
>> the size of struct virtio_net_hdr, which is 10 bytes; however, it is not
>> always the case: some virtio features, such as mrg_rxbuf, need virtio
>> net headers to be 12-byte long.
>>
>> Mergeable buffers, as a virtio feature, is worthy of supporting: packets
>> that are larger than one-mbuf size will be dropped in vhost worker's
>> handle_rx if mrg_rxbuf feature is not used, but large packets
>> cannot be avoided and increasing mbuf's size is not economical.
>>
>> With this virtio feature enabled by virtio-user, packet sockets with
>> hardcoded 10-byte virtio net header will parse mac head incorrectly in
>> packet_snd by taking the last two bytes of virtio net header as part of
>> mac header.
>> This incorrect mac header parsing will cause packet to be dropped due to
>> invalid ether head checking in later under-layer device packet receiving.
>>
>> By adding extra field vnet_hdr_sz with utilizing holes in struct
>> packet_sock to record currently used virtio net header size and supporting
>> extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet
>> sockets can know the exact length of virtio net header that virtio user
>> gives.
>> In packet_snd, tpacket_snd and packet_recvmsg, instead of using
>> hardcoded virtio net header size, it can get the exact vnet_hdr_sz from
>> corresponding packet_sock, and parse mac header correctly based on this
>> information to avoid the packets being mistakenly dropped.
>>
>> Signed-off-by: Jianfeng Tan <henry.tjf@...group.com>
>> Co-developed-by: Anqi Shen <amy.saq@...group.com>
>> Signed-off-by: Anqi Shen <amy.saq@...group.com>
>> ---
>>
>> V3 -> V4:
>> * read po->vnet_hdr_sz once during vnet_hdr_sz and use vnet_hdr_sz locally
>> to avoid race condition;
>> * modify how to check non-zero po->vnet_hdr_sz;
>> * separate vnet_hdr_sz as a u8 field in struct packet_sock instead of 8-bit
>> in an int field.
>>
>> include/uapi/linux/if_packet.h | 1 +
>> net/packet/af_packet.c | 87 +++++++++++++++++++++++++++---------------
>> net/packet/diag.c | 2 +-
>> net/packet/internal.h | 2 +-
>> 4 files changed, 59 insertions(+), 33 deletions(-)
>>
>> diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
>> index 78c981d..9efc423 100644
>> --- a/include/uapi/linux/if_packet.h
>> +++ b/include/uapi/linux/if_packet.h
>> @@ -59,6 +59,7 @@ struct sockaddr_ll {
>> #define PACKET_ROLLOVER_STATS 21
>> #define PACKET_FANOUT_DATA 22
>> #define PACKET_IGNORE_OUTGOING 23
>> +#define PACKET_VNET_HDR_SZ 24
>>
>> #define PACKET_FANOUT_HASH 0
>> #define PACKET_FANOUT_LB 1
>> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
>> index 8ffb19c..06b9893 100644
>> --- a/net/packet/af_packet.c
>> +++ b/net/packet/af_packet.c
>> @@ -2092,18 +2092,18 @@ static unsigned int run_filter(struct sk_buff *skb,
>> }
>>
>> static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
>> - size_t *len)
>> + size_t *len, int vnet_hdr_sz)
>> {
>> - struct virtio_net_hdr vnet_hdr;
>> + struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
>>
>> - if (*len < sizeof(vnet_hdr))
>> + if (*len < vnet_hdr_sz)
>> return -EINVAL;
>> - *len -= sizeof(vnet_hdr);
>> + *len -= vnet_hdr_sz;
>>
>> - if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
>> + if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
>> return -EINVAL;
>>
>> - return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
>> + return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
>> }
>>
>> /*
>> @@ -2253,6 +2253,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>> bool is_drop_n_account = false;
>> unsigned int slot_id = 0;
>> bool do_vnet = false;
>> + int vnet_hdr_sz;
>>
>> /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
>> * We may add members to them until current aligned size without forcing
>> @@ -2310,8 +2311,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>> netoff = TPACKET_ALIGN(po->tp_hdrlen +
>> (maclen < 16 ? 16 : maclen)) +
>> po->tp_reserve;
>> - if (po->has_vnet_hdr) {
>> - netoff += sizeof(struct virtio_net_hdr);
>> + vnet_hdr_sz = po->vnet_hdr_sz;
>> + if (vnet_hdr_sz) {
>> + netoff += vnet_hdr_sz;
>> do_vnet = true;
>> }
>> macoff = netoff - maclen;
>> @@ -2552,16 +2554,27 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
>> }
>>
>> static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
>> - struct virtio_net_hdr *vnet_hdr)
>> + struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
>> {
>> - if (*len < sizeof(*vnet_hdr))
>> + int ret;
>> +
>> + if (*len < vnet_hdr_sz)
>> return -EINVAL;
>> - *len -= sizeof(*vnet_hdr);
>> + *len -= vnet_hdr_sz;
>>
>> if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
>> return -EFAULT;
>>
>> - return __packet_snd_vnet_parse(vnet_hdr, *len);
>> + ret = __packet_snd_vnet_parse(vnet_hdr, *len);
>> +
> nit: please drop the whitespace line
>
>> + if (ret)
>> + return ret;
>> +
>> + /* move iter to point to the start of mac header */
>> + if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
>> + iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
>> +
>> + return 0;
>> }
>>
>> static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>> @@ -2730,6 +2743,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>> int status = TP_STATUS_AVAILABLE;
>> int hlen, tlen, copylen = 0;
>> long timeo = 0;
>> + int vnet_hdr_sz = po->vnet_hdr_sz;
> nit: order variables longest to shortest ("reverse christmas tree")
>>
>> mutex_lock(&po->pg_vec_lock);
>>
>> @@ -2780,7 +2794,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>> size_max = po->tx_ring.frame_size
>> - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
>>
>> - if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
>> + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
>> size_max = dev->mtu + reserve + VLAN_HLEN;
>>
>> reinit_completion(&po->skb_completion);
>> @@ -2809,10 +2823,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>> status = TP_STATUS_SEND_REQUEST;
>> hlen = LL_RESERVED_SPACE(dev);
>> tlen = dev->needed_tailroom;
>> - if (po->has_vnet_hdr) {
>> + if (vnet_hdr_sz) {
>> vnet_hdr = data;
>> - data += sizeof(*vnet_hdr);
>> - tp_len -= sizeof(*vnet_hdr);
>> + data += vnet_hdr_sz;
>> + tp_len -= vnet_hdr_sz;
>> if (tp_len < 0 ||
>> __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
>> tp_len = -EINVAL;
>> @@ -2837,7 +2851,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>> addr, hlen, copylen, &sockc);
>> if (likely(tp_len >= 0) &&
>> tp_len > dev->mtu + reserve &&
>> - !po->has_vnet_hdr &&
>> + !vnet_hdr_sz &&
>> !packet_extra_vlan_len_allowed(dev, skb))
>> tp_len = -EMSGSIZE;
>>
>> @@ -2856,7 +2870,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>> }
>> }
>>
>> - if (po->has_vnet_hdr) {
>> + if (vnet_hdr_sz) {
>> if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
>> tp_len = -EINVAL;
>> goto tpacket_error;
>> @@ -2946,7 +2960,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
>> struct virtio_net_hdr vnet_hdr = { 0 };
>> int offset = 0;
>> struct packet_sock *po = pkt_sk(sk);
>> - bool has_vnet_hdr = false;
>> + int vnet_hdr_sz = po->vnet_hdr_sz;
>> int hlen, tlen, linear;
>> int extra_len = 0;
>>
>> @@ -2990,11 +3004,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
>>
>> if (sock->type == SOCK_RAW)
>> reserve = dev->hard_header_len;
>> - if (po->has_vnet_hdr) {
>> - err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
>> +
> nit: drop extra whitespace line
>
>> + if (vnet_hdr_sz) {
>> + err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
>> if (err)
>> goto out_unlock;
>> - has_vnet_hdr = true;
>> }
>>
>> if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
>> @@ -3064,11 +3078,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
>>
>> packet_parse_headers(skb, sock);
>>
>> - if (has_vnet_hdr) {
>> + if (vnet_hdr_sz) {
>> err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
>> if (err)
>> goto out_free;
>> - len += sizeof(vnet_hdr);
>> + len += vnet_hdr_sz;
>> virtio_net_hdr_set_proto(skb, &vnet_hdr);
>> }
>>
>> @@ -3410,7 +3424,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>> struct sock *sk = sock->sk;
>> struct sk_buff *skb;
>> int copied, err;
>> - int vnet_hdr_len = 0;
>> + int vnet_hdr_len = pkt_sk(sk)->vnet_hdr_sz;
>> unsigned int origlen = 0;
>>
>> err = -EINVAL;
>> @@ -3451,11 +3465,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
>>
>> packet_rcv_try_clear_pressure(pkt_sk(sk));
>>
>> - if (pkt_sk(sk)->has_vnet_hdr) {
>> - err = packet_rcv_vnet(msg, skb, &len);
>> + if (vnet_hdr_len) {
>> + err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
>> if (err)
>> goto out_free;
>> - vnet_hdr_len = sizeof(struct virtio_net_hdr);
>> }
>>
>> /* You lose any data beyond the buffer you gave. If it worries
>> @@ -3921,8 +3934,9 @@ static void packet_flush_mclist(struct sock *sk)
>> return 0;
>> }
>> case PACKET_VNET_HDR:
>> + case PACKET_VNET_HDR_SZ:
>> {
>> - int val;
>> + int val, hdr_len;
>>
>> if (sock->type != SOCK_RAW)
>> return -EINVAL;
>> @@ -3931,11 +3945,19 @@ static void packet_flush_mclist(struct sock *sk)
>> if (copy_from_sockptr(&val, optval, sizeof(val)))
>> return -EFAULT;
>>
>> + hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
>> + if (optname == PACKET_VNET_HDR_SZ) {
>> + if (val && val != sizeof(struct virtio_net_hdr) &&
>> + val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
>> + return -EINVAL;
>> + hdr_len = val;
>> + }
>> +
>> lock_sock(sk);
>> if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
>> ret = -EBUSY;
>> } else {
>> - po->has_vnet_hdr = !!val;
>> + po->vnet_hdr_sz = hdr_len;
>> ret = 0;
>> }
>> release_sock(sk);
>> @@ -4068,7 +4090,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
>> val = po->origdev;
>> break;
>> case PACKET_VNET_HDR:
>> - val = po->has_vnet_hdr;
>> + val = !!po->vnet_hdr_sz;
>> + break;
>> + case PACKET_VNET_HDR_SZ:
>> + val = po->vnet_hdr_sz;
>> break;
>> case PACKET_VERSION:
>> val = po->tp_version;
>> diff --git a/net/packet/diag.c b/net/packet/diag.c
>> index 07812ae..dfec603 100644
>> --- a/net/packet/diag.c
>> +++ b/net/packet/diag.c
>> @@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
>> pinfo.pdi_flags |= PDI_AUXDATA;
>> if (po->origdev)
>> pinfo.pdi_flags |= PDI_ORIGDEV;
>> - if (po->has_vnet_hdr)
>> + if (po->vnet_hdr_sz)
>> pinfo.pdi_flags |= PDI_VNETHDR;
>> if (po->tp_loss)
>> pinfo.pdi_flags |= PDI_LOSS;
>> diff --git a/net/packet/internal.h b/net/packet/internal.h
>> index 48af35b..154c6bb 100644
>> --- a/net/packet/internal.h
>> +++ b/net/packet/internal.h
>> @@ -119,9 +119,9 @@ struct packet_sock {
>> unsigned int running; /* bind_lock must be held */
>> unsigned int auxdata:1, /* writer must hold sock lock */
>> origdev:1,
>> - has_vnet_hdr:1,
>> tp_loss:1,
>> tp_tx_has_off:1;
>> + u8 vnet_hdr_sz;
>> int pressure;
>> int ifindex; /* bound device */
>> __be16 num;
>> --
>> 1.8.3.1
>>
Thanks for the comments! We will address them in the next version of patch.
Besides 1) adding READ_ONCE macro to local vnet_hdr_sz; 2) fixing some
coding format; 3) nice to have some performance test, is there any other
suggestion on this version? Again, thanks for all the advice : )
Powered by blists - more mailing lists