[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <7e124a5b-0e95-a3a5-4d19-4b356b7b4942@linux.alibaba.com>
Date: Mon, 29 Oct 2018 12:19:11 +0800
From: Jianfeng Tan <jianfeng.tan@...ux.alibaba.com>
To: Jason Wang <jasowang@...hat.com>, netdev@...r.kernel.org
Cc: davem@...emloft.net, mst@...hat.com
Subject: Re: [PATCH] net/packet: support vhost mrg_rxbuf
On 10/29/2018 10:54 AM, Jason Wang wrote:
>
> On 2018/10/27 下午8:04, Jianfeng Tan wrote:
>> Previously, the virtio net header size was hardcoded to 10, which made
>> the mrg_rxbuf feature unavailable.
>>
>> We redefine the PACKET_VNET_HDR socket option: it used to treat user input
>> as a boolean, but now treats it as an int. 0, 10 and 12 are taken as-is;
>> every other value is treated as 10.
>>
>> Exactly one case is treated differently: if the user input is 12, the
>> header size was previously 10, but it is now 12.
>>
>> Signed-off-by: Jianfeng Tan <jianfeng.tan@...ux.alibaba.com>
>
>
> This should go to net-next, which is currently closed. You may want to
> re-submit once it is open again.
Thank you for the reminder. We'll re-evaluate the necessity of this patch.
>
>
>> ---
>> net/packet/af_packet.c | 97 ++++++++++++++++++++++++++----------------
>> net/packet/diag.c | 2 +-
>> net/packet/internal.h | 2 +-
>> 3 files changed, 63 insertions(+), 38 deletions(-)
>>
>> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
>> index ec3095f13aae..1bd7f4cdcc80 100644
>> --- a/net/packet/af_packet.c
>> +++ b/net/packet/af_packet.c
>> @@ -1999,18 +1999,24 @@ static unsigned int run_filter(struct sk_buff
>> *skb,
>> }
>> static int packet_rcv_vnet(struct msghdr *msg, const struct
>> sk_buff *skb,
>> - size_t *len)
>> + size_t *len, int vnet_hdr_len)
>> {
>> + int res;
>> struct virtio_net_hdr vnet_hdr;
>> - if (*len < sizeof(vnet_hdr))
>> + if (*len < vnet_hdr_len)
>> return -EINVAL;
>> - *len -= sizeof(vnet_hdr);
>> + *len -= vnet_hdr_len;
>> if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
>> return -EINVAL;
>> - return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
>> + res = memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
>> + if (res == 0)
>> + iov_iter_advance(&msg->msg_iter,
>> + vnet_hdr_len - sizeof(vnet_hdr));
>> +
>> + return res;
>> }
>> /*
>> @@ -2206,11 +2212,13 @@ static int tpacket_rcv(struct sk_buff *skb,
>> struct net_device *dev,
>> po->tp_reserve;
>> } else {
>> unsigned int maclen = skb_network_offset(skb);
>> + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
>> +
>> netoff = TPACKET_ALIGN(po->tp_hdrlen +
>> (maclen < 16 ? 16 : maclen)) +
>> po->tp_reserve;
>> - if (po->has_vnet_hdr) {
>> - netoff += sizeof(struct virtio_net_hdr);
>> + if (vnet_hdr_sz) {
>> + netoff += vnet_hdr_sz;
>> do_vnet = true;
>> }
>> macoff = netoff - maclen;
>> @@ -2429,19 +2437,6 @@ static int __packet_snd_vnet_parse(struct
>> virtio_net_hdr *vnet_hdr, size_t len)
>> return 0;
>> }
>> -static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
>> - struct virtio_net_hdr *vnet_hdr)
>> -{
>> - if (*len < sizeof(*vnet_hdr))
>> - return -EINVAL;
>> - *len -= sizeof(*vnet_hdr);
>> -
>> - if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr),
>> &msg->msg_iter))
>> - return -EFAULT;
>> -
>> - return __packet_snd_vnet_parse(vnet_hdr, *len);
>> -}
>> -
>> static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff
>> *skb,
>> void *frame, struct net_device *dev, void *data, int tp_len,
>> __be16 proto, unsigned char *addr, int hlen, int copylen,
>> @@ -2609,6 +2604,7 @@ static int tpacket_snd(struct packet_sock *po,
>> struct msghdr *msg)
>> int len_sum = 0;
>> int status = TP_STATUS_AVAILABLE;
>> int hlen, tlen, copylen = 0;
>> + int vnet_hdr_sz;
>> mutex_lock(&po->pg_vec_lock);
>> @@ -2648,7 +2644,8 @@ static int tpacket_snd(struct packet_sock
>> *po, struct msghdr *msg)
>> size_max = po->tx_ring.frame_size
>> - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
>> - if ((size_max > dev->mtu + reserve + VLAN_HLEN) &&
>> !po->has_vnet_hdr)
>> + vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
>> + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
>> size_max = dev->mtu + reserve + VLAN_HLEN;
>> do {
>> @@ -2668,10 +2665,10 @@ static int tpacket_snd(struct packet_sock
>> *po, struct msghdr *msg)
>> status = TP_STATUS_SEND_REQUEST;
>> hlen = LL_RESERVED_SPACE(dev);
>> tlen = dev->needed_tailroom;
>> - if (po->has_vnet_hdr) {
>> + if (vnet_hdr_sz) {
>> vnet_hdr = data;
>> - data += sizeof(*vnet_hdr);
>> - tp_len -= sizeof(*vnet_hdr);
>> + data += vnet_hdr_sz;
>> + tp_len -= vnet_hdr_sz;
>> if (tp_len < 0 ||
>> __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
>> tp_len = -EINVAL;
>> @@ -2696,7 +2693,7 @@ static int tpacket_snd(struct packet_sock *po,
>> struct msghdr *msg)
>> addr, hlen, copylen, &sockc);
>> if (likely(tp_len >= 0) &&
>> tp_len > dev->mtu + reserve &&
>> - !po->has_vnet_hdr &&
>> + !vnet_hdr_sz &&
>> !packet_extra_vlan_len_allowed(dev, skb))
>> tp_len = -EMSGSIZE;
>> @@ -2715,7 +2712,7 @@ static int tpacket_snd(struct packet_sock
>> *po, struct msghdr *msg)
>> }
>> }
>> - if (po->has_vnet_hdr) {
>> + if (vnet_hdr_sz) {
>> if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
>> tp_len = -EINVAL;
>> goto tpacket_error;
>> @@ -2802,9 +2799,9 @@ static int packet_snd(struct socket *sock,
>> struct msghdr *msg, size_t len)
>> int err, reserve = 0;
>> struct sockcm_cookie sockc;
>> struct virtio_net_hdr vnet_hdr = { 0 };
>> + int vnet_hdr_sz;
>> int offset = 0;
>> struct packet_sock *po = pkt_sk(sk);
>> - bool has_vnet_hdr = false;
>> int hlen, tlen, linear;
>> int extra_len = 0;
>> @@ -2844,11 +2841,29 @@ static int packet_snd(struct socket *sock,
>> struct msghdr *msg, size_t len)
>> if (sock->type == SOCK_RAW)
>> reserve = dev->hard_header_len;
>> - if (po->has_vnet_hdr) {
>> - err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
>> - if (err)
>> +
>> + vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
>> + if (vnet_hdr_sz) {
>> + if (len < vnet_hdr_sz) {
>> + err = -EINVAL;
>> goto out_unlock;
>> - has_vnet_hdr = true;
>> + }
>> + len -= vnet_hdr_sz;
>> +
>> + if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr),
>> + &msg->msg_iter)) {
>> + err = -EFAULT;
>> + goto out_unlock;
>> + }
>> +
>> + if (__packet_snd_vnet_parse(&vnet_hdr, len)) {
>> + err = -EINVAL;
>> + goto out_unlock;
>> + }
>
>
> Any reason to open code packet_snd_vnet_parse() here?
No particular reason. I will try to add a parameter and keep the vnet-related
code inside that function if the patch is resubmitted.
>
>
>> +
>> + /* TODO: check hdr_len with len? */
>> +
>> + iov_iter_advance(&msg->msg_iter, vnet_hdr_sz -
>> sizeof(vnet_hdr));
>> }
>> if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
>> @@ -2912,7 +2927,7 @@ static int packet_snd(struct socket *sock,
>> struct msghdr *msg, size_t len)
>> skb->mark = sockc.mark;
>> skb->tstamp = sockc.transmit_time;
>> - if (has_vnet_hdr) {
>> + if (vnet_hdr_sz) {
>> err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
>> if (err)
>> goto out_free;
>> @@ -3307,11 +3322,11 @@ static int packet_recvmsg(struct socket
>> *sock, struct msghdr *msg, size_t len,
>> if (pkt_sk(sk)->pressure)
>> packet_rcv_has_room(pkt_sk(sk), NULL);
>> - if (pkt_sk(sk)->has_vnet_hdr) {
>> - err = packet_rcv_vnet(msg, skb, &len);
>> + vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
>> + if (vnet_hdr_len) {
>> + err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
>> if (err)
>> goto out_free;
>> - vnet_hdr_len = sizeof(struct virtio_net_hdr);
>> }
>> /* You lose any data beyond the buffer you gave. If it worries
>> @@ -3772,7 +3787,17 @@ packet_setsockopt(struct socket *sock, int
>> level, int optname, char __user *optv
>> if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
>> ret = -EBUSY;
>> } else {
>> - po->has_vnet_hdr = !!val;
>> + /* Previously we treated user input as boolean (!!val);
>> + * now we treat it as int. After the correction below,
>> + * the only case that behaves differently is 12, which
>> + * results in a vnet header size of 12 instead of 10.
>> + */
>> + if (val &&
>> + val != sizeof(struct virtio_net_hdr) &&
>> + val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
>> + val = sizeof(struct virtio_net_hdr);
>> +
>> + po->vnet_hdr_sz = val;
>> ret = 0;
>> }
>> release_sock(sk);
>> @@ -3903,7 +3928,7 @@ static int packet_getsockopt(struct socket
>> *sock, int level, int optname,
>> val = po->origdev;
>> break;
>> case PACKET_VNET_HDR:
>> - val = po->has_vnet_hdr;
>> + val = po->vnet_hdr_sz;
>
>
> So the change here is noticeable by userspace. Maybe we need a new opt
> for this?
Nice catch, users may assume that only 0 or 1 is returned.
Thanks,
Jianfeng
>
> Thanks
>
>
>> break;
>> case PACKET_VERSION:
>> val = po->tp_version;
>> diff --git a/net/packet/diag.c b/net/packet/diag.c
>> index 7ef1c881ae74..950015b6704f 100644
>> --- a/net/packet/diag.c
>> +++ b/net/packet/diag.c
>> @@ -26,7 +26,7 @@ static int pdiag_put_info(const struct packet_sock
>> *po, struct sk_buff *nlskb)
>> pinfo.pdi_flags |= PDI_AUXDATA;
>> if (po->origdev)
>> pinfo.pdi_flags |= PDI_ORIGDEV;
>> - if (po->has_vnet_hdr)
>> + if (po->vnet_hdr_sz)
>> pinfo.pdi_flags |= PDI_VNETHDR;
>> if (po->tp_loss)
>> pinfo.pdi_flags |= PDI_LOSS;
>> diff --git a/net/packet/internal.h b/net/packet/internal.h
>> index 3bb7c5fb3bff..11bc75950f28 100644
>> --- a/net/packet/internal.h
>> +++ b/net/packet/internal.h
>> @@ -115,9 +115,9 @@ struct packet_sock {
>> unsigned int running; /* bind_lock must be held */
>> unsigned int auxdata:1, /* writer must hold sock lock */
>> origdev:1,
>> - has_vnet_hdr:1,
>> tp_loss:1,
>> tp_tx_has_off:1;
>> + int vnet_hdr_sz;
>> int pressure;
>> int ifindex; /* bound device */
>> __be16 num;
Powered by blists - more mailing lists