Message-ID: <587933C2.8090003@gmail.com>
Date: Fri, 13 Jan 2017 12:08:34 -0800
From: John Fastabend <john.fastabend@...il.com>
To: Jason Wang <jasowang@...hat.com>, mst@...hat.com
Cc: john.r.fastabend@...el.com, netdev@...r.kernel.org,
alexei.starovoitov@...il.com, daniel@...earbox.net
Subject: Re: [net PATCH v3 5/5] virtio_net: XDP support for adjust_head
On 17-01-12 11:41 PM, Jason Wang wrote:
>
>
> On 2017-01-13 10:52, John Fastabend wrote:
>> Add support for XDP adjust head by allocating a 256B header region
>> that XDP programs can grow into. This is only enabled when an XDP
>> program is loaded.
>>
>> To ensure that we do not have to unwind queue headroom, push the
>> queue setup below bpf_prog_add. It reads better to unwind a prog
>> ref than to add another queue setup call.
>>
>> At the moment this code must do a full reset to ensure old buffers
>> without headroom on program add, or with headroom on program removal,
>> are not used incorrectly in the datapath. Ideally we would only
>> have to disable/enable the RX queues being updated, but there is no
>> API to do this at the moment in virtio, so use the big hammer. In
>> practice it is likely not that big of a problem, as this will only
>> happen when XDP is enabled/disabled; changing programs does not
>> require the reset. There is some risk that the driver may either
>> have an allocation failure or for some reason fail to correctly
>> negotiate with the underlying backend; in this case the driver will
>> be left uninitialized. I have not seen this ever happen on my test
>> systems, and for what it's worth this same failure case can occur
>> from probe and other contexts in the virtio framework.
>>
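For context, a minimal sketch of the kind of XDP program this headroom
enables (the program, tag value, and section name are hypothetical and not
part of this patch; it assumes the usual samples/bpf helpers and the
bpf_xdp_adjust_head() helper from the adjust_head series):

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Hypothetical example: grow the frame into the reserved headroom
 * and push a 4-byte tag in front of it.
 */
SEC("xdp_push_tag")
int xdp_push_tag(struct xdp_md *ctx)
{
	__u32 *tag;

	/* A negative delta moves xdp->data toward data_hard_start,
	 * i.e. consumes part of the 256B headroom this patch reserves.
	 */
	if (bpf_xdp_adjust_head(ctx, -(int)sizeof(*tag)))
		return XDP_DROP;

	tag = (void *)(long)ctx->data;
	if ((void *)(tag + 1) > (void *)(long)ctx->data_end)
		return XDP_DROP;

	*tag = 0xcafe;
	return XDP_PASS;
}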
>> Signed-off-by: John Fastabend <john.r.fastabend@...el.com>
>> ---
>> drivers/net/virtio_net.c | 155 ++++++++++++++++++++++++++++++++++++++++------
>> drivers/virtio/virtio.c | 9 ++-
>> include/linux/virtio.h | 3 +
>> 3 files changed, 144 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 6041828..8b897e7 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -28,6 +28,7 @@
>> #include <linux/slab.h>
>> #include <linux/cpu.h>
>> #include <linux/average.h>
>> +#include <linux/pci.h>
>> #include <net/busy_poll.h>
>> static int napi_weight = NAPI_POLL_WEIGHT;
>> @@ -159,6 +160,9 @@ struct virtnet_info {
>> /* Ethtool settings */
>> u8 duplex;
>> u32 speed;
>> +
>> + /* Headroom allocated in RX Queue */
>> + unsigned int headroom;
>
> If this can not be changed in any way, better to use a macro instead of a
> field here. Then there's no need to add an extra parameter to
> add_recvbuf_mergeable().
OK, originally I thought this might be dynamic, but I agree there is
no need for it here.
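Something like the below, reusing the VIRTIO_XDP_HEADROOM define this
patch already adds (untested sketch; the helper name is made up, and it
assumes the vi->xdp_queue_pairs field from the earlier XDP series):

/* Sketch: derive headroom from whether an XDP program is attached
 * instead of caching it in struct virtnet_info.
 */
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}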
>
>> };
>> struct padded_vnet_hdr {
>> @@ -359,6 +363,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>> }
>> if (vi->mergeable_rx_bufs) {
>> + xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
>
> I fail to understand why this is needed. We should have excluded the vnet
> header from xdp->data even before bpf_prog_run_xdp().
>
>> /* Zero header and leave csum up to XDP layers */
>> hdr = xdp->data;
>> memset(hdr, 0, vi->hdr_len);
>> @@ -375,7 +380,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>> num_sg = 2;
>> sg_init_table(sq->sg, 2);
>> sg_set_buf(sq->sg, hdr, vi->hdr_len);
>> - skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
>> + skb_to_sgvec(skb, sq->sg + 1,
>> + xdp->data - xdp->data_hard_start,
>> + xdp->data_end - xdp->data);
>> }
>> err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
>> data, GFP_ATOMIC);
>> @@ -401,7 +408,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
>> struct bpf_prog *xdp_prog;
>> len -= vi->hdr_len;
>> - skb_trim(skb, len);
>> rcu_read_lock();
>> xdp_prog = rcu_dereference(rq->xdp_prog);
>> @@ -413,11 +419,15 @@ static struct sk_buff *receive_small(struct net_device *dev,
>> if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>> goto err_xdp;
>> - xdp.data = skb->data;
>> + xdp.data_hard_start = skb->data;
>> + xdp.data = skb->data + vi->headroom;
>> xdp.data_end = xdp.data + len;
>> act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> switch (act) {
>> case XDP_PASS:
>> + /* Recalculate length in case bpf program changed it */
>> + len = xdp.data_end - xdp.data;
>> + __skb_pull(skb, xdp.data - xdp.data_hard_start);
>
> How about doing this just after bpf_prog_run_xdp() for XDP_TX too? That is
> more readable, and there's no need to change the xmit path.
Agreed, will do.
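i.e., roughly this shape in receive_small() (untested sketch; the exit
label is hypothetical):

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	/* Account for headroom consumed and any length change made by
	 * the program before acting on the verdict, so XDP_TX does not
	 * need its own offset math in the xmit path.
	 */
	__skb_pull(skb, xdp.data - xdp.data_hard_start);
	len = xdp.data_end - xdp.data;

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		virtnet_xdp_xmit(vi, rq, &xdp, skb);
		rcu_read_unlock();
		goto xdp_xmit;	/* hypothetical label past skb_trim() */
	...
	}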
>
>> break;
>> case XDP_TX:
>> virtnet_xdp_xmit(vi, rq, &xdp, skb);
>> @@ -432,6 +442,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
>> }
>> rcu_read_unlock();
>> + skb_trim(skb, len);
>> return skb;
>> err_xdp:
>> @@ -569,7 +580,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>> if (unlikely(hdr->hdr.gso_type))
>> goto err_xdp;
>> + /* Allow consuming headroom but reserve enough space to push
>> + * the descriptor on if we get an XDP_TX return code.
>> + */
>> data = page_address(xdp_page) + offset;
>> + xdp.data_hard_start = data - vi->headroom + desc_room;
>
> Two possible issues here:
>
> 1) If we want to adjust the header after linearizing, we should reserve room
> in that page too, but I don't see any code for this.
> 2) If the header has been adjusted, it looks like we need to change the offset
> value; otherwise page_to_skb() won't build a correct skb for us on XDP_PASS.
>
Both correct, thanks. I'll add a couple of sample programs to catch this as well.
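For the archive, the two fixes would look roughly like this (untested
sketch; the exact hdr_len accounting depends on how page_to_skb() is
called):

	/* 1) In xdp_linearize_page(), start the copy past the reserved
	 *    headroom so adjust_head has room on a linearized page too.
	 */
	page_off = VIRTIO_XDP_HEADROOM;

	/* 2) In receive_mergeable() on XDP_PASS, recompute the offset so
	 *    page_to_skb() sees any header adjustment the program made.
	 */
	offset = xdp.data - page_address(xdp_page);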
>> xdp.data = data + desc_room;
>> xdp.data_end = xdp.data + (len - vi->hdr_len);
>> act = bpf_prog_run_xdp(xdp_prog, &xdp);
>> @@ -748,20 +763,21 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
>> static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>> gfp_t gfp)
>> {
>> + int headroom = GOOD_PACKET_LEN + vi->headroom;
>> struct sk_buff *skb;
>> struct virtio_net_hdr_mrg_rxbuf *hdr;
>> int err;
>> - skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
>> + skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
>> if (unlikely(!skb))
>> return -ENOMEM;
>> - skb_put(skb, GOOD_PACKET_LEN);
>> + skb_put(skb, headroom);
>> hdr = skb_vnet_hdr(skb);
>> sg_init_table(rq->sg, 2);
>> sg_set_buf(rq->sg, hdr, vi->hdr_len);
>> - skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
>> + skb_to_sgvec(skb, rq->sg + 1, vi->headroom, skb->len - vi->headroom);
>> err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
>> if (err < 0)
>> @@ -829,24 +845,27 @@ static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
>> return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
>> }
>> -static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>> +static int add_recvbuf_mergeable(struct virtnet_info *vi,
>> + struct receive_queue *rq, gfp_t gfp)
>> {
>> struct page_frag *alloc_frag = &rq->alloc_frag;
>> + unsigned int headroom = vi->headroom;
>> char *buf;
>> unsigned long ctx;
>> int err;
>> unsigned int len, hole;
>> len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
>> - if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
>> + if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
>> return -ENOMEM;
>> buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>> + buf += headroom; /* advance address leaving hole at front of pkt */
>> ctx = mergeable_buf_to_ctx(buf, len);
>> get_page(alloc_frag->page);
>> - alloc_frag->offset += len;
>> + alloc_frag->offset += len + headroom;
>> hole = alloc_frag->size - alloc_frag->offset;
>> - if (hole < len) {
>> + if (hole < len + headroom) {
>> /* To avoid internal fragmentation, if there is very likely not
>> * enough space for another buffer, add the remaining space to
>> * the current buffer. This extra space is not included in
>> @@ -880,7 +899,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
>> gfp |= __GFP_COLD;
>> do {
>> if (vi->mergeable_rx_bufs)
>> - err = add_recvbuf_mergeable(rq, gfp);
>> + err = add_recvbuf_mergeable(vi, rq, gfp);
>> else if (vi->big_packets)
>> err = add_recvbuf_big(vi, rq, gfp);
>> else
>> @@ -1675,12 +1694,90 @@ static void virtnet_init_settings(struct net_device *dev)
>> .set_settings = virtnet_set_settings,
>> };
>> +#define VIRTIO_XDP_HEADROOM 256
>> +
>> +static int init_vqs(struct virtnet_info *vi);
>> +static void remove_vq_common(struct virtnet_info *vi, bool lock);
>> +
>> +/* Reset virtio device with RTNL held. This is very similar to the
>> + * freeze()/restore() logic except we need to ensure locking. It is
>> + * possible that this routine may fail and leave the driver in a
>> + * failed state. However, assuming the driver negotiated correctly
>> + * at probe time, we _should_ be able to (re)negotiate the driver again.
>> + */
>
> Instead of duplicating code and exporting helpers, why not use
> virtio_device_freeze()/virtio_device_restore()? For rtnl_lock in restore, you
> can probably avoid it by checking rtnl_is_locked() first?
freeze/restore does virtnet_cpu_notif_* work that is not needed
in this case, but the overhead there is probably minimal.
Michael wanted to create a generic virtio_reset(), so let me
give that a try; that should clean this up as well.
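Roughly, the core half of that could look like the below (sketch only;
the helper name is hypothetical, and the status sequence just mirrors
what virtnet_xdp_reset() open-codes in this patch):

/* Hypothetical virtio core helper: re-run the spec's device
 * initialization sequence after a reset, up to (but not including)
 * DRIVER_OK, which the driver sets once its vqs are ready again.
 */
static int virtio_device_reinit(struct virtio_device *vdev)
{
	unsigned int status;
	int ret;

	vdev->config->reset(vdev);

	/* Acknowledge that we've seen the device. */
	status = vdev->config->get_status(vdev);
	vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_ACKNOWLEDGE);

	/* We know how to drive it; re-negotiate features. */
	status = vdev->config->get_status(vdev);
	vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER);

	ret = virtio_finalize_features(vdev);
	if (ret) {
		status = vdev->config->get_status(vdev);
		vdev->config->set_status(vdev,
					 status | VIRTIO_CONFIG_S_FAILED);
	}
	return ret;
}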
.John
>
>> +static int virtnet_xdp_reset(struct virtnet_info *vi)
>> +{
>> + struct virtio_device *vdev = vi->vdev;
>> + unsigned int status;
>> + int i, ret;
>> +
>> + /* Disable and unwind rings */
>> + virtio_config_disable(vdev);
>> + vdev->failed = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_FAILED;
>> +
>> + netif_device_detach(vi->dev);
>> + cancel_delayed_work_sync(&vi->refill);
>> + if (netif_running(vi->dev)) {
>> + for (i = 0; i < vi->max_queue_pairs; i++)
>> + napi_disable(&vi->rq[i].napi);
>> + }
>> +
>> + remove_vq_common(vi, false);
>> +
>> + /* Do a reset per virtio spec recommendation */
>> + vdev->config->reset(vdev);
>> +
>> + /* Acknowledge that we've seen the device. */
>> + status = vdev->config->get_status(vdev);
>> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_ACKNOWLEDGE);
>> +
>> + /* Notify driver is up and finalize features per specification. The
>> + * error code from finalize features is checked here but should not
>> + * fail because we assume features were previously synced successfully.
>> + */
>> + status = vdev->config->get_status(vdev);
>> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER);
>> + ret = virtio_finalize_features(vdev);
>> + if (ret) {
>> + netdev_warn(vi->dev, "virtio_finalize_features failed during reset aborting\n");
>> + goto err;
>> + }
>> +
>> + ret = init_vqs(vi);
>> + if (ret) {
>> + netdev_warn(vi->dev, "init_vqs failed during reset aborting\n");
>> + goto err;
>> + }
>> + virtio_device_ready(vi->vdev);
>> +
>> + if (netif_running(vi->dev)) {
>> + for (i = 0; i < vi->curr_queue_pairs; i++)
>> + if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
>> + schedule_delayed_work(&vi->refill, 0);
>> +
>> + for (i = 0; i < vi->max_queue_pairs; i++)
>> + virtnet_napi_enable(&vi->rq[i]);
>> + }
>> + netif_device_attach(vi->dev);
>> + /* Finally, tell the device we're all set */
>> + status = vdev->config->get_status(vdev);
>> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER_OK);
>> + virtio_config_enable(vdev);
>> +
>> + return 0;
>> +err:
>> + status = vdev->config->get_status(vdev);
>> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_FAILED);
>> + return ret;
>> +}
>
> [...]