[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7c41d9b0-aa77-1485-4890-177ca7b01e50@redhat.com>
Date: Fri, 13 Jan 2017 15:41:41 +0800
From: Jason Wang <jasowang@...hat.com>
To: John Fastabend <john.fastabend@...il.com>, mst@...hat.com
Cc: john.r.fastabend@...el.com, netdev@...r.kernel.org,
alexei.starovoitov@...il.com, daniel@...earbox.net
Subject: Re: [net PATCH v3 5/5] virtio_net: XDP support for adjust_head
On 2017年01月13日 10:52, John Fastabend wrote:
> Add support for XDP adjust head by allocating a 256B header region
> that XDP programs can grow into. This is only enabled when a XDP
> program is loaded.
>
> In order to ensure that we do not have to unwind queue headroom push
> queue setup below bpf_prog_add. It reads better to do a prog ref
> unwind vs another queue setup call.
>
> At the moment this code must do a full reset to ensure old buffers
> without headroom on program add or with headroom on program removal
> are not used incorrectly in the datapath. Ideally we would only
> have to disable/enable the RX queues being updated but there is no
> API to do this at the moment in virtio so use the big hammer. In
> practice it is likely not that big of a problem as this will only
> happen when XDP is enabled/disabled changing programs does not
> require the reset. There is some risk that the driver may either
> have an allocation failure or for some reason fail to correctly
> negotiate with the underlying backend in this case the driver will
> be left uninitialized. I have not seen this ever happen on my test
> systems and for what its worth this same failure case can occur
> from probe and other contexts in virtio framework.
>
> Signed-off-by: John Fastabend <john.r.fastabend@...el.com>
> ---
> drivers/net/virtio_net.c | 155 ++++++++++++++++++++++++++++++++++++++++------
> drivers/virtio/virtio.c | 9 ++-
> include/linux/virtio.h | 3 +
> 3 files changed, 144 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 6041828..8b897e7 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -28,6 +28,7 @@
> #include <linux/slab.h>
> #include <linux/cpu.h>
> #include <linux/average.h>
> +#include <linux/pci.h>
> #include <net/busy_poll.h>
>
> static int napi_weight = NAPI_POLL_WEIGHT;
> @@ -159,6 +160,9 @@ struct virtnet_info {
> /* Ethtool settings */
> u8 duplex;
> u32 speed;
> +
> + /* Headroom allocated in RX Queue */
> + unsigned int headroom;
If this could not be changed in anyway, better use a macro instead of a
filed here. And there's even no need to add an extra parameter to
add_recvbuf_mergeable().
> };
>
> struct padded_vnet_hdr {
> @@ -359,6 +363,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
> }
>
> if (vi->mergeable_rx_bufs) {
> + xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
Fail to understand why this is needed. We should have excluded vnet
header from xdp->data even before bpf_prog_run_xdp().
> /* Zero header and leave csum up to XDP layers */
> hdr = xdp->data;
> memset(hdr, 0, vi->hdr_len);
> @@ -375,7 +380,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
> num_sg = 2;
> sg_init_table(sq->sg, 2);
> sg_set_buf(sq->sg, hdr, vi->hdr_len);
> - skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
> + skb_to_sgvec(skb, sq->sg + 1,
> + xdp->data - xdp->data_hard_start,
> + xdp->data_end - xdp->data);
> }
> err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> data, GFP_ATOMIC);
> @@ -401,7 +408,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
> struct bpf_prog *xdp_prog;
>
> len -= vi->hdr_len;
> - skb_trim(skb, len);
>
> rcu_read_lock();
> xdp_prog = rcu_dereference(rq->xdp_prog);
> @@ -413,11 +419,15 @@ static struct sk_buff *receive_small(struct net_device *dev,
> if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
> goto err_xdp;
>
> - xdp.data = skb->data;
> + xdp.data_hard_start = skb->data;
> + xdp.data = skb->data + vi->headroom;
> xdp.data_end = xdp.data + len;
> act = bpf_prog_run_xdp(xdp_prog, &xdp);
> switch (act) {
> case XDP_PASS:
> + /* Recalculate length in case bpf program changed it */
> + len = xdp.data_end - xdp.data;
> + __skb_pull(skb, xdp.data - xdp.data_hard_start);
How about do this just after bpf_pro_run_xdp() for XDP_TX too? This is
more readable and there's no need to change xmit path.
> break;
> case XDP_TX:
> virtnet_xdp_xmit(vi, rq, &xdp, skb);
> @@ -432,6 +442,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
> }
> rcu_read_unlock();
>
> + skb_trim(skb, len);
> return skb;
>
> err_xdp:
> @@ -569,7 +580,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> if (unlikely(hdr->hdr.gso_type))
> goto err_xdp;
>
> + /* Allow consuming headroom but reserve enough space to push
> + * the descriptor on if we get an XDP_TX return code.
> + */
> data = page_address(xdp_page) + offset;
> + xdp.data_hard_start = data - vi->headroom + desc_room;
Two possible issues here:
1) If we want to adjust header after linearizing, we should reserve a
room for that page, but I don't see any codes for this.
2) If the header has been adjusted, looks like we need change offset
value otherwise, page_to_skb() won't build a correct skb for us for
XDP_PASS.
> xdp.data = data + desc_room;
> xdp.data_end = xdp.data + (len - vi->hdr_len);
> act = bpf_prog_run_xdp(xdp_prog, &xdp);
> @@ -748,20 +763,21 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
> static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
> gfp_t gfp)
> {
> + int headroom = GOOD_PACKET_LEN + vi->headroom;
> struct sk_buff *skb;
> struct virtio_net_hdr_mrg_rxbuf *hdr;
> int err;
>
> - skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
> + skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
> if (unlikely(!skb))
> return -ENOMEM;
>
> - skb_put(skb, GOOD_PACKET_LEN);
> + skb_put(skb, headroom);
>
> hdr = skb_vnet_hdr(skb);
> sg_init_table(rq->sg, 2);
> sg_set_buf(rq->sg, hdr, vi->hdr_len);
> - skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
> + skb_to_sgvec(skb, rq->sg + 1, vi->headroom, skb->len - vi->headroom);
>
> err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
> if (err < 0)
> @@ -829,24 +845,27 @@ static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
> return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
> }
>
> -static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
> +static int add_recvbuf_mergeable(struct virtnet_info *vi,
> + struct receive_queue *rq, gfp_t gfp)
> {
> struct page_frag *alloc_frag = &rq->alloc_frag;
> + unsigned int headroom = vi->headroom;
> char *buf;
> unsigned long ctx;
> int err;
> unsigned int len, hole;
>
> len = get_mergeable_buf_len(&rq->mrg_avg_pkt_len);
> - if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
> + if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
> return -ENOMEM;
>
> buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> + buf += headroom; /* advance address leaving hole at front of pkt */
> ctx = mergeable_buf_to_ctx(buf, len);
> get_page(alloc_frag->page);
> - alloc_frag->offset += len;
> + alloc_frag->offset += len + headroom;
> hole = alloc_frag->size - alloc_frag->offset;
> - if (hole < len) {
> + if (hole < len + headroom) {
> /* To avoid internal fragmentation, if there is very likely not
> * enough space for another buffer, add the remaining space to
> * the current buffer. This extra space is not included in
> @@ -880,7 +899,7 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
> gfp |= __GFP_COLD;
> do {
> if (vi->mergeable_rx_bufs)
> - err = add_recvbuf_mergeable(rq, gfp);
> + err = add_recvbuf_mergeable(vi, rq, gfp);
> else if (vi->big_packets)
> err = add_recvbuf_big(vi, rq, gfp);
> else
> @@ -1675,12 +1694,90 @@ static void virtnet_init_settings(struct net_device *dev)
> .set_settings = virtnet_set_settings,
> };
>
> +#define VIRTIO_XDP_HEADROOM 256
> +
> +static int init_vqs(struct virtnet_info *vi);
> +static void remove_vq_common(struct virtnet_info *vi, bool lock);
> +
> +/* Reset virtio device with RTNL held this is very similar to the
> + * freeze()/restore() logic except we need to ensure locking. It is
> + * possible that this routine may fail and leave the driver in a
> + * failed state. However assuming the driver negotiated correctly
> + * at probe time we _should_ be able to (re)negotiate driver again.
> + */
Instead of duplicate codes and export helpers, why not use
virtio_device_freeze()/virtio_device_restore()? For rtnl_lock in
restore, you can probably avoid it be checking rtnl_is_locked() before?
> +static int virtnet_xdp_reset(struct virtnet_info *vi)
> +{
> + struct virtio_device *vdev = vi->vdev;
> + unsigned int status;
> + int i, ret;
> +
> + /* Disable and unwind rings */
> + virtio_config_disable(vdev);
> + vdev->failed = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_FAILED;
> +
> + netif_device_detach(vi->dev);
> + cancel_delayed_work_sync(&vi->refill);
> + if (netif_running(vi->dev)) {
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + napi_disable(&vi->rq[i].napi);
> + }
> +
> + remove_vq_common(vi, false);
> +
> + /* Do a reset per virtio spec recommendation */
> + vdev->config->reset(vdev);
> +
> + /* Acknowledge that we've seen the device. */
> + status = vdev->config->get_status(vdev);
> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_ACKNOWLEDGE);
> +
> + /* Notify driver is up and finalize features per specification. The
> + * error code from finalize features is checked here but should not
> + * fail because we assume features were previously synced successfully.
> + */
> + status = vdev->config->get_status(vdev);
> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER);
> + ret = virtio_finalize_features(vdev);
> + if (ret) {
> + netdev_warn(vi->dev, "virtio_finalize_features failed during reset aborting\n");
> + goto err;
> + }
> +
> + ret = init_vqs(vi);
> + if (ret) {
> + netdev_warn(vi->dev, "init_vqs failed during reset aborting\n");
> + goto err;
> + }
> + virtio_device_ready(vi->vdev);
> +
> + if (netif_running(vi->dev)) {
> + for (i = 0; i < vi->curr_queue_pairs; i++)
> + if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
> + schedule_delayed_work(&vi->refill, 0);
> +
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + virtnet_napi_enable(&vi->rq[i]);
> + }
> + netif_device_attach(vi->dev);
> + /* Finally, tell the device we're all set */
> + status = vdev->config->get_status(vdev);
> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> + virtio_config_enable(vdev);
> +
> + return 0;
> +err:
> + status = vdev->config->get_status(vdev);
> + vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_FAILED);
> + return ret;
> +}
[...]
Powered by blists - more mailing lists