[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87ee3auk70.fsf@toke.dk>
Date: Thu, 10 Mar 2022 12:21:39 +0100
From: Toke Høiland-Jørgensen <toke@...hat.com>
To: Lorenzo Bianconi <lorenzo@...nel.org>, bpf@...r.kernel.org,
netdev@...r.kernel.org
Cc: davem@...emloft.net, kuba@...nel.org, ast@...nel.org,
daniel@...earbox.net, brouer@...hat.com, pabeni@...hat.com,
echaudro@...hat.com, lorenzo.bianconi@...hat.com,
toshiaki.makita1@...il.com, andrii@...nel.org
Subject: Re: [PATCH v4 bpf-next 2/3] veth: rework veth_xdp_rcv_skb in order
to accept non-linear skb
Lorenzo Bianconi <lorenzo@...nel.org> writes:
> Introduce veth_convert_xdp_buff_from_skb routine in order to
> convert a non-linear skb into a xdp buffer. If the received skb
> is cloned or shared, veth_convert_xdp_buff_from_skb will copy it
> in a new skb composed by order-0 pages for the linear and the
> fragmented area. Moreover veth_convert_xdp_buff_from_skb guarantees
> we have enough headroom for xdp.
> This is a preliminary patch to allow attaching xdp programs with frags
> support on veth devices.
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@...nel.org>
It's cool that we can do this! A few comments below:
> ---
> drivers/net/veth.c | 174 ++++++++++++++++++++++++++++++---------------
> net/core/xdp.c | 1 +
> 2 files changed, 119 insertions(+), 56 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index b77ce3fdcfe8..47b21b1d2fd9 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -433,21 +433,6 @@ static void veth_set_multicast_list(struct net_device *dev)
> {
> }
>
> -static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
> - int buflen)
> -{
> - struct sk_buff *skb;
> -
> - skb = build_skb(head, buflen);
> - if (!skb)
> - return NULL;
> -
> - skb_reserve(skb, headroom);
> - skb_put(skb, len);
> -
> - return skb;
> -}
> -
> static int veth_select_rxq(struct net_device *dev)
> {
> return smp_processor_id() % dev->real_num_rx_queues;
> @@ -695,72 +680,143 @@ static void veth_xdp_rcv_bulk_skb(struct veth_rq *rq, void **frames,
> }
> }
>
> -static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> - struct sk_buff *skb,
> - struct veth_xdp_tx_bq *bq,
> - struct veth_stats *stats)
> +static void veth_xdp_get(struct xdp_buff *xdp)
> {
> - u32 pktlen, headroom, act, metalen, frame_sz;
> - void *orig_data, *orig_data_end;
> - struct bpf_prog *xdp_prog;
> - int mac_len, delta, off;
> - struct xdp_buff xdp;
> + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> + int i;
>
> - skb_prepare_for_gro(skb);
> + get_page(virt_to_page(xdp->data));
> + if (likely(!xdp_buff_has_frags(xdp)))
> + return;
>
> - rcu_read_lock();
> - xdp_prog = rcu_dereference(rq->xdp_prog);
> - if (unlikely(!xdp_prog)) {
> - rcu_read_unlock();
> - goto out;
> - }
> + for (i = 0; i < sinfo->nr_frags; i++)
> + __skb_frag_ref(&sinfo->frags[i]);
> +}
>
> - mac_len = skb->data - skb_mac_header(skb);
> - pktlen = skb->len + mac_len;
> - headroom = skb_headroom(skb) - mac_len;
> +static int veth_convert_xdp_buff_from_skb(struct veth_rq *rq,
> + struct xdp_buff *xdp,
> + struct sk_buff **pskb)
> +{
nit: It's not really "converting" an skb into an xdp_buff, since the
xdp_buff lives on the stack; so maybe 'veth_init_xdp_buff_from_skb()'?
> + struct sk_buff *skb = *pskb;
> + u32 frame_sz;
>
> if (skb_shared(skb) || skb_head_is_locked(skb) ||
> - skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
> + skb_shinfo(skb)->nr_frags) {
So this always clones the skb if it has frags? Is that really needed?
Also, there's a lot of memory allocation and copying going on here; have
you measured the performance?
> + u32 size, len, max_head_size, off;
> struct sk_buff *nskb;
> - int size, head_off;
> - void *head, *start;
> struct page *page;
> + int i, head_off;
>
> - size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
> - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> - if (size > PAGE_SIZE)
> + /* We need a private copy of the skb and data buffers since
> + * the ebpf program can modify it. We segment the original skb
> + * into order-0 pages without linearize it.
> + *
> + * Make sure we have enough space for linear and paged area
> + */
> + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE -
> + VETH_XDP_HEADROOM);
> + if (skb->len > PAGE_SIZE * MAX_SKB_FRAGS + max_head_size)
> goto drop;
>
> + /* Allocate skb head */
> page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
> if (!page)
> goto drop;
>
> - head = page_address(page);
> - start = head + VETH_XDP_HEADROOM;
> - if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
> - page_frag_free(head);
> + nskb = build_skb(page_address(page), PAGE_SIZE);
> + if (!nskb) {
> + put_page(page);
> goto drop;
> }
>
> - nskb = veth_build_skb(head, VETH_XDP_HEADROOM + mac_len,
> - skb->len, PAGE_SIZE);
> - if (!nskb) {
> - page_frag_free(head);
> + skb_reserve(nskb, VETH_XDP_HEADROOM);
> + size = min_t(u32, skb->len, max_head_size);
> + if (skb_copy_bits(skb, 0, nskb->data, size)) {
> + consume_skb(nskb);
> goto drop;
> }
> + skb_put(nskb, size);
>
> skb_copy_header(nskb, skb);
> head_off = skb_headroom(nskb) - skb_headroom(skb);
> skb_headers_offset_update(nskb, head_off);
> +
> + /* Allocate paged area of new skb */
> + off = size;
> + len = skb->len - off;
> +
> + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
> + page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
> + if (!page) {
> + consume_skb(nskb);
> + goto drop;
> + }
> +
> + size = min_t(u32, len, PAGE_SIZE);
> + skb_add_rx_frag(nskb, i, page, 0, size, PAGE_SIZE);
> + if (skb_copy_bits(skb, off, page_address(page),
> + size)) {
> + consume_skb(nskb);
> + goto drop;
> + }
> +
> + len -= size;
> + off += size;
> + }
> +
> consume_skb(skb);
> skb = nskb;
> + } else if (skb_headroom(skb) < XDP_PACKET_HEADROOM &&
> + pskb_expand_head(skb, VETH_XDP_HEADROOM, 0, GFP_ATOMIC)) {
> + goto drop;
> }
>
> /* SKB "head" area always have tailroom for skb_shared_info */
> frame_sz = skb_end_pointer(skb) - skb->head;
> frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> - xdp_init_buff(&xdp, frame_sz, &rq->xdp_rxq);
> - xdp_prepare_buff(&xdp, skb->head, skb->mac_header, pktlen, true);
> + xdp_init_buff(xdp, frame_sz, &rq->xdp_rxq);
> + xdp_prepare_buff(xdp, skb->head, skb_headroom(skb),
> + skb_headlen(skb), true);
> +
> + if (skb_is_nonlinear(skb)) {
> + skb_shinfo(skb)->xdp_frags_size = skb->data_len;
> + xdp_buff_set_frags_flag(xdp);
> + } else {
> + xdp_buff_clear_frags_flag(xdp);
> + }
> + *pskb = skb;
> +
> + return 0;
> +drop:
> + consume_skb(skb);
> + *pskb = NULL;
> +
> + return -ENOMEM;
> +}
> +
> +static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> + struct sk_buff *skb,
> + struct veth_xdp_tx_bq *bq,
> + struct veth_stats *stats)
> +{
> + void *orig_data, *orig_data_end;
> + struct bpf_prog *xdp_prog;
> + struct xdp_buff xdp;
> + u32 act, metalen;
> + int off;
> +
> + skb_prepare_for_gro(skb);
> +
> + rcu_read_lock();
> + xdp_prog = rcu_dereference(rq->xdp_prog);
> + if (unlikely(!xdp_prog)) {
> + rcu_read_unlock();
> + goto out;
> + }
> +
> + __skb_push(skb, skb->data - skb_mac_header(skb));
> + if (veth_convert_xdp_buff_from_skb(rq, &xdp, &skb))
> + goto drop;
>
> orig_data = xdp.data;
> orig_data_end = xdp.data_end;
> @@ -771,7 +827,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> case XDP_PASS:
> break;
> case XDP_TX:
> - get_page(virt_to_page(xdp.data));
> + veth_xdp_get(&xdp);
> consume_skb(skb);
> xdp.rxq->mem = rq->xdp_mem;
> if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
> @@ -783,7 +839,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> rcu_read_unlock();
> goto xdp_xmit;
> case XDP_REDIRECT:
> - get_page(virt_to_page(xdp.data));
> + veth_xdp_get(&xdp);
> consume_skb(skb);
> xdp.rxq->mem = rq->xdp_mem;
> if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
> @@ -806,18 +862,24 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> rcu_read_unlock();
>
> /* check if bpf_xdp_adjust_head was used */
> - delta = orig_data - xdp.data;
> - off = mac_len + delta;
> + off = orig_data - xdp.data;
> if (off > 0)
> __skb_push(skb, off);
> else if (off < 0)
> __skb_pull(skb, -off);
> - skb->mac_header -= delta;
> +
> + skb_reset_mac_header(skb);
>
> /* check if bpf_xdp_adjust_tail was used */
> off = xdp.data_end - orig_data_end;
> if (off != 0)
> __skb_put(skb, off); /* positive on grow, negative on shrink */
> +
> + if (xdp_buff_has_frags(&xdp))
> + skb->data_len = skb_shinfo(skb)->xdp_frags_size;
> + else
> + skb->data_len = 0;
We can remove entire frags using xdp_adjust_tail, right? Will that get
propagated in the right way to the skb frags due to the dual use of
skb_shared_info, or does something extra need to happen here?
Powered by blists - more mailing lists