[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAL+tcoDkcKNKCe9B1bQvsZEiKmJEVSKJy_gQ-PONu9qqUXF9AQ@mail.gmail.com>
Date: Thu, 25 Sep 2025 08:17:19 +0800
From: Jason Xing <kerneljasonxing@...il.com>
To: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
Cc: Stanislav Fomichev <stfomichev@...il.com>, bpf@...r.kernel.org, ast@...nel.org,
daniel@...earbox.net, andrii@...nel.org, netdev@...r.kernel.org,
magnus.karlsson@...el.com
Subject: Re: [PATCH bpf-next 2/3] xsk: remove @first_frag from xsk_build_skb()
On Wed, Sep 24, 2025 at 10:36 PM Maciej Fijalkowski
<maciej.fijalkowski@...el.com> wrote:
>
> On Tue, Sep 23, 2025 at 05:25:01PM +0800, Jason Xing wrote:
> > On Tue, Sep 23, 2025 at 1:48 AM Stanislav Fomichev <stfomichev@...il.com> wrote:
> > >
> > > On 09/22, Maciej Fijalkowski wrote:
> > > > Devices that set IFF_TX_SKB_NO_LINEAR will not execute branch that
> > > > handles metadata, as we set @first_frag only for !IFF_TX_SKB_NO_LINEAR
> > > > code in xsk_build_skb().
> > > >
> > > > Same functionality can be achieved with checking if xsk_get_num_desc()
> > > > returns 0. To replace current usage of @first_frag with
> > > > XSKCB(skb)->num_descs check, pull out the code from
> > > > xsk_set_destructor_arg() that initializes sk_buff::cb and call it before
> > > > skb_store_bits() in branch that creates skb against first processed
> > > > frag. This so error path has the XSKCB(skb)->num_descs initialized and
> > > > can free skb in case skb_store_bits() failed.
> > > >
> > > > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
> > > > ---
> > > > net/xdp/xsk.c | 20 +++++++++++---------
> > > > 1 file changed, 11 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > > index 72194f0a3fc0..064238400036 100644
> > > > --- a/net/xdp/xsk.c
> > > > +++ b/net/xdp/xsk.c
> > > > @@ -605,6 +605,13 @@ static u32 xsk_get_num_desc(struct sk_buff *skb)
> > > > return XSKCB(skb)->num_descs;
> > > > }
> > > >
> > > > +static void xsk_init_cb(struct sk_buff *skb)
> > > > +{
> > > > + BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> > > > + INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
> > > > + XSKCB(skb)->num_descs = 0;
> > > > +}
> > > > +
> > > > static void xsk_destruct_skb(struct sk_buff *skb)
> > > > {
> > > > struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > > > @@ -620,9 +627,6 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > >
> > > > static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr)
> > > > {
> > > > - BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> > > > - INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
> > > > - XSKCB(skb)->num_descs = 0;
> > > > skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> > > > }
> > > >
> > > > @@ -672,7 +676,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > > return ERR_PTR(err);
> > > >
> > > > skb_reserve(skb, hr);
> > > > -
> > > > + xsk_init_cb(skb);
> > > > xsk_set_destructor_arg(skb, desc->addr);
> > > > } else {
> > > > xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > > > @@ -725,7 +729,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > struct xsk_tx_metadata *meta = NULL;
> > > > struct net_device *dev = xs->dev;
> > > > struct sk_buff *skb = xs->skb;
> > > > - bool first_frag = false;
> > > > int err;
> > > >
> > > > if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
> > > > @@ -742,8 +745,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > len = desc->len;
> > > >
> > > > if (!skb) {
> > > > - first_frag = true;
> > > > -
> > > > hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
> > > > tr = dev->needed_tailroom;
> > > > skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
> > > > @@ -752,6 +753,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > >
> > > > skb_reserve(skb, hr);
> > > > skb_put(skb, len);
> > > > + xsk_init_cb(skb);
> > > >
> > > > err = skb_store_bits(skb, 0, buffer, len);
> > > > if (unlikely(err))
> > > > @@ -797,7 +799,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> > > > }
> > > >
> > > > - if (first_frag && desc->options & XDP_TX_METADATA) {
> > > > + if (!xsk_get_num_desc(skb) && desc->options & XDP_TX_METADATA) {
> > > > if (unlikely(xs->pool->tx_metadata_len == 0)) {
> > > > err = -EINVAL;
> > > > goto free_err;
> > > > @@ -839,7 +841,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > > return skb;
> > > >
> > > > free_err:
> > > > - if (first_frag && skb)
> > >
> > > [..]
> > >
> > > > + if (skb && !xsk_get_num_desc(skb))
> > > > kfree_skb(skb);
> > > >
> > > > if (err == -EOVERFLOW) {
> > >
> > > For IFF_TX_SKB_NO_LINEAR case, the 'goto free_err' is super confusing.
> > > xsk_build_skb_zerocopy either returns skb or an IS_ERR. Can we
> > > add a separate label to jump directly to 'if err == -EOVERFLOW' for
> > > the IFF_TX_SKB_NO_LINEAR case?
>
> Right, I got hit with this when running xdpsock within VM now against
> virtio-net driver. Since I removed @first_frag and sock_alloc_send_skb()
> managed to give me -EAGAIN at start, skb was treated as valid pointer and
> then I got splat when accessing either cb or skb_shared_info.
>
> So either we NULL the skb for xsk_build_skb_zerocopy() error path (which
> would be fine even for -EOVERFLOW as error path uses xs->skb pointer, not
> the local one) or we introduce separate label as you suggest. No strong
> opinions here.
>
> > >
> > > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > index 72e34bd2d925..f56182c61c99 100644
> > > --- a/net/xdp/xsk.c
> > > +++ b/net/xdp/xsk.c
> > > @@ -732,7 +732,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > skb = xsk_build_skb_zerocopy(xs, desc);
> > > if (IS_ERR(skb)) {
> > > err = PTR_ERR(skb);
> > > - goto free_err;
> > > + goto out;
> > > }
> > > } else {
> > > u32 hr, tr, len;
> > > @@ -842,6 +842,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > if (first_frag && skb)
> > > kfree_skb(skb);
> > >
> > > +out:
> > > if (err == -EOVERFLOW) {
> > > /* Drop the packet */
> > > xsk_inc_num_desc(xs->skb);
> > >
> > > After that, it seems we can look at skb_shinfo(skb)->nr_frags? Instead
> > > of adding new xsk_init_cb, seems more robust?
>
> Thanks! I'll do it.
>
> >
> > +1. It would be simpler.
> >
> > And I think this patch should be a standalone one because it actually
> > supports the missing feature for the VM scenario.
>
> Hi Jason,
> in commit message, I wrote about enabling tx metadata for
> xsk_build_skb_zerocopy() but code did not reflect that as you point out in
> your later reply.
>
> Unless there are any objections I will actually enable it there.
Oh, if you made up your mind enabling it in the next version, how
about changing the title of this series because the core changes are
all about tx metadata rather than simply cleaning up the codes. See,
patch 1, I think, will be scrapped; patch 2 is used to support that
missing feature; patch 3 is a follow-up to patch 2. WDYT?
Thanks,
Jason
Powered by blists - more mailing lists