Message-ID: <aPu0WdUqZCB3xQgb@boxer>
Date: Fri, 24 Oct 2025 19:16:09 +0200
From: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
To: Fernando Fernandez Mancera <fmancera@...e.de>
CC: <netdev@...r.kernel.org>, <csmate@....hu>, <kerneljasonxing@...il.com>,
<bjorn@...nel.org>, <sdf@...ichev.me>, <jonathan.lemon@...il.com>,
<bpf@...r.kernel.org>, <davem@...emloft.net>, <edumazet@...gle.com>,
<kuba@...nel.org>, <pabeni@...hat.com>, <horms@...nel.org>
Subject: Re: [PATCH net v2] xsk: avoid data corruption on cq descriptor number
On Fri, Oct 24, 2025 at 12:40:49PM +0200, Fernando Fernandez Mancera wrote:
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is stored in the skb control block
> and xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> the pool's completion queue.
>
> The skb control block shouldn't be used for this purpose because, once
> the skb has been transmitted, xsk no longer has control over it and
> other subsystems can reuse it. This leads to the following kernel panic
> due to a NULL pointer dereference.
>
> BUG: kernel NULL pointer dereference, address: 0000000000000000
> #PF: supervisor read access in kernel mode
> #PF: error_code(0x0000) - not-present page
> PGD 0 P4D 0
> Oops: Oops: 0000 [#1] SMP NOPTI
> CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
> RIP: 0010:xsk_destruct_skb+0xd0/0x180
> [...]
> Call Trace:
> <IRQ>
> ? napi_complete_done+0x7a/0x1a0
> ip_rcv_core+0x1bb/0x340
> ip_rcv+0x30/0x1f0
> __netif_receive_skb_one_core+0x85/0xa0
> process_backlog+0x87/0x130
> __napi_poll+0x28/0x180
> net_rx_action+0x339/0x420
> handle_softirqs+0xdc/0x320
> ? handle_edge_irq+0x90/0x1e0
> do_softirq.part.0+0x3b/0x60
> </IRQ>
> <TASK>
> __local_bh_enable_ip+0x60/0x70
> __dev_direct_xmit+0x14e/0x1f0
> __xsk_generic_xmit+0x482/0xb70
> ? __remove_hrtimer+0x41/0xa0
> ? __xsk_generic_xmit+0x51/0xb70
> ? _raw_spin_unlock_irqrestore+0xe/0x40
> xsk_sendmsg+0xda/0x1c0
> __sys_sendto+0x1ee/0x200
> __x64_sys_sendto+0x24/0x30
> do_syscall_64+0x84/0x2f0
> ? __pfx_pollwake+0x10/0x10
> ? __rseq_handle_notify_resume+0xad/0x4c0
> ? restore_fpregs_from_fpstate+0x3c/0x90
> ? switch_fpu_return+0x5b/0xe0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> ? do_syscall_64+0x204/0x2f0
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
> </TASK>
> [...]
> Kernel panic - not syncing: Fatal exception in interrupt
> Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
>
> The proposed approach stores the first address in the xsk_addr_node as
> well, along with the number of descriptors. The head xsk_addr_node is
> referenced by skb_shinfo(skb)->destructor_arg, and the remaining
> fragments store their addresses on the list.
>
> This is less efficient, as 4 bytes are wasted for each stored address.
Hi Fernando,

It's not about the 4 bytes being wasted, but rather the memory allocation
that you introduce here. I tried hard to avoid hurting non-fragmented
traffic; below you can find the impact Jason reported for an approach
similar to yours:

https://lore.kernel.org/bpf/CAL+tcoD=Gn6ZmJ+_Y48vPRyHVHmP-7irsx=fRVRnyhDrpTrEtQ@mail.gmail.com/

I assume this patch will yield a similar performance degradation...
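
To make the allocation cost concrete, here is a rough sketch of the two
layouts (not the exact in-tree definitions; kernel types assumed, and the
_v2 name is only for the sake of the sketch):

/* current mainline: the head lives in skb->cb, so the common
 * single-descriptor case allocates nothing extra and xsk_addr_node
 * objects are only needed for additional fragments.
 */
struct xsk_addr_head {			/* embedded in skb->cb */
	u32 num_descs;
	struct list_head addrs_list;
};

struct xsk_addr_node {			/* one kmem_cache object per extra frag */
	u64 addr;
	struct list_head addr_node;
};

/* with this patch: the head itself becomes an xsk_addr_node that is
 * kmem_cache_zalloc()'d for every skb, fragmented or not, hung off
 * skb_shinfo(skb)->destructor_arg and freed again on completion - it is
 * that per-packet alloc/free pair that hurts, not the 4 bytes per node.
 */
struct xsk_addr_node_v2 {
	u32 num_descs;			/* only meaningful on the head node */
	u64 addr;
	struct list_head addr_node;
};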
>
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Signed-off-by: Fernando Fernandez Mancera <fmancera@...e.de>
> ---
> v2: fix erroneous page handling and fix typos in the commit message.
> ---
> net/xdp/xsk.c | 52 ++++++++++++++++++++++++++++-----------------------
> 1 file changed, 29 insertions(+), 23 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..965cf071b036 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -37,18 +37,14 @@
> #define MAX_PER_SOCKET_BUDGET 32
>
> struct xsk_addr_node {
> + u32 num_descs;
> u64 addr;
> struct list_head addr_node;
> };
>
> -struct xsk_addr_head {
> - u32 num_descs;
> - struct list_head addrs_list;
> -};
> -
> static struct kmem_cache *xsk_tx_generic_cache;
>
> -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
> +#define XSK_TX_HEAD(skb) ((struct xsk_addr_node *)((skb_shinfo(skb)->destructor_arg)))
>
> void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> {
> @@ -569,12 +565,11 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
> spin_lock_irqsave(&pool->cq_lock, flags);
> idx = xskq_get_prod(pool->cq);
>
> - xskq_prod_write_addr(pool->cq, idx,
> - (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
> + xskq_prod_write_addr(pool->cq, idx, XSK_TX_HEAD(skb)->addr);
> descs_processed++;
>
> - if (unlikely(XSKCB(skb)->num_descs > 1)) {
> - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> + if (unlikely(XSK_TX_HEAD(skb)->num_descs > 1)) {
> + list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
> xskq_prod_write_addr(pool->cq, idx + descs_processed,
> pos->addr);
> descs_processed++;
> @@ -582,6 +577,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
> kmem_cache_free(xsk_tx_generic_cache, pos);
> }
> }
> + kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
> xskq_prod_submit_n(pool->cq, descs_processed);
> spin_unlock_irqrestore(&pool->cq_lock, flags);
> }
> @@ -597,12 +593,12 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
>
> static void xsk_inc_num_desc(struct sk_buff *skb)
> {
> - XSKCB(skb)->num_descs++;
> + XSK_TX_HEAD(skb)->num_descs++;
> }
>
> static u32 xsk_get_num_desc(struct sk_buff *skb)
> {
> - return XSKCB(skb)->num_descs;
> + return XSK_TX_HEAD(skb)->num_descs;
> }
>
> static void xsk_destruct_skb(struct sk_buff *skb)
> @@ -619,16 +615,16 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> }
>
> static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
> - u64 addr)
> + struct xsk_addr_node *head, u64 addr)
> {
> - BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> - INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
> + INIT_LIST_HEAD(&head->addr_node);
> + head->addr = addr;
> + head->num_descs = 0;
> skb->dev = xs->dev;
> skb->priority = READ_ONCE(xs->sk.sk_priority);
> skb->mark = READ_ONCE(xs->sk.sk_mark);
> - XSKCB(skb)->num_descs = 0;
> skb->destructor = xsk_destruct_skb;
> - skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> + skb_shinfo(skb)->destructor_arg = (void *)head;
> }
>
> static void xsk_consume_skb(struct sk_buff *skb)
> @@ -638,11 +634,12 @@ static void xsk_consume_skb(struct sk_buff *skb)
> struct xsk_addr_node *pos, *tmp;
>
> if (unlikely(num_descs > 1)) {
> - list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> + list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
> list_del(&pos->addr_node);
> kmem_cache_free(xsk_tx_generic_cache, pos);
> }
> }
> + kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
>
> skb->destructor = sock_wfree;
> xsk_cq_cancel_locked(xs->pool, num_descs);
> @@ -720,7 +717,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>
> skb_reserve(skb, hr);
>
> - xsk_skb_init_misc(skb, xs, desc->addr);
> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> + if (!xsk_addr)
> + return ERR_PTR(-ENOMEM);
> +
> + xsk_skb_init_misc(skb, xs, xsk_addr, desc->addr);
> if (desc->options & XDP_TX_METADATA) {
> err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
> if (unlikely(err))
> @@ -736,7 +737,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> * would be dropped, which implies freeing all list elements
> */
> xsk_addr->addr = desc->addr;
> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> + list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
> }
>
> len = desc->len;
> @@ -773,6 +774,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> struct xdp_desc *desc)
> {
> struct net_device *dev = xs->dev;
> + struct xsk_addr_node *xsk_addr;
> struct sk_buff *skb = xs->skb;
> int err;
>
> @@ -804,7 +806,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> if (unlikely(err))
> goto free_err;
>
> - xsk_skb_init_misc(skb, xs, desc->addr);
> + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> + if (!xsk_addr) {
> + err = -ENOMEM;
> + goto free_err;
> + }
> + xsk_skb_init_misc(skb, xs, xsk_addr, desc->addr);
> if (desc->options & XDP_TX_METADATA) {
> err = xsk_skb_metadata(skb, buffer, desc,
> xs->pool, hr);
> @@ -813,7 +820,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> }
> } else {
> int nr_frags = skb_shinfo(skb)->nr_frags;
> - struct xsk_addr_node *xsk_addr;
> struct page *page;
> u8 *vaddr;
>
> @@ -843,7 +849,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>
> xsk_addr->addr = desc->addr;
> - list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> + list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
> }
> }
>
> --
> 2.51.0
>