Message-ID: <aRtIiIvfVwJCmcn1@boxer>
Date: Mon, 17 Nov 2025 17:08:43 +0100
From: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
To: Fernando Fernandez Mancera <fmancera@...e.de>
CC: <netdev@...r.kernel.org>, <csmate@....hu>, <kerneljasonxing@...il.com>,
	<bjorn@...nel.org>, <sdf@...ichev.me>, <jonathan.lemon@...il.com>,
	<bpf@...r.kernel.org>, <davem@...emloft.net>, <edumazet@...gle.com>,
	<kuba@...nel.org>, <pabeni@...hat.com>, <horms@...nel.org>
Subject: Re: [PATCH net v3] xsk: avoid data corruption on cq descriptor number

On Thu, Oct 30, 2025 at 03:03:55PM +0100, Fernando Fernandez Mancera wrote:
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is stored in the skb control block
> and xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> the pool's completion queue.
> 
> The skb control block shouldn't be used for this purpose: after
> transmit, xsk no longer has control over it and other subsystems may
> reuse it. This leads to the following kernel panic due to a NULL
> pointer dereference.
> 
>  BUG: kernel NULL pointer dereference, address: 0000000000000000
>  #PF: supervisor read access in kernel mode
>  #PF: error_code(0x0000) - not-present page
>  PGD 0 P4D 0
>  Oops: Oops: 0000 [#1] SMP NOPTI
>  CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy)  Debian 6.16.12-1
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
>  RIP: 0010:xsk_destruct_skb+0xd0/0x180
>  [...]
>  Call Trace:
>   <IRQ>
>   ? napi_complete_done+0x7a/0x1a0
>   ip_rcv_core+0x1bb/0x340
>   ip_rcv+0x30/0x1f0
>   __netif_receive_skb_one_core+0x85/0xa0
>   process_backlog+0x87/0x130
>   __napi_poll+0x28/0x180
>   net_rx_action+0x339/0x420
>   handle_softirqs+0xdc/0x320
>   ? handle_edge_irq+0x90/0x1e0
>   do_softirq.part.0+0x3b/0x60
>   </IRQ>
>   <TASK>
>   __local_bh_enable_ip+0x60/0x70
>   __dev_direct_xmit+0x14e/0x1f0
>   __xsk_generic_xmit+0x482/0xb70
>   ? __remove_hrtimer+0x41/0xa0
>   ? __xsk_generic_xmit+0x51/0xb70
>   ? _raw_spin_unlock_irqrestore+0xe/0x40
>   xsk_sendmsg+0xda/0x1c0
>   __sys_sendto+0x1ee/0x200
>   __x64_sys_sendto+0x24/0x30
>   do_syscall_64+0x84/0x2f0
>   ? __pfx_pollwake+0x10/0x10
>   ? __rseq_handle_notify_resume+0xad/0x4c0
>   ? restore_fpregs_from_fpstate+0x3c/0x90
>   ? switch_fpu_return+0x5b/0xe0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>   </TASK>
>  [...]
>  Kernel panic - not syncing: Fatal exception in interrupt
>  Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> 
> Instead, use the skb destructor_arg pointer along with pointer tagging.
> As pointers are always aligned to 8B, use the bottom bit to indicate
> whether this is a single address or an allocated struct containing
> several addresses.
> 
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Suggested-by: Jakub Kicinski <kuba@...nel.org>
> Signed-off-by: Fernando Fernandez Mancera <fmancera@...e.de>

Tested-by: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@...el.com>

Fernando, thanks for stepping in and providing this fix!
And thanks, Jakub, for the ptr tagging trick.
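
For anyone following along: the trick works because slab-allocated
pointers are at least 8-byte aligned, so the bottom bit of
destructor_arg is free to act as a discriminant (the sketch below
additionally assumes the stored umem address itself has a zero low
bit). A minimal user-space sketch of the scheme, with hypothetical
helper names, simplified from the patch below:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct xsk_addrs {			/* multi-descriptor case */
	uint32_t num_descs;
	uint64_t addrs[18];		/* stand-in for MAX_SKB_FRAGS + 1,
					 * assuming MAX_SKB_FRAGS == 17 */
};

static int is_single_addr(void *arg)	/* bit set -> a tagged address */
{
	return (uintptr_t)arg & 0x1UL;
}

static void *tag_single_addr(uint64_t addr)
{
	return (void *)(uintptr_t)(addr | 0x1UL);
}

static uint64_t untag_single_addr(void *arg)
{
	return (uint64_t)((uintptr_t)arg & ~0x1UL);
}

int main(void)
{
	/* common path: one descriptor, no allocation, just tag the addr */
	void *arg = tag_single_addr(0x4000);

	if (is_single_addr(arg))
		printf("single addr: 0x%llx\n",
		       (unsigned long long)untag_single_addr(arg));

	/* first frag: promote to an allocated struct; its pointer has a
	 * zero low bit, which distinguishes it from a tagged address */
	struct xsk_addrs *xa = calloc(1, sizeof(*xa));

	if (!xa)
		return 1;
	xa->addrs[xa->num_descs++] = untag_single_addr(arg);
	xa->addrs[xa->num_descs++] = 0x5000;
	arg = xa;

	if (!is_single_addr(arg))
		printf("%u addrs in struct\n",
		       ((struct xsk_addrs *)arg)->num_descs);

	free(xa);
	return 0;
}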

@BPF maintainers, please apply this patch.
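
For readers arriving from the crash report, a bit of background on the
root cause: skb->cb is a 48-byte scratch area shared by the whole
stack, and once the skb leaves xsk's hands any later layer may overlay
its own control block on those bytes. A toy user-space illustration of
the clobbering (struct names hypothetical):

#include <stdio.h>
#include <string.h>

struct fake_skb {
	char cb[48];		/* like sk_buff::cb: shared scratch space */
};

struct xsk_state { unsigned int num_descs; };	/* what xsk stashed */
struct other_state { void *ptrs[4]; };	/* what a later layer stores */

int main(void)
{
	struct fake_skb skb;

	memset(&skb, 0, sizeof(skb));

	/* xsk records its descriptor count in cb... */
	((struct xsk_state *)skb.cb)->num_descs = 3;

	/* ...a later subsystem reuses the very same bytes... */
	memset(skb.cb, 0xff, sizeof(struct other_state));

	/* ...and the count is garbage by the time the destructor runs */
	printf("num_descs now: %u\n",
	       ((struct xsk_state *)skb.cb)->num_descs);
	return 0;
}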

> ---
> v2: remove some leftovers in skb_build and simplify the fragmented
> traffic logic
> 
> v3: drop the skb extension approach; instead, use pointer tagging in
> destructor_arg to know whether we have a single address or an allocated
> struct with multiple ones. Also, move from bpf to net as requested
> 
> Note: tested with the crash reproducer and xdpsock tool
> ---
>  net/xdp/xsk.c | 130 ++++++++++++++++++++++++++++----------------------
>  1 file changed, 74 insertions(+), 56 deletions(-)
> 
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..d7354a3e2545 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,20 +36,13 @@
>  #define TX_BATCH_SIZE 32
>  #define MAX_PER_SOCKET_BUDGET 32
>  
> -struct xsk_addr_node {
> -	u64 addr;
> -	struct list_head addr_node;
> -};
> -
> -struct xsk_addr_head {
> +struct xsk_addrs {
>  	u32 num_descs;
> -	struct list_head addrs_list;
> +	u64 addrs[MAX_SKB_FRAGS + 1];
>  };
>  
>  static struct kmem_cache *xsk_tx_generic_cache;
>  
> -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
> -
>  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
>  {
>  	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -558,29 +551,53 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
>  	return ret;
>  }
>  
> +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
> +{
> +	return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
> +}
> +
> +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
> +{
> +	return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
> +}
> +
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> +{
> +	struct xsk_addrs *xsk_addr;
> +
> +	if (xsk_skb_destructor_is_addr(skb))
> +		return 1;
> +
> +	xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +
> +	return xsk_addr->num_descs;
> +}
> +
>  static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
>  				      struct sk_buff *skb)
>  {
> -	struct xsk_addr_node *pos, *tmp;
> +	u32 num_descs = xsk_get_num_desc(skb);
> +	struct xsk_addrs *xsk_addr;
>  	u32 descs_processed = 0;
>  	unsigned long flags;
> -	u32 idx;
> +	u32 idx, i;
>  
>  	spin_lock_irqsave(&pool->cq_lock, flags);
>  	idx = xskq_get_prod(pool->cq);
>  
> -	xskq_prod_write_addr(pool->cq, idx,
> -			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
> -	descs_processed++;
> +	if (unlikely(num_descs > 1)) {
> +		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>  
> -	if (unlikely(XSKCB(skb)->num_descs > 1)) {
> -		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> +		for (i = 0; i < num_descs; i++) {
>  			xskq_prod_write_addr(pool->cq, idx + descs_processed,
> -					     pos->addr);
> +					     xsk_addr->addrs[i]);
>  			descs_processed++;
> -			list_del(&pos->addr_node);
> -			kmem_cache_free(xsk_tx_generic_cache, pos);
>  		}
> +		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
> +	} else {
> +		xskq_prod_write_addr(pool->cq, idx,
> +				     xsk_skb_destructor_get_addr(skb));
> +		descs_processed++;
>  	}
>  	xskq_prod_submit_n(pool->cq, descs_processed);
>  	spin_unlock_irqrestore(&pool->cq_lock, flags);
> @@ -595,16 +612,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
>  	spin_unlock_irqrestore(&pool->cq_lock, flags);
>  }
>  
> -static void xsk_inc_num_desc(struct sk_buff *skb)
> -{
> -	XSKCB(skb)->num_descs++;
> -}
> -
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> -	return XSKCB(skb)->num_descs;
> -}
> -
>  static void xsk_destruct_skb(struct sk_buff *skb)
>  {
>  	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> @@ -621,27 +628,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>  static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
>  			      u64 addr)
>  {
> -	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> -	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
>  	skb->dev = xs->dev;
>  	skb->priority = READ_ONCE(xs->sk.sk_priority);
>  	skb->mark = READ_ONCE(xs->sk.sk_mark);
> -	XSKCB(skb)->num_descs = 0;
>  	skb->destructor = xsk_destruct_skb;
> -	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> +	skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
>  }
>  
>  static void xsk_consume_skb(struct sk_buff *skb)
>  {
>  	struct xdp_sock *xs = xdp_sk(skb->sk);
>  	u32 num_descs = xsk_get_num_desc(skb);
> -	struct xsk_addr_node *pos, *tmp;
> +	struct xsk_addrs *xsk_addr;
>  
>  	if (unlikely(num_descs > 1)) {
> -		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> -			list_del(&pos->addr_node);
> -			kmem_cache_free(xsk_tx_generic_cache, pos);
> -		}
> +		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
>  	}
>  
>  	skb->destructor = sock_wfree;
> @@ -701,7 +703,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  {
>  	struct xsk_buff_pool *pool = xs->pool;
>  	u32 hr, len, ts, offset, copy, copied;
> -	struct xsk_addr_node *xsk_addr;
>  	struct sk_buff *skb = xs->skb;
>  	struct page *page;
>  	void *buffer;
> @@ -727,16 +728,27 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  				return ERR_PTR(err);
>  		}
>  	} else {
> -		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> -		if (!xsk_addr)
> -			return ERR_PTR(-ENOMEM);
> +		struct xsk_addrs *xsk_addr;
> +
> +		if (xsk_skb_destructor_is_addr(skb)) {
> +			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> +						     GFP_KERNEL);
> +			if (!xsk_addr)
> +				return ERR_PTR(-ENOMEM);
> +
> +			xsk_addr->num_descs = 1;
> +			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> +			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> +		} else {
> +			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +		}
>  
>  		/* in case of -EOVERFLOW that could happen below,
>  		 * xsk_consume_skb() will release this node as whole skb
>  		 * would be dropped, which implies freeing all list elements
>  		 */
> -		xsk_addr->addr = desc->addr;
> -		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +		xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> +		xsk_addr->num_descs++;
>  	}
>  
>  	len = desc->len;
> @@ -813,7 +825,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			}
>  		} else {
>  			int nr_frags = skb_shinfo(skb)->nr_frags;
> -			struct xsk_addr_node *xsk_addr;
> +			struct xsk_addrs *xsk_addr;
>  			struct page *page;
>  			u8 *vaddr;
>  
> @@ -828,11 +840,20 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  				goto free_err;
>  			}
>  
> -			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> -			if (!xsk_addr) {
> -				__free_page(page);
> -				err = -ENOMEM;
> -				goto free_err;
> +			if (xsk_skb_destructor_is_addr(skb)) {
> +				xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> +							     GFP_KERNEL);
> +				if (!xsk_addr) {
> +					__free_page(page);
> +					err = -ENOMEM;
> +					goto free_err;
> +				}
> +
> +				xsk_addr->num_descs = 1;
> +				xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> +				skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> +			} else {
> +				xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>  			}
>  
>  			vaddr = kmap_local_page(page);
> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
>  			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>  
> -			xsk_addr->addr = desc->addr;
> -			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +			xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> +			xsk_addr->num_descs++;
>  		}
>  	}
>  
> -	xsk_inc_num_desc(skb);
> -
>  	return skb;
>  
>  free_err:
> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  
>  	if (err == -EOVERFLOW) {
>  		/* Drop the packet */
> -		xsk_inc_num_desc(xs->skb);
>  		xsk_drop_skb(xs->skb);
>  		xskq_cons_release(xs->tx);
>  	} else {
> @@ -1904,7 +1922,7 @@ static int __init xsk_init(void)
>  		goto out_pernet;
>  
>  	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> -						 sizeof(struct xsk_addr_node),
> +						 sizeof(struct xsk_addrs),
>  						 0, SLAB_HWCACHE_ALIGN, NULL);
>  	if (!xsk_tx_generic_cache) {
>  		err = -ENOMEM;
> -- 
> 2.51.0
> 
