Message-ID: <aPu0WdUqZCB3xQgb@boxer>
Date: Fri, 24 Oct 2025 19:16:09 +0200
From: Maciej Fijalkowski <maciej.fijalkowski@...el.com>
To: Fernando Fernandez Mancera <fmancera@...e.de>
CC: <netdev@...r.kernel.org>, <csmate@....hu>, <kerneljasonxing@...il.com>,
	<bjorn@...nel.org>, <sdf@...ichev.me>, <jonathan.lemon@...il.com>,
	<bpf@...r.kernel.org>, <davem@...emloft.net>, <edumazet@...gle.com>,
	<kuba@...nel.org>, <pabeni@...hat.com>, <horms@...nel.org>
Subject: Re: [PATCH net v2] xsk: avoid data corruption on cq descriptor number

On Fri, Oct 24, 2025 at 12:40:49PM +0200, Fernando Fernandez Mancera wrote:
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the number of descriptors is stored in the skb control
> block, and xsk_cq_submit_addr_locked() relies on it to put the umem
> addresses onto the pool's completion queue.
> 
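For reference, the pre-patch bookkeeping described above, condensed from
the lines this diff removes, looks like this:

    /* The descriptor count and the addresses of any extra fragments
     * live in the 48-byte skb->cb scratch area; the first umem address
     * is carried separately in skb_shinfo(skb)->destructor_arg. */
    struct xsk_addr_head {
            u32 num_descs;
            struct list_head addrs_list;
    };

    #define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
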
> The skb control block shouldn't be used for this purpose: after
> transmit, xsk no longer has control over it and other subsystems can
> reuse it (see the macro sketch after the trace below). This leads to
> the following kernel panic due to a NULL pointer dereference.
> 
>  BUG: kernel NULL pointer dereference, address: 0000000000000000
>  #PF: supervisor read access in kernel mode
>  #PF: error_code(0x0000) - not-present page
>  PGD 0 P4D 0
>  Oops: Oops: 0000 [#1] SMP NOPTI
>  CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy)  Debian 6.16.12-1
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
>  RIP: 0010:xsk_destruct_skb+0xd0/0x180
>  [...]
>  Call Trace:
>   <IRQ>
>   ? napi_complete_done+0x7a/0x1a0
>   ip_rcv_core+0x1bb/0x340
>   ip_rcv+0x30/0x1f0
>   __netif_receive_skb_one_core+0x85/0xa0
>   process_backlog+0x87/0x130
>   __napi_poll+0x28/0x180
>   net_rx_action+0x339/0x420
>   handle_softirqs+0xdc/0x320
>   ? handle_edge_irq+0x90/0x1e0
>   do_softirq.part.0+0x3b/0x60
>   </IRQ>
>   <TASK>
>   __local_bh_enable_ip+0x60/0x70
>   __dev_direct_xmit+0x14e/0x1f0
>   __xsk_generic_xmit+0x482/0xb70
>   ? __remove_hrtimer+0x41/0xa0
>   ? __xsk_generic_xmit+0x51/0xb70
>   ? _raw_spin_unlock_irqrestore+0xe/0x40
>   xsk_sendmsg+0xda/0x1c0
>   __sys_sendto+0x1ee/0x200
>   __x64_sys_sendto+0x24/0x30
>   do_syscall_64+0x84/0x2f0
>   ? __pfx_pollwake+0x10/0x10
>   ? __rseq_handle_notify_resume+0xad/0x4c0
>   ? restore_fpregs_from_fpstate+0x3c/0x90
>   ? switch_fpu_return+0x5b/0xe0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>   </TASK>
>  [...]
>  Kernel panic - not syncing: Fatal exception in interrupt
>  Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
> 
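The clobbering mechanism is visible in the trace above: skb->cb is a
per-layer scratch area, and the IP receive path that runs before the
destructor fires overlays the very same bytes. A minimal sketch, pairing
the XSKCB() macro from net/xdp/xsk.c with IPCB() from include/net/ip.h:

    /* Both views alias the same 48-byte skb->cb array, so once
     * ip_rcv_core() (seen in the trace) fills in IPCB(skb), whatever
     * xsk_destruct_skb() later reads through XSKCB(skb) is garbage. */
    #define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
    #define IPCB(skb)  ((struct inet_skb_parm *)((skb)->cb))
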
> The proposed approach stores the first address in the xsk_addr_node as
> well, along with the number of descriptors. The head xsk_addr_node is
> referenced by skb_shinfo(skb)->destructor_arg. The remaining fragments
> store their addresses on the list.
> 
> This is less efficient, as 4 bytes are wasted when storing each address.
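
Counting alignment padding, the per-node cost is a little more than
those 4 bytes on 64-bit targets. A small standalone sketch (stand-in
list_head, x86-64 alignment assumed) mirroring the two layouts from the
diff below:

    #include <stdio.h>
    #include <stdint.h>

    struct list_head { void *next, *prev; };   /* 16 bytes on x86-64 */

    struct node_old {                  /* xsk_addr_node before patch */
            uint64_t addr;
            struct list_head addr_node;
    };

    struct node_new {                  /* xsk_addr_node after patch */
            uint32_t num_descs;        /* plus 4 bytes of padding */
            uint64_t addr;
            struct list_head addr_node;
    };

    int main(void)
    {
            /* prints "24 32": each list node grows by 8 bytes */
            printf("%zu %zu\n", sizeof(struct node_old),
                   sizeof(struct node_new));
            return 0;
    }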

Hi Fernando,

it's not about the 4 bytes being wasted but rather the memory allocation
you are introducing here. I tried hard to avoid hurting non-fragmented
traffic; below you can find the impact reported by Jason for an approach
similar to yours:
https://lore.kernel.org/bpf/CAL+tcoD=Gn6ZmJ+_Y48vPRyHVHmP-7irsx=fRVRnyhDrpTrEtQ@mail.gmail.com/

I assume this patch will yield a similar performance degradation...
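
The part that would hurt the common case is the new allocation on the
copy-mode TX path. Condensed from the xsk_build_skb() hunk below: every
packet, fragmented or not, now pays one slab allocation up front plus a
matching kmem_cache_free() on the completion path, whereas the current
code only writes skb->cb and destructor_arg here:

    /* runs once per skb, even for single-descriptor packets */
    xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
    if (!xsk_addr) {
            err = -ENOMEM;
            goto free_err;
    }
    xsk_skb_init_misc(skb, xs, xsk_addr, desc->addr);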

> 
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Signed-off-by: Fernando Fernandez Mancera <fmancera@...e.de>
> ---
> v2: fix erroneous page handling and fix typos in the commit message.
> ---
>  net/xdp/xsk.c | 52 ++++++++++++++++++++++++++++-----------------------
>  1 file changed, 29 insertions(+), 23 deletions(-)
> 
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..965cf071b036 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -37,18 +37,14 @@
>  #define MAX_PER_SOCKET_BUDGET 32
>  
>  struct xsk_addr_node {
> +	u32 num_descs;
>  	u64 addr;
>  	struct list_head addr_node;
>  };
>  
> -struct xsk_addr_head {
> -	u32 num_descs;
> -	struct list_head addrs_list;
> -};
> -
>  static struct kmem_cache *xsk_tx_generic_cache;
>  
> -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
> +#define XSK_TX_HEAD(skb) ((struct xsk_addr_node *)((skb_shinfo(skb)->destructor_arg)))
>  
>  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
>  {
> @@ -569,12 +565,11 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
>  	spin_lock_irqsave(&pool->cq_lock, flags);
>  	idx = xskq_get_prod(pool->cq);
>  
> -	xskq_prod_write_addr(pool->cq, idx,
> -			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
> +	xskq_prod_write_addr(pool->cq, idx, XSK_TX_HEAD(skb)->addr);
>  	descs_processed++;
>  
> -	if (unlikely(XSKCB(skb)->num_descs > 1)) {
> -		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> +	if (unlikely(XSK_TX_HEAD(skb)->num_descs > 1)) {
> +		list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
>  			xskq_prod_write_addr(pool->cq, idx + descs_processed,
>  					     pos->addr);
>  			descs_processed++;
> @@ -582,6 +577,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
>  			kmem_cache_free(xsk_tx_generic_cache, pos);
>  		}
>  	}
> +	kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
>  	xskq_prod_submit_n(pool->cq, descs_processed);
>  	spin_unlock_irqrestore(&pool->cq_lock, flags);
>  }
> @@ -597,12 +593,12 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
>  
>  static void xsk_inc_num_desc(struct sk_buff *skb)
>  {
> -	XSKCB(skb)->num_descs++;
> +	XSK_TX_HEAD(skb)->num_descs++;
>  }
>  
>  static u32 xsk_get_num_desc(struct sk_buff *skb)
>  {
> -	return XSKCB(skb)->num_descs;
> +	return XSK_TX_HEAD(skb)->num_descs;
>  }
>  
>  static void xsk_destruct_skb(struct sk_buff *skb)
> @@ -619,16 +615,16 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>  }
>  
>  static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
> -			      u64 addr)
> +			      struct xsk_addr_node *head, u64 addr)
>  {
> -	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> -	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
> +	INIT_LIST_HEAD(&head->addr_node);
> +	head->addr = addr;
> +	head->num_descs = 0;
>  	skb->dev = xs->dev;
>  	skb->priority = READ_ONCE(xs->sk.sk_priority);
>  	skb->mark = READ_ONCE(xs->sk.sk_mark);
> -	XSKCB(skb)->num_descs = 0;
>  	skb->destructor = xsk_destruct_skb;
> -	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> +	skb_shinfo(skb)->destructor_arg = (void *)head;
>  }
>  
>  static void xsk_consume_skb(struct sk_buff *skb)
> @@ -638,11 +634,12 @@ static void xsk_consume_skb(struct sk_buff *skb)
>  	struct xsk_addr_node *pos, *tmp;
>  
>  	if (unlikely(num_descs > 1)) {
> -		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> +		list_for_each_entry_safe(pos, tmp, &XSK_TX_HEAD(skb)->addr_node, addr_node) {
>  			list_del(&pos->addr_node);
>  			kmem_cache_free(xsk_tx_generic_cache, pos);
>  		}
>  	}
> +	kmem_cache_free(xsk_tx_generic_cache, XSK_TX_HEAD(skb));
>  
>  	skb->destructor = sock_wfree;
>  	xsk_cq_cancel_locked(xs->pool, num_descs);
> @@ -720,7 +717,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  
>  		skb_reserve(skb, hr);
>  
> -		xsk_skb_init_misc(skb, xs, desc->addr);
> +		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> +		if (!xsk_addr)
> +			return ERR_PTR(-ENOMEM);
> +
> +		xsk_skb_init_misc(skb, xs, xsk_addr, desc->addr);
>  		if (desc->options & XDP_TX_METADATA) {
>  			err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
>  			if (unlikely(err))
> @@ -736,7 +737,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  		 * would be dropped, which implies freeing all list elements
>  		 */
>  		xsk_addr->addr = desc->addr;
> -		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +		list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
>  	}
>  
>  	len = desc->len;
> @@ -773,6 +774,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  				     struct xdp_desc *desc)
>  {
>  	struct net_device *dev = xs->dev;
> +	struct xsk_addr_node *xsk_addr;
>  	struct sk_buff *skb = xs->skb;
>  	int err;
>  
> @@ -804,7 +806,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			if (unlikely(err))
>  				goto free_err;
>  
> -			xsk_skb_init_misc(skb, xs, desc->addr);
> +			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> +			if (!xsk_addr) {
> +				err = -ENOMEM;
> +				goto free_err;
> +			}
> +			xsk_skb_init_misc(skb, xs, xsk_addr, desc->addr);
>  			if (desc->options & XDP_TX_METADATA) {
>  				err = xsk_skb_metadata(skb, buffer, desc,
>  						       xs->pool, hr);
> @@ -813,7 +820,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			}
>  		} else {
>  			int nr_frags = skb_shinfo(skb)->nr_frags;
> -			struct xsk_addr_node *xsk_addr;
>  			struct page *page;
>  			u8 *vaddr;
>  
> @@ -843,7 +849,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>  			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>  
>  			xsk_addr->addr = desc->addr;
> -			list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +			list_add_tail(&xsk_addr->addr_node, &XSK_TX_HEAD(skb)->addr_node);
>  		}
>  	}
>  
> -- 
> 2.51.0
> 
