Message-ID: <CAL+tcoB9AUGLafYF0rMs7-+wFJPrTUzf1cbwy4R_hc_7Zs9B3Q@mail.gmail.com>
Date: Fri, 31 Oct 2025 17:51:48 +0800
From: Jason Xing <kerneljasonxing@...il.com>
To: Fernando Fernandez Mancera <fmancera@...e.de>
Cc: netdev@...r.kernel.org, csmate@....hu, maciej.fijalkowski@...el.com, 
	bjorn@...nel.org, sdf@...ichev.me, jonathan.lemon@...il.com, 
	bpf@...r.kernel.org, davem@...emloft.net, edumazet@...gle.com, 
	kuba@...nel.org, pabeni@...hat.com, horms@...nel.org
Subject: Re: [PATCH net v3] xsk: avoid data corruption on cq descriptor number

On Thu, Oct 30, 2025 at 10:04 PM Fernando Fernandez Mancera
<fmancera@...e.de> wrote:
>
> Since commit 30f241fcf52a ("xsk: Fix immature cq descriptor
> production"), the descriptor number is stored in skb control block and
> xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto
> pool's completion queue.
>
> The skb control block shouldn't be used for this purpose because, after
> transmit, xsk no longer has control over it and other subsystems could
> use it. This leads to the following kernel panic due to a NULL pointer
> dereference.
>
>  BUG: kernel NULL pointer dereference, address: 0000000000000000
>  #PF: supervisor read access in kernel mode
>  #PF: error_code(0x0000) - not-present page
>  PGD 0 P4D 0
>  Oops: Oops: 0000 [#1] SMP NOPTI
>  CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy)  Debian 6.16.12-1
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
>  RIP: 0010:xsk_destruct_skb+0xd0/0x180
>  [...]
>  Call Trace:
>   <IRQ>
>   ? napi_complete_done+0x7a/0x1a0
>   ip_rcv_core+0x1bb/0x340
>   ip_rcv+0x30/0x1f0
>   __netif_receive_skb_one_core+0x85/0xa0
>   process_backlog+0x87/0x130
>   __napi_poll+0x28/0x180
>   net_rx_action+0x339/0x420
>   handle_softirqs+0xdc/0x320
>   ? handle_edge_irq+0x90/0x1e0
>   do_softirq.part.0+0x3b/0x60
>   </IRQ>
>   <TASK>
>   __local_bh_enable_ip+0x60/0x70
>   __dev_direct_xmit+0x14e/0x1f0
>   __xsk_generic_xmit+0x482/0xb70
>   ? __remove_hrtimer+0x41/0xa0
>   ? __xsk_generic_xmit+0x51/0xb70
>   ? _raw_spin_unlock_irqrestore+0xe/0x40
>   xsk_sendmsg+0xda/0x1c0
>   __sys_sendto+0x1ee/0x200
>   __x64_sys_sendto+0x24/0x30
>   do_syscall_64+0x84/0x2f0
>   ? __pfx_pollwake+0x10/0x10
>   ? __rseq_handle_notify_resume+0xad/0x4c0
>   ? restore_fpregs_from_fpstate+0x3c/0x90
>   ? switch_fpu_return+0x5b/0xe0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   ? do_syscall_64+0x204/0x2f0
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>   </TASK>
>  [...]
>  Kernel panic - not syncing: Fatal exception in interrupt
>  Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
>
> Instead, use the skb destructor_arg pointer along with pointer tagging.
> As pointers are always aligned to 8 bytes, use the bottom bit to indicate
> whether this is a single address or an allocated struct containing
> several addresses.
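
For readers following the tagging scheme: since the pointers stored in
destructor_arg are 8-byte aligned (per the description above), bit 0 is
free to carry a "this is a plain address" flag. A minimal sketch of the
idea, using illustrative helper names rather than the patch's actual
functions:

	#include <stdbool.h>
	#include <stdint.h>

	#define ADDR_TAG 0x1UL	/* illustrative name: low bit marks "single address" */

	/* Store a single address directly in the pointer, tagged. */
	static inline void *pack_single_addr(uint64_t addr)
	{
		return (void *)((uintptr_t)addr | ADDR_TAG);
	}

	/* True if the pointer carries a tagged address rather than a struct. */
	static inline bool is_single_addr(const void *p)
	{
		return (uintptr_t)p & ADDR_TAG;
	}

	/* Recover the original address by clearing the tag bit. */
	static inline uint64_t unpack_single_addr(const void *p)
	{
		return (uint64_t)((uintptr_t)p & ~ADDR_TAG);
	}
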
>
> Fixes: 30f241fcf52a ("xsk: Fix immature cq descriptor production")
> Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
> Suggested-by: Jakub Kicinski <kuba@...nel.org>
> Signed-off-by: Fernando Fernandez Mancera <fmancera@...e.de>

I don't think we need this fix anymore if we can apply the series [1].
The fix I just proposed doesn't use any new bits to store anything, so
the problem will disappear.

[1]: https://lore.kernel.org/all/20251031093230.82386-1-kerneljasonxing@gmail.com/

Thanks,
Jason

> ---
> v2: remove some leftovers on skb_build and simplify fragmented traffic
> logic
>
> v3: drop skb extension approach, instead use pointer tagging in
> destructor_arg to know whether we have a single address or an allocated
> struct with multiple ones. Also, move from bpf to net as requested
>
> Note: tested with the crash reproducer and xdpsock tool
> ---
>  net/xdp/xsk.c | 130 ++++++++++++++++++++++++++++----------------------
>  1 file changed, 74 insertions(+), 56 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 7b0c68a70888..d7354a3e2545 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -36,20 +36,13 @@
>  #define TX_BATCH_SIZE 32
>  #define MAX_PER_SOCKET_BUDGET 32
>
> -struct xsk_addr_node {
> -       u64 addr;
> -       struct list_head addr_node;
> -};
> -
> -struct xsk_addr_head {
> +struct xsk_addrs {
>         u32 num_descs;
> -       struct list_head addrs_list;
> +       u64 addrs[MAX_SKB_FRAGS + 1];
>  };
>
>  static struct kmem_cache *xsk_tx_generic_cache;
>
> -#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
> -
>  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
>  {
>         if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> @@ -558,29 +551,53 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
>         return ret;
>  }
>
> +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
> +{
> +       return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
> +}
> +
> +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
> +{
> +       return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
> +}
> +
> +static u32 xsk_get_num_desc(struct sk_buff *skb)
> +{
> +       struct xsk_addrs *xsk_addr;
> +
> +       if (xsk_skb_destructor_is_addr(skb))
> +               return 1;
> +
> +       xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +
> +       return xsk_addr->num_descs;
> +}
> +
>  static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
>                                       struct sk_buff *skb)
>  {
> -       struct xsk_addr_node *pos, *tmp;
> +       u32 num_descs = xsk_get_num_desc(skb);
> +       struct xsk_addrs *xsk_addr;
>         u32 descs_processed = 0;
>         unsigned long flags;
> -       u32 idx;
> +       u32 idx, i;
>
>         spin_lock_irqsave(&pool->cq_lock, flags);
>         idx = xskq_get_prod(pool->cq);
>
> -       xskq_prod_write_addr(pool->cq, idx,
> -                            (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
> -       descs_processed++;
> +       if (unlikely(num_descs > 1)) {
> +               xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>
> -       if (unlikely(XSKCB(skb)->num_descs > 1)) {
> -               list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> +               for (i = 0; i < num_descs; i++) {
>                         xskq_prod_write_addr(pool->cq, idx + descs_processed,
> -                                            pos->addr);
> +                                            xsk_addr->addrs[i]);
>                         descs_processed++;
> -                       list_del(&pos->addr_node);
> -                       kmem_cache_free(xsk_tx_generic_cache, pos);
>                 }
> +               kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
> +       } else {
> +               xskq_prod_write_addr(pool->cq, idx,
> +                                    xsk_skb_destructor_get_addr(skb));
> +               descs_processed++;
>         }
>         xskq_prod_submit_n(pool->cq, descs_processed);
>         spin_unlock_irqrestore(&pool->cq_lock, flags);
> @@ -595,16 +612,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
>         spin_unlock_irqrestore(&pool->cq_lock, flags);
>  }
>
> -static void xsk_inc_num_desc(struct sk_buff *skb)
> -{
> -       XSKCB(skb)->num_descs++;
> -}
> -
> -static u32 xsk_get_num_desc(struct sk_buff *skb)
> -{
> -       return XSKCB(skb)->num_descs;
> -}
> -
>  static void xsk_destruct_skb(struct sk_buff *skb)
>  {
>         struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> @@ -621,27 +628,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>  static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
>                               u64 addr)
>  {
> -       BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
> -       INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
>         skb->dev = xs->dev;
>         skb->priority = READ_ONCE(xs->sk.sk_priority);
>         skb->mark = READ_ONCE(xs->sk.sk_mark);
> -       XSKCB(skb)->num_descs = 0;
>         skb->destructor = xsk_destruct_skb;
> -       skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
> +       skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
>  }
>
>  static void xsk_consume_skb(struct sk_buff *skb)
>  {
>         struct xdp_sock *xs = xdp_sk(skb->sk);
>         u32 num_descs = xsk_get_num_desc(skb);
> -       struct xsk_addr_node *pos, *tmp;
> +       struct xsk_addrs *xsk_addr;
>
>         if (unlikely(num_descs > 1)) {
> -               list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
> -                       list_del(&pos->addr_node);
> -                       kmem_cache_free(xsk_tx_generic_cache, pos);
> -               }
> +               xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +               kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
>         }
>
>         skb->destructor = sock_wfree;
> @@ -701,7 +703,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>  {
>         struct xsk_buff_pool *pool = xs->pool;
>         u32 hr, len, ts, offset, copy, copied;
> -       struct xsk_addr_node *xsk_addr;
>         struct sk_buff *skb = xs->skb;
>         struct page *page;
>         void *buffer;
> @@ -727,16 +728,27 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>                                 return ERR_PTR(err);
>                 }
>         } else {
> -               xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> -               if (!xsk_addr)
> -                       return ERR_PTR(-ENOMEM);
> +               struct xsk_addrs *xsk_addr;
> +
> +               if (xsk_skb_destructor_is_addr(skb)) {
> +                       xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> +                                                    GFP_KERNEL);
> +                       if (!xsk_addr)
> +                               return ERR_PTR(-ENOMEM);
> +
> +                       xsk_addr->num_descs = 1;
> +                       xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> +                       skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> +               } else {
> +                       xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> +               }
>
>                 /* in case of -EOVERFLOW that could happen below,
>                  * xsk_consume_skb() will release this node as whole skb
>                  * would be dropped, which implies freeing all list elements
>                  */
> -               xsk_addr->addr = desc->addr;
> -               list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +               xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> +               xsk_addr->num_descs++;
>         }
>
>         len = desc->len;
> @@ -813,7 +825,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>                         }
>                 } else {
>                         int nr_frags = skb_shinfo(skb)->nr_frags;
> -                       struct xsk_addr_node *xsk_addr;
> +                       struct xsk_addrs *xsk_addr;
>                         struct page *page;
>                         u8 *vaddr;
>
> @@ -828,11 +840,20 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>                                 goto free_err;
>                         }
>
> -                       xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> -                       if (!xsk_addr) {
> -                               __free_page(page);
> -                               err = -ENOMEM;
> -                               goto free_err;
> +                       if (xsk_skb_destructor_is_addr(skb)) {
> +                               xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
> +                                                            GFP_KERNEL);
> +                               if (!xsk_addr) {
> +                                       __free_page(page);
> +                                       err = -ENOMEM;
> +                                       goto free_err;
> +                               }
> +
> +                               xsk_addr->num_descs = 1;
> +                               xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
> +                               skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
> +                       } else {
> +                               xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
>                         }
>
>                         vaddr = kmap_local_page(page);
> @@ -842,13 +863,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>                         skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
>                         refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
>
> -                       xsk_addr->addr = desc->addr;
> -                       list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
> +                       xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
> +                       xsk_addr->num_descs++;
>                 }
>         }
>
> -       xsk_inc_num_desc(skb);
> -
>         return skb;
>
>  free_err:
> @@ -857,7 +876,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>
>         if (err == -EOVERFLOW) {
>                 /* Drop the packet */
> -               xsk_inc_num_desc(xs->skb);
>                 xsk_drop_skb(xs->skb);
>                 xskq_cons_release(xs->tx);
>         } else {
> @@ -1904,7 +1922,7 @@ static int __init xsk_init(void)
>                 goto out_pernet;
>
>         xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
> -                                                sizeof(struct xsk_addr_node),
> +                                                sizeof(struct xsk_addrs),
>                                                  0, SLAB_HWCACHE_ALIGN, NULL);
>         if (!xsk_tx_generic_cache) {
>                 err = -ENOMEM;
> --
> 2.51.0
>
