Message-Id: <20250530095957.43248-1-e.kubanski@partner.samsung.com>
Date: Fri, 30 May 2025 11:59:57 +0200
From: "e.kubanski" <e.kubanski@...tner.samsung.com>
To: netdev@...r.kernel.org, linux-kernel@...r.kernel.org
Cc: bjorn@...nel.org, magnus.karlsson@...el.com,
maciej.fijalkowski@...el.com, jonathan.lemon@...il.com, "e.kubanski"
<e.kubanski@...tner.samsung.com>
Subject: [PATCH bpf] xsk: Fix out of order segment free in
__xsk_generic_xmit()

Move the xsk completion queue descriptor write-back into the skb
destructor.

The descriptor management in the completion queue did not account for
situations where queue submission can happen out of order with respect
to the descriptor write-back.

__xsk_generic_xmit() was writing the descriptor into its slot right
after reserving the completion queue slot. If multiple CPUs access the
same completion queue after xmit, this can result in out-of-order
submission of an invalid descriptor batch: an SKB destructor call can
submit a batch that is still in use by another CPU instead of the
correctly transmitted one. This could result in a user-space <->
kernel-space data race.

The out-of-order sequence that must be prevented:

CPU A: Reservation + Descriptor Write
CPU B: Reservation + Descriptor Write
CPU B: Submit (publishes the first batch, which was reserved by CPU A)
CPU A: Submit (publishes the second batch, which was reserved by CPU B)

Move the descriptor write to the submission phase:

CPU A: Reservation (only advances the cached producer)
CPU B: Reservation (only advances the cached producer)
CPU B: Descriptor Write + Submit
CPU A: Descriptor Write + Submit

This solves the potential out-of-order freeing of xsk buffers.
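
For illustration, a minimal user-space model of the old flow (the ring
layout, sizes, and names below are invented for this example and are
not kernel code). Because the address is written at reservation time,
whichever CPU submits first publishes the other CPU's still-in-flight
address:

/*
 * Simplified model of the old scheme: the address is stored in the
 * ring at reservation time, submission only moves the producer index.
 */
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8
#define RING_MASK (RING_SIZE - 1)

static uint64_t desc[RING_SIZE];	/* completion ring entries */
static uint32_t cached_prod;		/* reservation cursor */
static uint32_t producer;		/* index visible to user space */

/* Reserve the next slot and write the address into it immediately. */
static void reserve_and_write(uint64_t addr)
{
	desc[cached_prod++ & RING_MASK] = addr;
}

/* Submission only bumps the producer by n entries. */
static void submit_n(uint32_t n)
{
	producer += n;
}

int main(void)
{
	reserve_and_write(0xAAAA);	/* CPU A: skb A still in flight */
	reserve_and_write(0xBBBB);	/* CPU B */

	/* CPU B's skb completes first and submits one entry... */
	submit_n(1);

	/* ...but the entry user space now sees is CPU A's address. */
	printf("published: 0x%llx (expected 0xbbbb)\n",
	       (unsigned long long)desc[(producer - 1) & RING_MASK]);
	return 0;
}

Compiled and run, this prints 0xaaaa rather than 0xbbbb, i.e. user
space is handed a buffer address that CPU A's skb still references.
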
Signed-off-by: Eryk Kubanski <e.kubanski@...tner.samsung.com>
Fixes: e6c4047f5122 ("xsk: Use xsk_buff_pool directly for cq functions")
---
include/linux/skbuff.h | 2 ++
net/xdp/xsk.c | 17 +++++++++++------
net/xdp/xsk_queue.h | 11 +++++++++++
3 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bb2b751d274a..6785faec0699 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -622,6 +622,8 @@ struct skb_shared_info {
void *destructor_arg;
};
+ u64 xsk_descs[MAX_SKB_FRAGS];
+
/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS];
};
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a2249fd2048a..f822393907da 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -526,24 +526,24 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&pool->cq_lock, flags);
- ret = xskq_prod_reserve_addr(pool->cq, addr);
+ ret = xskq_prod_reserve(pool->cq);
spin_unlock_irqrestore(&pool->cq_lock, flags);
return ret;
}
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u64 *descs, u32 n)
{
unsigned long flags;
spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_submit_n(pool->cq, n);
+ xskq_prod_write_submit_addr_n(pool->cq, descs, n);
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
@@ -570,7 +570,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
*compl->tx_timestamp = ktime_get_tai_fast_ns();
}
- xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+ xsk_cq_submit_locked(xdp_sk(skb->sk)->pool,
+ skb_shinfo(skb)->xsk_descs,
+ xsk_get_num_desc(skb));
sock_wfree(skb);
}
@@ -749,7 +751,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
+
xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
+ skb_shinfo(skb)->xsk_descs[xsk_get_num_desc(skb)] = desc->addr;
xsk_set_destructor_arg(skb);
return skb;
@@ -760,6 +764,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (err == -EOVERFLOW) {
/* Drop the packet */
+ skb_shinfo(xs->skb)->xsk_descs[xsk_get_num_desc(xs->skb)] = desc->addr;
xsk_set_destructor_arg(xs->skb);
xsk_drop_skb(xs->skb);
xskq_cons_release(xs->tx);
@@ -802,7 +807,7 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr))
+ if (xsk_cq_reserve_locked(xs->pool))
goto out;
skb = xsk_build_skb(xs, &desc);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..06ce89aae217 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -436,6 +436,17 @@ static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries)
__xskq_prod_submit(q, q->ring->producer + nb_entries);
}
+static inline void xskq_prod_write_submit_addr_n(struct xsk_queue *q, u64 *addrs, u32 nb_entries)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+ u32 prod = q->ring->producer;
+
+ for (u32 i = 0; i < nb_entries; ++i)
+ ring->desc[prod++ & q->ring_mask] = addrs[i];
+
+ __xskq_prod_submit(q, prod);
+}
+
static inline bool xskq_prod_is_empty(struct xsk_queue *q)
{
/* No barriers needed since data is not accessed */
--
2.34.1
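
As a companion to the diff above, a compact user-space sketch of the
fixed flow (the fake_skb struct, the mutex, and the helper names are
stand-ins invented for this example, not the kernel structures).
Reservation only advances a cached cursor; each packet carries its own
descriptor addresses, and the destructor writes them immediately before
advancing the producer, so the published window only ever contains
addresses of completed packets, whatever the completion order:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8
#define RING_MASK (RING_SIZE - 1)
#define MAX_DESCS 4

static uint64_t ring[RING_SIZE];
static uint32_t cached_prod;	/* advanced at reservation time */
static uint32_t producer;	/* advanced at submission time */
static pthread_mutex_t cq_lock = PTHREAD_MUTEX_INITIALIZER;

struct fake_skb {
	uint64_t xsk_descs[MAX_DESCS];	/* mirrors the per-skb stash */
	uint32_t num_descs;
};

/* Reservation only checks for space and moves the cached cursor. */
static int cq_reserve(void)
{
	int ret = 0;

	pthread_mutex_lock(&cq_lock);
	if (cached_prod - producer >= RING_SIZE)
		ret = -1;	/* no room in this simplified, consumer-less model */
	else
		cached_prod++;
	pthread_mutex_unlock(&cq_lock);
	return ret;
}

/* Destructor path: write this packet's addresses, then publish them. */
static void cq_write_submit(const struct fake_skb *skb)
{
	pthread_mutex_lock(&cq_lock);
	for (uint32_t i = 0; i < skb->num_descs; i++)
		ring[(producer + i) & RING_MASK] = skb->xsk_descs[i];
	producer += skb->num_descs;
	pthread_mutex_unlock(&cq_lock);
}

int main(void)
{
	struct fake_skb a = { .xsk_descs = { 0xAAAA }, .num_descs = 1 };
	struct fake_skb b = { .xsk_descs = { 0xBBBB }, .num_descs = 1 };

	cq_reserve();		/* CPU A reserves */
	cq_reserve();		/* CPU B reserves */

	cq_write_submit(&b);	/* B completes first: publishes 0xBBBB */
	cq_write_submit(&a);	/* A completes later: publishes 0xAAAA */

	printf("entry 0: 0x%llx, entry 1: 0x%llx\n",
	       (unsigned long long)ring[0], (unsigned long long)ring[1]);
	return 0;
}

Here entry 0 ends up holding 0xBBBB and entry 1 holds 0xAAAA: the slots
are filled in completion order, so neither CPU can expose the other's
in-flight address.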