Message-ID: <20251107102853.1082118-5-dtatulea@nvidia.com>
Date: Fri, 7 Nov 2025 12:28:46 +0200
From: Dragos Tatulea <dtatulea@...dia.com>
To: Jakub Kicinski <kuba@...nel.org>, Jesper Dangaard Brouer
<hawk@...nel.org>, Andrew Lunn <andrew+netdev@...n.ch>, "David S. Miller"
<davem@...emloft.net>, Eric Dumazet <edumazet@...gle.com>, Paolo Abeni
<pabeni@...hat.com>, Alexei Starovoitov <ast@...nel.org>, Daniel Borkmann
<daniel@...earbox.net>, Andrii Nakryiko <andrii@...nel.org>, Martin KaFai Lau
<martin.lau@...ux.dev>, Eduard Zingerman <eddyz87@...il.com>, Song Liu
<song@...nel.org>, Yonghong Song <yonghong.song@...ux.dev>, John Fastabend
<john.fastabend@...il.com>, KP Singh <kpsingh@...nel.org>, Stanislav Fomichev
<sdf@...ichev.me>, Hao Luo <haoluo@...gle.com>, Jiri Olsa <jolsa@...nel.org>,
Simon Horman <horms@...nel.org>
CC: Dragos Tatulea <dtatulea@...dia.com>, Tariq Toukan <tariqt@...dia.com>,
<netdev@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
<bpf@...r.kernel.org>
Subject: [RFC 2/2] xdp: Delegate fast path return decision to page_pool
XDP uses the BPF_RI_F_RF_NO_DIRECT flag to mark contexts where direct
recycling is not allowed, even when the caller set the direct flag.
This is confusing and can lead to hard-to-detect races [1].
Furthermore, the page_pool already contains an internal mechanism,
page_pool_napi_local(), which checks whether it is safe to switch the
direct flag from off to on.
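For reference, that check looks roughly like the sketch below
(simplified from net/core/page_pool.c; the exact details, e.g. the
PREEMPT_RT handling, vary between kernel versions):

  static bool page_pool_napi_local(const struct page_pool *pool)
  {
          const struct napi_struct *napi;
          u32 cpuid;

          /* Direct recycling is only safe from softirq context ... */
          if (unlikely(!in_softirq()))
                  return false;

          cpuid = smp_processor_id();

          /* ... and only when the pool is consumed on this same CPU,
           * either via the pool's dedicated CPU or via its NAPI
           * instance.
           */
          if (READ_ONCE(pool->cpuid) == cpuid)
                  return true;

          napi = READ_ONCE(pool->p.napi);
          return napi && READ_ONCE(napi->list_owner) == cpuid;
  }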
This patch drops the use of the BPF_RI_F_RF_NO_DIRECT flag and always
calls the page_pool release path with the direct flag set to false.
The page_pool then decides on its own whether direct recycling is
safe. This is not free, but it is worth it to make the XDP code safer.
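Concretely, the decision point that now takes over is in the
page_pool put path, roughly (simplified sketch of
page_pool_put_unrefed_netmem() in net/core/page_pool.c, not the exact
code):

  void page_pool_put_unrefed_netmem(struct page_pool *pool,
                                    netmem_ref netmem,
                                    unsigned int dma_sync_size,
                                    bool allow_direct)
  {
          /* Callers that pass allow_direct == false can still get
           * direct recycling when page_pool proves it is safe.
           */
          if (!allow_direct)
                  allow_direct = page_pool_napi_local(pool);

          /* ... continue with the normal recycle path, now possibly
           * direct ...
           */
  }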
Regarding the performance impact, there are 3 cases to consider,
looking at __xdp_return() for the MEM_TYPE_PAGE_POOL case:
1) napi_direct == false:
- Before: 1 comparison in __xdp_return() + call of
page_pool_napi_local() from page_pool_put_unrefed_netmem().
- After: Only one call to page_pool_napi_local().
2) napi_direct == true && BPF_RI_F_RF_NO_DIRECT
- Before: 2 comparisons in __xdp_return() + call of
page_pool_napi_local() from page_pool_put_unrefed_netmem().
- After: Only one call to page_pool_napi_local().
3) napi_direct == true && !BPF_RI_F_RF_NO_DIRECT
- Before: 2 comparisons in __xdp_return().
- After: One call to page_pool_napi_local().
Cases 1 and 2 are the slower paths and can only gain from this change.
But they are slow anyway, so the gain is small.
Case 3 is the fast path and is the one that needs a closer look: the
two comparisons in __xdp_return() are traded for the more expensive
page_pool_napi_local() call.
The page_pool benchmark [2] was used to compare the existing fast path
with the newly added NAPI-aware mode, to measure how expensive
page_pool_napi_local() is:
bench_page_pool: time_bench_page_pool01_fast_path(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool01_fast_path Per elem: 15 cycles(tsc) 7.537 ns (step:0)
bench_page_pool: time_bench_page_pool04_napi_aware(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool04_napi_aware Per elem: 20 cycles(tsc) 10.490 ns (step:0)
... and the slow path for reference:
bench_page_pool: time_bench_page_pool02_ptr_ring(): in_serving_softirq fast-path
bench_page_pool: Type:tasklet_page_pool02_ptr_ring Per elem: 30 cycles(tsc) 15.395 ns (step:0)
So the impact on the fast path is small (~5 cycles, ~3 ns per returned
page in this benchmark), but not negligible. Keep in mind that the
napi_direct comparisons in __xdp_return() are dropped, so the actual
impact will be smaller than the benchmark numbers suggest.
[1] Commit 2b986b9e917b ("bpf, cpumap: Disable page_pool direct xdp_return need larger scope")
[2] Intel Xeon Platinum 8580
Signed-off-by: Dragos Tatulea <dtatulea@...dia.com>
---
drivers/net/veth.c | 2 --
include/linux/filter.h | 22 ----------------------
include/net/xdp.h | 2 +-
kernel/bpf/cpumap.c | 2 --
net/bpf/test_run.c | 2 --
net/core/filter.c | 2 +-
net/core/xdp.c | 24 ++++++++++++------------
7 files changed, 14 insertions(+), 42 deletions(-)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a3046142cb8e..6d5c1e0b05a7 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -975,7 +975,6 @@ static int veth_poll(struct napi_struct *napi, int budget)
bq.count = 0;
- xdp_set_return_frame_no_direct();
done = veth_xdp_rcv(rq, budget, &bq, &stats);
if (stats.xdp_redirect > 0)
@@ -994,7 +993,6 @@ static int veth_poll(struct napi_struct *napi, int budget)
if (stats.xdp_tx > 0)
veth_xdp_flush(rq, &bq);
- xdp_clear_return_frame_no_direct();
return done;
}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5c859b8131a..877e40d81a4c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -764,7 +764,6 @@ struct bpf_nh_params {
};
/* flags for bpf_redirect_info kern_flags */
-#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */
#define BPF_RI_F_RI_INIT BIT(1)
#define BPF_RI_F_CPU_MAP_INIT BIT(2)
#define BPF_RI_F_DEV_MAP_INIT BIT(3)
@@ -1163,27 +1162,6 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
-static inline bool xdp_return_frame_no_direct(void)
-{
- struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
-
- return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
-}
-
-static inline void xdp_set_return_frame_no_direct(void)
-{
- struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
-
- ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
-}
-
-static inline void xdp_clear_return_frame_no_direct(void)
-{
- struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
-
- ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
-}
-
static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
unsigned int pktlen)
{
diff --git a/include/net/xdp.h b/include/net/xdp.h
index aa742f413c35..2a44d84a7611 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -446,7 +446,7 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
}
void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
- bool napi_direct, struct xdp_buff *xdp);
+ struct xdp_buff *xdp);
void xdp_return_frame(struct xdp_frame *xdpf);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
void xdp_return_buff(struct xdp_buff *xdp);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 703e5df1f4ef..3ece03dc36bd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -253,7 +253,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
rcu_read_lock();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- xdp_set_return_frame_no_direct();
ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
if (unlikely(ret->skb_n))
@@ -263,7 +262,6 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (stats->redirect)
xdp_do_flush();
- xdp_clear_return_frame_no_direct();
bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock();
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 8b7d0b90fea7..a0fe03e9e527 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -289,7 +289,6 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
local_bh_disable();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
ri = bpf_net_ctx_get_ri();
- xdp_set_return_frame_no_direct();
for (i = 0; i < batch_sz; i++) {
page = page_pool_dev_alloc_pages(xdp->pp);
@@ -352,7 +351,6 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
err = ret;
}
- xdp_clear_return_frame_no_direct();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
return err;
diff --git a/net/core/filter.c b/net/core/filter.c
index 16105f52927d..5622ec5ac19c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4187,7 +4187,7 @@ static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
}
if (release) {
- __xdp_return(netmem, mem_type, false, zc_frag);
+ __xdp_return(netmem, mem_type, zc_frag);
} else {
if (!tail)
skb_frag_off_add(frag, shrink);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 9100e160113a..cf8eab699d9a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -431,18 +431,18 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
* of xdp_frames/pages in those cases.
*/
void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
- bool napi_direct, struct xdp_buff *xdp)
+ struct xdp_buff *xdp)
{
switch (mem_type) {
case MEM_TYPE_PAGE_POOL:
netmem = netmem_compound_head(netmem);
- if (napi_direct && xdp_return_frame_no_direct())
- napi_direct = false;
+
/* No need to check netmem_is_pp() as mem->type knows this a
* page_pool page
+ *
+ * page_pool can detect direct recycle.
*/
- page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
- napi_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
break;
case MEM_TYPE_PAGE_SHARED:
page_frag_free(__netmem_address(netmem));
@@ -471,10 +471,10 @@ void xdp_return_frame(struct xdp_frame *xdpf)
sinfo = xdp_get_shared_info_from_frame(xdpf);
for (u32 i = 0; i < sinfo->nr_frags; i++)
__xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
- false, NULL);
+ NULL);
out:
- __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
@@ -488,10 +488,10 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
sinfo = xdp_get_shared_info_from_frame(xdpf);
for (u32 i = 0; i < sinfo->nr_frags; i++)
__xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
- true, NULL);
+ NULL);
out:
- __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -542,7 +542,7 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
*/
void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
{
- __xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+ __xdp_return(netmem, xdp->rxq->mem.type, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frag);
@@ -556,10 +556,10 @@ void xdp_return_buff(struct xdp_buff *xdp)
sinfo = xdp_get_shared_info_from_buff(xdp);
for (u32 i = 0; i < sinfo->nr_frags; i++)
__xdp_return(skb_frag_netmem(&sinfo->frags[i]),
- xdp->rxq->mem.type, true, xdp);
+ xdp->rxq->mem.type, xdp);
out:
- __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
+ __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, xdp);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
--
2.50.1