From ec1dcd1d542880fdaa126a5d4d57147ce69c80d3 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 28 Jan 2026 15:01:27 -0800
Subject: [--tree name--] net: periodically flush the defer queues

Zero-copy skbs may sit in the defer skb queue forever. AFAIU we don't
see this for UDP in SW tests because over veth we end up hitting
skb_orphan_frags_rx(). But it should happen when we zero-copy Tx on a
real interface, which frees skbs via napi_consume_skb(). Since commit
6471658dc66c ("udp: use skb_attempt_defer_free()") we will queue the
skb with a ubuf to a remote core where it may never be freed.

If we make TCP defer freeing of Tx skbs this will be even more obvious
and will trigger over veth. In TCP the stack is traversed by an skb
clone, and freeing happens when the ACK comes in, so there is no lucky
skb_orphan_frags_rx() call.

This patch attempts a periodic scrub of the defer queues. Hopefully the
perf overhead is negligible under normal load, even on huge machines,
as the vast majority of the time we will only check that the timer is
already pending.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
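A small userspace model of the timer's mark-and-sweep pass, in case it
helps review (not part of the patch; struct queue, scan() and the
printout below are made up for illustration, and the "kick" frees the
queue inline instead of IPI'ing the owning CPU). It is meant to show
why the timer runs at half the sysctl period: the first pass over a
non-empty queue only marks it, the second pass kicks it, so a stuck
skb should be flushed after at most roughly one skb_defer_timeout_us.

#include <stdbool.h>
#include <stdio.h>

struct queue {
        int pending;            /* stands in for sdn->defer_count */
        bool needs_flush;       /* stands in for sdn->needs_flush */
};

/* One scan of a single queue; returns true when the queue gets kicked. */
static bool scan(struct queue *q)
{
        if (!q->pending) {
                q->needs_flush = false;
                return false;
        }
        if (q->needs_flush) {
                q->pending = 0;         /* model the remote flush */
                q->needs_flush = false;
                return true;
        }
        q->needs_flush = true;          /* first sighting: only mark */
        return false;
}

int main(void)
{
        struct queue q = { .pending = 1 };      /* one stuck skb */
        int pass;

        for (pass = 1; pass <= 3; pass++) {
                bool kick = scan(&q);

                printf("pass %d: kick=%d pending=%d\n", pass, kick, q.pending);
        }
        return 0;
}

Run as a normal C program it prints kick=0 on the first pass and
kick=1 on the second, after which the queue stays empty.
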
 Documentation/admin-guide/sysctl/net.rst |  13 ++
 include/net/hotdata.h                    |   1 +
 net/core/dev.h                           |   4 +
 net/core/dev.c                           |   1 +
 net/core/skbuff.c                        | 152 +++++++++++++++++++++++
 net/core/sysctl_net_core.c               |   8 ++
 6 files changed, 179 insertions(+)

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 19408da2390b..dbfae381818a 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -368,6 +368,19 @@ by the cpu which allocated them.
 
 Default: 128
 
+skb_defer_timeout_us
+--------------------
+
+Timeout in microseconds for the safety timer that flushes deferred skb
+queues. When skbs are deferred to a CPU that never runs network softirq
+(e.g., application threads that allocated TX skbs for zero-copy sends),
+they may remain queued indefinitely. This timer periodically triggers
+an IPI to the affected CPUs to drain their defer queues.
+
+Setting this to 0 disables the safety timer.
+
+Default: 20000 (20 milliseconds)
+
 optmem_max
 ----------
 
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 6632b1aa7584..fb738da24fe6 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -10,6 +10,7 @@
 struct skb_defer_node {
         struct llist_head defer_list;
         atomic_long_t defer_count;
+        u8 needs_flush;
 } ____cacheline_aligned_in_smp;
 
 /* Read mostly data used in network fast paths. */
diff --git a/net/core/dev.h b/net/core/dev.h
index 98793a738f43..e481b5e67363 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -366,6 +366,10 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi)
 
 void kick_defer_list_purge(unsigned int cpu);
 
+extern unsigned int sysctl_skb_defer_timeout_us;
+int sysctl_skb_defer_timeout(const struct ctl_table *table, int write,
+                             void *buffer, size_t *lenp, loff_t *ppos);
+
 #define XMIT_RECURSION_LIMIT 8
 
 #ifndef CONFIG_PREEMPT_RT
diff --git a/net/core/dev.c b/net/core/dev.c
index 43de5af0d6ec..e6e004f56c58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6801,6 +6801,7 @@ static void skb_defer_free_flush(void)
                 if (llist_empty(&sdn->defer_list))
                         continue;
                 atomic_long_set(&sdn->defer_count, 0);
+                WRITE_ONCE(sdn->needs_flush, 0);
                 free_list = llist_del_all(&sdn->defer_list);
 
                 llist_for_each_entry_safe(skb, next, free_list, ll_node) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 63c7c4519d63..4c234617d0ab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -86,6 +86,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -7219,6 +7220,152 @@ static void kfree_skb_napi_cache(struct sk_buff *skb)
         local_bh_enable();
 }
 
+/*
+ * Deferred skb flush timer.
+ *
+ * SKBs may get stuck in the defer queue if the originating CPU never runs
+ * network softirq (e.g., application threads that allocated TX skbs).
+ * This timer periodically flushes aged skbs from all defer queues.
+ *
+ * State machine:
+ *   DISABLED -> IDLE:     sysctl changed from 0 to non-zero
+ *   IDLE -> ACTIVE:       first defer schedules the timer
+ *   IDLE -> DISABLED:     sysctl set to 0 while timer idle
+ *   ACTIVE -> RECHECK:    timer fires, reschedules to check for races
+ *   ACTIVE -> DISABLED:   timer fires, sees timeout=0
+ *   RECHECK -> ACTIVE:    new defer arrives, timer will continue
+ *   RECHECK -> IDLE:      timer fires again with no new work
+ *   RECHECK -> DISABLED:  timer fires, sees timeout=0
+ *
+ * IDLE is 0 so kick_defer_safety_timer() fast path is a single comparison.
+ * DISABLED is -1, outside the IDLE/ACTIVE/RECHECK cycle.
+ */
+enum {
+        SKB_DEFER_TIMER_DISABLED = -1,
+        SKB_DEFER_TIMER_IDLE = 0,
+        SKB_DEFER_TIMER_ACTIVE = 1,
+        SKB_DEFER_TIMER_RECHECK = 2,
+        __SKB_DEFER_TIMER_STATE_CNT
+};
+
+static atomic_t skb_defer_timer_state = ATOMIC_INIT(SKB_DEFER_TIMER_DISABLED);
+static struct hrtimer skb_defer_flush_timer;
+unsigned int sysctl_skb_defer_timeout_us __read_mostly = 20000;
+
+static void skb_defer_flush_aged(void)
+{
+        struct skb_defer_node *sdn;
+        int cpu;
+
+        for_each_online_cpu(cpu) {
+                bool kick = false;
+                int node;
+
+                for_each_node(node) {
+                        sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + node;
+
+                        if (!atomic_long_read(&sdn->defer_count)) {
+                                WRITE_ONCE(sdn->needs_flush, 0);
+                                continue;
+                        }
+
+                        kick |= READ_ONCE(sdn->needs_flush);
+                        WRITE_ONCE(sdn->needs_flush, 1);
+                }
+                if (kick)
+                        kick_defer_list_purge(cpu);
+        }
+}
+
+static enum hrtimer_restart skb_defer_flush_timer_fn(struct hrtimer *timer)
+{
+        enum hrtimer_restart ret;
+        unsigned int timeout;
+        int new_state, state;
+        ktime_t delay;
+
+        state = atomic_read(&skb_defer_timer_state);
+        timeout = READ_ONCE(sysctl_skb_defer_timeout_us);
+        if (!timeout) {
+                atomic_set(&skb_defer_timer_state, SKB_DEFER_TIMER_DISABLED);
+                return HRTIMER_NORESTART;
+        }
+
+        WARN_ON_ONCE(state == SKB_DEFER_TIMER_IDLE);
+        WARN_ON_ONCE(state == SKB_DEFER_TIMER_DISABLED);
+
+        /* State machine: ACTIVE (1) -> RECHECK (2) -> IDLE (0) */
+        new_state = (state + 1) % __SKB_DEFER_TIMER_STATE_CNT;
+        atomic_cmpxchg(&skb_defer_timer_state, state, new_state);
+
+        if (new_state == SKB_DEFER_TIMER_IDLE) {
+                ret = HRTIMER_NORESTART;
+        } else {
+                delay = ns_to_ktime((u64)timeout * (NSEC_PER_USEC / 2));
+                hrtimer_forward_now(timer, delay);
+                ret = HRTIMER_RESTART;
+        }
+
+        skb_defer_flush_aged();
+
+        return ret;
+}
+
+static DEFINE_SPINLOCK(skb_defer_sysctl_lock);
+
+int sysctl_skb_defer_timeout(const struct ctl_table *table, int write,
+                             void *buffer, size_t *lenp, loff_t *ppos)
+{
+        unsigned int old_timeout;
+        int ret;
+
+        if (!write)
+                return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+        spin_lock(&skb_defer_sysctl_lock);
+
+        old_timeout = sysctl_skb_defer_timeout_us;
+        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        if (ret)
+                goto unlock;
+
+        if (!old_timeout && sysctl_skb_defer_timeout_us)
+                atomic_cmpxchg(&skb_defer_timer_state,
+                               SKB_DEFER_TIMER_DISABLED,
+                               SKB_DEFER_TIMER_IDLE);
+        if (old_timeout && !sysctl_skb_defer_timeout_us)
+                atomic_cmpxchg(&skb_defer_timer_state,
+                               SKB_DEFER_TIMER_IDLE,
+                               SKB_DEFER_TIMER_DISABLED);
unlock:
+        spin_unlock(&skb_defer_sysctl_lock);
+        return ret;
+}
+
+static void kick_defer_safety_timer(void)
+{
+        unsigned int timeout;
+        ktime_t delay;
+        int state;
+
+        state = atomic_read(&skb_defer_timer_state);
+        if (likely(state))
+                return; /* already running (or disabled) */
+
+        timeout = READ_ONCE(sysctl_skb_defer_timeout_us);
+        if (!timeout)
+                return;
+
+        state = atomic_cmpxchg(&skb_defer_timer_state, SKB_DEFER_TIMER_IDLE,
+                               SKB_DEFER_TIMER_ACTIVE);
+        if (state != SKB_DEFER_TIMER_IDLE)
+                return;
+
+        /* Half the sysctl period, one cycle marks and the second will flush */
+        delay = ns_to_ktime((u64)timeout * (NSEC_PER_USEC / 2));
+        hrtimer_start(&skb_defer_flush_timer, delay, HRTIMER_MODE_REL);
+}
+
 /**
  * skb_attempt_defer_free - queue skb for remote freeing
  * @skb: buffer
@@ -7264,6 +7411,8 @@ nodefer: kfree_skb_napi_cache(skb);
          */
         if (unlikely(kick))
                 kick_defer_list_purge(cpu);
+        else
+                kick_defer_safety_timer();
 }
 
 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
@@ -7439,4 +7588,7 @@ void __init skb_init(void)
                                                 SKB_SMALL_HEAD_HEADROOM,
                                                 NULL);
         skb_extensions_init();
+
+        hrtimer_setup(&skb_defer_flush_timer, skb_defer_flush_timer_fn,
+                      CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 }
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 03aea10073f0..af8528a2337c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -645,6 +645,14 @@ static struct ctl_table net_core_table[] = {
                 .proc_handler = proc_dointvec_minmax,
                 .extra1 = SYSCTL_ZERO,
         },
+        {
+                .procname = "skb_defer_timeout_us",
+                .data = &sysctl_skb_defer_timeout_us,
+                .maxlen = sizeof(unsigned int),
+                .mode = 0644,
+                .proc_handler = sysctl_skb_defer_timeout,
+                .extra1 = SYSCTL_ZERO,
+        },
 };
 
 static struct ctl_table netns_core_table[] = {
-- 
2.52.0