[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251121083256.674562-14-edumazet@google.com>
Date: Fri, 21 Nov 2025 08:32:55 +0000
From: Eric Dumazet <edumazet@...gle.com>
To: "David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>
Cc: Simon Horman <horms@...nel.org>, Jamal Hadi Salim <jhs@...atatu.com>,
Cong Wang <xiyou.wangcong@...il.com>, Jiri Pirko <jiri@...nulli.us>,
"Toke Høiland-Jørgensen" <toke@...hat.com>, Kuniyuki Iwashima <kuniyu@...gle.com>,
Willem de Bruijn <willemb@...gle.com>, netdev@...r.kernel.org, eric.dumazet@...il.com,
Eric Dumazet <edumazet@...gle.com>
Subject: [PATCH v3 net-next 13/14] net_sched: add qdisc_dequeue_drop() helper
Some qdisc like cake, codel, fq_codel might drop packets
in their dequeue() method.
This is currently problematic because dequeue() runs with
the qdisc spinlock held. Freeing skbs can be extremely expensive.
Add qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS
so that these qdiscs can opt-in to defer the skb frees
after the socket spinlock is released.
TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs
with an extra cache line miss.
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
include/net/pkt_sched.h | 5 +++--
include/net/sch_generic.h | 30 +++++++++++++++++++++++++++---
net/core/dev.c | 22 +++++++++++++---------
3 files changed, 43 insertions(+), 14 deletions(-)
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 4678db45832a1e3bf7b8a07756fb89ab868bd5d2..e703c507d0daa97ae7c3bf131e322b1eafcc5664 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
void __qdisc_run(struct Qdisc *q);
-static inline void qdisc_run(struct Qdisc *q)
+static inline struct sk_buff *qdisc_run(struct Qdisc *q)
{
if (qdisc_run_begin(q)) {
__qdisc_run(q);
- qdisc_run_end(q);
+ return qdisc_run_end(q);
}
+ return NULL;
}
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index b8092d0378a0cafa290123d17c1b0ba787cd0680..c3a7268b567e0abf3f38290cd4e3fa7cd0601e36 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -88,6 +88,8 @@ struct Qdisc {
#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */
+#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */
+
u32 limit;
const struct Qdisc_ops *ops;
struct qdisc_size_table __rcu *stab;
@@ -119,6 +121,8 @@ struct Qdisc {
/* Note : we only change qstats.backlog in fast path. */
struct gnet_stats_queue qstats;
+
+ struct sk_buff *to_free;
__cacheline_group_end(Qdisc_write);
@@ -218,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
return true;
}
-static inline void qdisc_run_end(struct Qdisc *qdisc)
+static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
{
+ struct sk_buff *to_free = NULL;
+
if (qdisc->flags & TCQ_F_NOLOCK) {
spin_unlock(&qdisc->seqlock);
@@ -232,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
if (unlikely(test_bit(__QDISC_STATE_MISSED,
&qdisc->state)))
__netif_schedule(qdisc);
- } else {
- WRITE_ONCE(qdisc->running, false);
+ return NULL;
+ }
+
+ if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
+ to_free = qdisc->to_free;
+ if (to_free)
+ qdisc->to_free = NULL;
}
+ WRITE_ONCE(qdisc->running, false);
+ return to_free;
}
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
@@ -1116,6 +1129,17 @@ static inline void tcf_kfree_skb_list(struct sk_buff *skb)
}
}
+static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
+ enum skb_drop_reason reason)
+{
+ DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
+ DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK);
+
+ tcf_set_drop_reason(skb, reason);
+ skb->next = q->to_free;
+ q->to_free = skb;
+}
+
/* Instead of calling kfree_skb() while root qdisc lock is held,
* queue the skb for future freeing at end of __dev_xmit_skb()
*/
diff --git a/net/core/dev.c b/net/core/dev.c
index e865cdb9b6966225072dc44a86610b9c7828bd8c..9094c0fb8c689cd5252274d839638839bfb7642e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
- struct sk_buff *next, *to_free = NULL;
+ struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
spinlock_t *root_lock = qdisc_lock(q);
struct llist_node *ll_list, *first_n;
unsigned long defer_count = 0;
@@ -4160,7 +4160,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
if (unlikely(!nolock_qdisc_is_empty(q))) {
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
__qdisc_run(q);
- qdisc_run_end(q);
+ to_free2 = qdisc_run_end(q);
goto free_skbs;
}
@@ -4170,12 +4170,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
!nolock_qdisc_is_empty(q))
__qdisc_run(q);
- qdisc_run_end(q);
- return NET_XMIT_SUCCESS;
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ goto free_skbs;
}
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
- qdisc_run(q);
+ to_free2 = qdisc_run(q);
goto free_skbs;
}
@@ -4234,7 +4235,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_bstats_update(q, skb);
if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
__qdisc_run(q);
- qdisc_run_end(q);
+ to_free2 = qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
int count = 0;
@@ -4246,7 +4247,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
count++;
}
- qdisc_run(q);
+ to_free2 = qdisc_run(q);
if (count != 1)
rc = NET_XMIT_SUCCESS;
}
@@ -4255,6 +4256,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
free_skbs:
tcf_kfree_skb_list(to_free);
+ tcf_kfree_skb_list(to_free2);
return rc;
}
@@ -5747,8 +5749,9 @@ static __latent_entropy void net_tx_action(void)
rcu_read_lock();
while (head) {
- struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
+ struct sk_buff *to_free;
+ struct Qdisc *q = head;
head = head->next_sched;
@@ -5775,9 +5778,10 @@ static __latent_entropy void net_tx_action(void)
}
clear_bit(__QDISC_STATE_SCHED, &q->state);
- qdisc_run(q);
+ to_free = qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
+ tcf_kfree_skb_list(to_free);
}
rcu_read_unlock();
--
2.52.0.460.gd25c4c69ec-goog
Powered by blists - more mailing lists