Message-ID: <CANn89i+dL6JUpbZgJ9DEGeVWpmrfv9q=Y_daFvHAPM4ZsjinQg@mail.gmail.com>
Date: Mon, 10 Nov 2025 09:34:26 -0800
From: Eric Dumazet <edumazet@...gle.com>
To: Toke Høiland-Jørgensen <toke@...hat.com>
Cc: Jonas Köppeler <j.koeppeler@...berlin.de>, 
	"David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>, 
	Simon Horman <horms@...nel.org>, Jamal Hadi Salim <jhs@...atatu.com>, 
	Cong Wang <xiyou.wangcong@...il.com>, Jiri Pirko <jiri@...nulli.us>, 
	Kuniyuki Iwashima <kuniyu@...gle.com>, Willem de Bruijn <willemb@...gle.com>, netdev@...r.kernel.org, 
	eric.dumazet@...il.com
Subject: Re: [PATCH v1 net-next 5/5] net: dev_queue_xmit() llist adoption

On Mon, Nov 10, 2025 at 6:49 AM Toke Høiland-Jørgensen <toke@...hat.com> wrote:
>
> Eric Dumazet <edumazet@...gle.com> writes:
>
> > I can work on a patch today.
>
> This sounds like an excellent idea in any case - thanks! :)

The following patch (on top of my last series) seems to work for me:

 include/net/pkt_sched.h   |    5 +++--
 include/net/sch_generic.h |   24 +++++++++++++++++++++++-
 net/core/dev.c            |   33 +++++++++++++++++++--------------
 net/sched/sch_cake.c      |    4 +++-
 4 files changed, 48 insertions(+), 18 deletions(-)
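
Roughly, the new contract (a condensed sketch, not part of the patch;
it just restates what the __dev_xmit_skb() / net_tx_action() changes
below do): a TCQ_F_DEQUEUE_DROPS qdisc defers frees from ->dequeue()
onto q->to_free via qdisc_dequeue_drop(), qdisc_run()/qdisc_run_end()
hand the chain back, and the caller frees it once the qdisc locks have
been released:

	struct sk_buff *to_free;

	to_free = qdisc_run(q);		/* may run the ->dequeue() loop */
	/* ... qdisc seqlock / root lock released by this point ... */
	if (unlikely(to_free))
		kfree_skb_list_reason(to_free,
				      tcf_get_drop_reason(to_free));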

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 4678db45832a1e3bf7b8a07756fb89ab868bd5d2..e703c507d0daa97ae7c3bf131e322b1eafcc5664 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,

 void __qdisc_run(struct Qdisc *q);

-static inline void qdisc_run(struct Qdisc *q)
+static inline struct sk_buff *qdisc_run(struct Qdisc *q)
 {
        if (qdisc_run_begin(q)) {
                __qdisc_run(q);
-               qdisc_run_end(q);
+               return qdisc_run_end(q);
        }
+       return NULL;
 }

 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 79501499dafba56271b9ebd97a8f379ffdc83cac..19cd2bc13bdba48f941b1599f896c15c8c7860ae 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -88,6 +88,8 @@ struct Qdisc {
 #define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
 #define TCQ_F_NOLOCK           0x100 /* qdisc does not require locking */
 #define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
+#define TCQ_F_DEQUEUE_DROPS    0x400 /* ->dequeue() can drop packets in q->to_free */
+
        u32                     limit;
        const struct Qdisc_ops  *ops;
        struct qdisc_size_table __rcu *stab;
@@ -119,6 +121,8 @@ struct Qdisc {

                /* Note : we only change qstats.backlog in fast path. */
                struct gnet_stats_queue qstats;
+
+               struct sk_buff          *to_free;
        __cacheline_group_end(Qdisc_write);


@@ -218,8 +222,15 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
        return true;
 }

-static inline void qdisc_run_end(struct Qdisc *qdisc)
+static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
 {
+       struct sk_buff *to_free = NULL;
+
+       if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
+               to_free = qdisc->to_free;
+               if (to_free)
+                       qdisc->to_free = NULL;
+       }
        if (qdisc->flags & TCQ_F_NOLOCK) {
                spin_unlock(&qdisc->seqlock);

@@ -235,6 +246,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
        } else {
                WRITE_ONCE(qdisc->running, false);
        }
+       return to_free;
 }

 static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
@@ -1105,6 +1117,16 @@ static inline void tcf_set_drop_reason(const struct sk_buff *skb,
        tc_skb_cb(skb)->drop_reason = reason;
 }

+static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
+                                     enum skb_drop_reason reason)
+{
+       DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
+
+       tcf_set_drop_reason(skb, reason);
+       skb->next = q->to_free;
+       q->to_free = skb;
+}
+
 /* Instead of calling kfree_skb() while root qdisc lock is held,
  * queue the skb for future freeing at end of __dev_xmit_skb()
  */
diff --git a/net/core/dev.c b/net/core/dev.c
index ac994974e2a81889fcc0a2e664edcdb7cfd0496d..18cfcd765b1b3e4af1c5339e36df517e7abc914f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
 {
-       struct sk_buff *next, *to_free = NULL;
+       struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
        spinlock_t *root_lock = qdisc_lock(q);
        struct llist_node *ll_list, *first_n;
        unsigned long defer_count = 0;
@@ -4160,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                        if (unlikely(!nolock_qdisc_is_empty(q))) {
                                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                                __qdisc_run(q);
-                               qdisc_run_end(q);
+                               to_free2 = qdisc_run_end(q);

-                               goto no_lock_out;
+                               goto free_out;
                        }

                        qdisc_bstats_cpu_update(q, skb);
@@ -4170,18 +4170,15 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                            !nolock_qdisc_is_empty(q))
                                __qdisc_run(q);

-                       qdisc_run_end(q);
-                       return NET_XMIT_SUCCESS;
+                       to_free2 = qdisc_run_end(q);
+                       rc = NET_XMIT_SUCCESS;
+                       goto free_out;
                }

                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
-               qdisc_run(q);
+               to_free2 = qdisc_run(q);

-no_lock_out:
-               if (unlikely(to_free))
-                       kfree_skb_list_reason(to_free,
-                                             tcf_get_drop_reason(to_free));
-               return rc;
+               goto free_out;
        }

        /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
@@ -4239,7 +4236,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                qdisc_bstats_update(q, skb);
                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
                        __qdisc_run(q);
-               qdisc_run_end(q);
+               to_free2 = qdisc_run_end(q);
                rc = NET_XMIT_SUCCESS;
        } else {
                int count = 0;
@@ -4251,15 +4248,19 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                        rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                        count++;
                }
-               qdisc_run(q);
+               to_free2 = qdisc_run(q);
                if (count != 1)
                        rc = NET_XMIT_SUCCESS;
        }
 unlock:
        spin_unlock(root_lock);
+free_out:
        if (unlikely(to_free))
                kfree_skb_list_reason(to_free,
                                      tcf_get_drop_reason(to_free));
+       if (unlikely(to_free2))
+               kfree_skb_list_reason(to_free2,
+                                     tcf_get_drop_reason(to_free2));
        return rc;
 }

@@ -5741,6 +5742,7 @@ static __latent_entropy void net_tx_action(void)
        }

        if (sd->output_queue) {
+               struct sk_buff *to_free;
                struct Qdisc *head;

                local_irq_disable();
@@ -5780,9 +5782,12 @@ static __latent_entropy void net_tx_action(void)
                        }

                        clear_bit(__QDISC_STATE_SCHED, &q->state);
-                       qdisc_run(q);
+                       to_free = qdisc_run(q);
                        if (root_lock)
                                spin_unlock(root_lock);
+                       if (unlikely(to_free))
+                               kfree_skb_list_reason(to_free,
+                                             tcf_get_drop_reason(to_free));
                }

                rcu_read_unlock();
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 312f5b000ffb67d74faf70f26d808e26315b4ab8..a717cc4e0606e80123ec9c76331d454dad699b69 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -2183,7 +2183,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch)
                b->tin_dropped++;
                qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
                qdisc_qstats_drop(sch);
-               kfree_skb_reason(skb, reason);
+               qdisc_dequeue_drop(sch, skb, reason);
                if (q->rate_flags & CAKE_FLAG_INGRESS)
                        goto retry;
        }
@@ -2569,6 +2569,8 @@ static void cake_reconfigure(struct Qdisc *sch)

        sch->flags &= ~TCQ_F_CAN_BYPASS;

+       sch->flags |= TCQ_F_DEQUEUE_DROPS;
+
        q->buffer_limit = min(q->buffer_limit,
                              max(sch->limit * psched_mtu(qdisc_dev(sch)),
                                  q->buffer_config_limit));
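
For reference, opting a qdisc into the scheme is just the two steps the
sch_cake hunks above show (a minimal sketch, names as in the patch):

	/* at init/reconfigure time: advertise deferred drops */
	sch->flags |= TCQ_F_DEQUEUE_DROPS;

	/* in ->dequeue(), instead of kfree_skb_reason(skb, reason): */
	qdisc_qstats_drop(sch);
	qdisc_dequeue_drop(sch, skb, reason);	/* chains skb on sch->to_free */
	/* the chain is freed by the caller after qdisc_run_end() */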
