lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-Id: <20250701232915.377351-3-xiyou.wangcong@gmail.com>
Date: Tue,  1 Jul 2025 16:29:15 -0700
From: Cong Wang <xiyou.wangcong@...il.com>
To: netdev@...r.kernel.org
Cc: jhs@...atatu.com,
	jiri@...nulli.us,
	mincho@...ori.io,
	victor@...atatu.com,
	Cong Wang <xiyou.wangcong@...il.com>
Subject: [RFC Patch net-next 2/2] net_sched: Propagate per-qdisc max_segment_size for GSO segmentation

Introduce a max_segment_size field in struct Qdisc and a get_max_size()
callback in struct Qdisc_ops to support per-qdisc maximum segment size
constraints. When a qdisc (such as TBF or TAPRIO) requires a specific
maximum packet size, it implements get_max_size() to return the appropriate
limit. This value is then propagated up the qdisc tree, and the root qdisc
tracks the minimum max_segment_size required by any child.

During GSO segmentation at the root, the strictest max_segment_size is used
to ensure that all resulting segments comply with downstream qdisc
requirements. If no max_segment_size is set, segmentation falls back to the
device MTU. This guarantees that oversized segments are never enqueued into
child qdiscs, preventing unnecessary drops and maintaining atomicity.

This change enables robust, hierarchical enforcement of per-qdisc size
limits, improves correctness for advanced qdiscs like TBF and TAPRIO, and
lays the groundwork for further per-class or per-priority segmentation
policies in the future.

Reported-by: Mingi Cho <mincho@...ori.io>
Signed-off-by: Cong Wang <xiyou.wangcong@...il.com>
---
 include/net/sch_generic.h |  3 ++-
 net/core/dev.c            | 16 +++++++++++-----
 net/sched/sch_api.c       | 13 ++++++++++---
 net/sched/sch_taprio.c    | 23 ++++++++++++++++-------
 net/sched/sch_tbf.c       |  8 ++++++++
 5 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9c4082ccefb5..d740b803c921 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -108,7 +108,7 @@ struct Qdisc {
 	struct net_rate_estimator __rcu *rate_est;
 	struct gnet_stats_basic_sync __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
-	int			pad;
+	unsigned int		max_segment_size; /* minimum max_size over all child qdiscs; 0 = use device MTU */
 	refcount_t		refcnt;
 
 	/*
@@ -319,6 +319,7 @@ struct Qdisc_ops {
 						    u32 block_index);
 	u32			(*ingress_block_get)(struct Qdisc *sch);
 	u32			(*egress_block_get)(struct Qdisc *sch);
+	unsigned int		(*get_max_size)(struct Qdisc *sch, struct sk_buff *skb);
 
 	struct module		*owner;
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index 95627552488e..ba136d53f0f1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4059,11 +4059,17 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
 
 static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
 			     struct sk_buff **to_free,
-			     struct netdev_queue *txq)
+			     struct netdev_queue *txq, unsigned int mtu)
 {
+	unsigned int seg_limit = mtu;
 	int rc = NET_XMIT_SUCCESS;
 
-	if ((q->flags & TCQ_F_NEED_SEGMENT) && skb_is_gso(skb)) {
+	if (q->max_segment_size)
+		seg_limit = q->max_segment_size;
+
+	if ((q->flags & TCQ_F_NEED_SEGMENT) &&
+	    qdisc_pkt_len(skb) > seg_limit &&
+	    skb_is_gso(skb)) {
 		netdev_features_t features = netif_skb_features(skb);
 		struct sk_buff *segs, *nskb, *next;
 		struct sk_buff *fail_list = NULL;
@@ -4125,7 +4131,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			 * of q->seqlock to protect from racing with requeuing.
 			 */
 			if (unlikely(!nolock_qdisc_is_empty(q))) {
-				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+				rc = dev_qdisc_enqueue(skb, q, &to_free, txq, dev->mtu);
 				__qdisc_run(q);
 				qdisc_run_end(q);
 
@@ -4141,7 +4147,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			return NET_XMIT_SUCCESS;
 		}
 
-		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+		rc = dev_qdisc_enqueue(skb, q, &to_free, txq, dev->mtu);
 		qdisc_run(q);
 
 no_lock_out:
@@ -4195,7 +4201,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		WRITE_ONCE(q->owner, smp_processor_id());
-		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+		rc = dev_qdisc_enqueue(skb, q, &to_free, txq, dev->mtu);
 		WRITE_ONCE(q->owner, -1);
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 8a83e55ebc0d..357488b8f055 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1210,12 +1210,19 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		err = cops->graft(parent, cl, new, &old, extack);
 		if (err)
 			return err;
-		/* Propagate TCQ_F_NEED_SEGMENT to root Qdisc if needed */
+		/* Propagate TCQ_F_NEED_SEGMENT and max_segment_size to root Qdisc if needed */
 		if (new && (new->flags & TCQ_F_NEED_SEGMENT)) {
 			struct Qdisc *root = qdisc_root(parent);
-
-			if (root)
+			unsigned int child_max = 0;
+
+			if (new->ops->get_max_size)
+				child_max = new->ops->get_max_size(new, NULL);
+			if (root) {
+				if (!root->max_segment_size ||
+				    (child_max && child_max < root->max_segment_size))
+					root->max_segment_size = child_max;
 				root->flags |= TCQ_F_NEED_SEGMENT;
+			}
 		}
 		notify_and_destroy(net, skb, n, classid, old, new, extack);
 	}
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 6b02a6697378..4644781d3465 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -531,26 +531,34 @@ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
 	return txtime;
 }
 
-/* Devices with full offload are expected to honor this in hardware */
-static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
-					     struct sk_buff *skb)
+static unsigned int taprio_get_max_size(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
 	struct sched_gate_list *sched;
 	int prio = skb->priority;
-	bool exceeds = false;
+	unsigned int ret = 0;
 	u8 tc;
 
 	tc = netdev_get_prio_tc_map(dev, prio);
 
 	rcu_read_lock();
 	sched = rcu_dereference(q->oper_sched);
-	if (sched && skb->len > sched->max_frm_len[tc])
-		exceeds = true;
+	if (sched)
+		ret = sched->max_frm_len[tc];
 	rcu_read_unlock();
+	return ret;
+}
+
+/* Devices with full offload are expected to honor this in hardware */
+static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
+					     struct sk_buff *skb)
+{
+	unsigned int size = taprio_get_max_size(sch, skb);
 
-	return exceeds;
+	if (size)
+		return skb->len > size;
+	return false;
 }
 
 static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
@@ -2481,6 +2489,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
 	.enqueue	= taprio_enqueue,
 	.dump		= taprio_dump,
 	.dump_stats	= taprio_dump_stats,
+	.get_max_size	= taprio_get_max_size,
 	.owner		= THIS_MODULE,
 };
 MODULE_ALIAS_NET_SCH("taprio");
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 6200a6e70113..7b1abc465f4f 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -552,12 +552,20 @@ static const struct Qdisc_class_ops tbf_class_ops = {
 	.dump		=	tbf_dump_class,
 };
 
+static unsigned int tbf_get_max_size(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct tbf_sched_data *q = qdisc_priv(sch);
+
+	return q->max_size;
+}
+
 static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
 	.next		=	NULL,
 	.cl_ops		=	&tbf_class_ops,
 	.id		=	"tbf",
 	.priv_size	=	sizeof(struct tbf_sched_data),
 	.enqueue	=	tbf_enqueue,
+	.get_max_size	=	tbf_get_max_size,
 	.dequeue	=	tbf_dequeue,
 	.peek		=	qdisc_peek_dequeued,
 	.init		=	tbf_init,
-- 
2.34.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ