lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <966ed106-1b78-4c7b-964a-d3d7ffcb09f0@oracle.com>
Date: Wed, 11 Jun 2025 21:46:28 +0530
From: ALOK TIWARI <alok.a.tiwari@...cle.com>
To: chia-yu.chang@...ia-bell-labs.com, donald.hunter@...il.com,
        xandfury@...il.com, netdev@...r.kernel.org, dave.taht@...il.com,
        pabeni@...hat.com, jhs@...atatu.com, kuba@...nel.org,
        stephen@...workplumber.org, xiyou.wangcong@...il.com, jiri@...nulli.us,
        davem@...emloft.net, edumazet@...gle.com, horms@...nel.org,
        andrew+netdev@...n.ch, ast@...erby.net, liuhangbin@...il.com,
        shuah@...nel.org, linux-kselftest@...r.kernel.org, ij@...nel.org,
        ncardwell@...gle.com, koen.de_schepper@...ia-bell-labs.com,
        g.white@...lelabs.com, ingemar.s.johansson@...csson.com,
        mirja.kuehlewind@...csson.com, cheshire@...le.com, rs.ietf@....at,
        Jason_Livingood@...cast.com, vidhi_goel@...le.com
Cc: Olga Albisser <olga@...isser.org>,
        Oliver Tilmans <olivier.tilmans@...ia.com>,
        Bob Briscoe <research@...briscoe.net>,
        Henrik Steen <henrist@...rist.net>
Subject: Re: [PATCH v8 RESEND iproute2-next 1/1] tc: add dualpi2 scheduler
 module



On 11-06-2025 20:38, chia-yu.chang@...ia-bell-labs.com wrote:
> From: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
> 
> DUALPI2 AQM is a combination of the DUALQ Coupled-AQM with a PI2
> base-AQM. The PI2 AQM is in turn both an extension and a simplification
> of the PIE AQM. PI2 makes quite some PIE heuristics unnecessary, while
> being able to control scalable congestion controls like TCP-Prague.
> With PI2, both Reno/Cubic can be used in parallel with Prague,
> maintaining window fairness. DUALQ provides latency separation between
> low latency Prague flows and Reno/Cubic flows that need a bigger queue.
> 
> This patch adds support to tc to configure it through its netlink
> interface.
> 
> Signed-off-by: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
> Co-developed-by: Olga Albisser <olga@...isser.org>
> Signed-off-by: Olga Albisser <olga@...isser.org>
> Co-developed-by: Koen De Schepper <koen.de_schepper@...ia-bell-labs.com>
> Signed-off-by: Koen De Schepper <koen.de_schepper@...ia-bell-labs.com>
> Co-developed-by: Oliver Tilmans <olivier.tilmans@...ia.com>
> Signed-off-by: Oliver Tilmans <olivier.tilmans@...ia.com>
> Signed-off-by: Bob Briscoe <research@...briscoe.net>
> Co-developed-by: Henrik Steen <henrist@...rist.net>
> Signed-off-by: Henrik Steen <henrist@...rist.net>
> ---
>   bash-completion/tc             |  11 +-
>   include/uapi/linux/pkt_sched.h |  68 +++++
>   include/utils.h                |   2 +
>   ip/iplink_can.c                |  14 -
>   lib/utils.c                    |  30 ++
>   man/man8/tc-dualpi2.8          | 249 +++++++++++++++
>   tc/Makefile                    |   1 +
>   tc/q_dualpi2.c                 | 534 +++++++++++++++++++++++++++++++++
>   8 files changed, 894 insertions(+), 15 deletions(-)
>   create mode 100644 man/man8/tc-dualpi2.8
>   create mode 100644 tc/q_dualpi2.c
> 
> diff --git a/bash-completion/tc b/bash-completion/tc
> index 61f0039d..c18288a3 100644
> --- a/bash-completion/tc
> +++ b/bash-completion/tc
> @@ -4,7 +4,7 @@
>   
>   QDISC_KIND=' choke codel bfifo pfifo pfifo_head_drop fq fq_codel gred hhf \
>               mqprio multiq netem pfifo_fast pie fq_pie red sfb sfq tbf \
> -            drr hfsc htb prio qfq '
> +            drr hfsc htb prio qfq dualpi2'
>   FILTER_KIND=' basic bpf cgroup flow flower fw route u32 matchall '
>   ACTION_KIND=' gact mirred bpf sample '
>   
> @@ -366,6 +366,15 @@ _tc_qdisc_options()
>               _tc_once_attr 'default r2q direct_qlen debug'
>               return 0
>               ;;
> +        dualpi2)
> +            _tc_once_attr 'limit memlimit coupling_factor step_thresh \
> +                min_qlen_step classic_protection max_rtt typical_rtt \
> +                target tupdate alpha beta'
> +            _tc_one_of_list 'drop_on_overload overflow'
> +            _tc_one_of_list 'drop_enqueue drop_dequeue'
> +            _tc_one_of_list 'l4s_ect any_ect'
> +            _tc_one_of_list 'split_gso no_split_gso'
> +            ;;
>           multiq|pfifo_fast|drr|qfq)
>               return 0
>               ;;
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 25a9a470..e5a58167 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -1210,4 +1210,72 @@ enum {
>   
>   #define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
>   
> +/* DUALPI2 */
> +enum {
> +	TCA_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0,
> +	TCA_DUALPI2_DROP_OVERLOAD_DROP = 1,
> +	__TCA_DUALPI2_DROP_OVERLOAD_MAX,
> +};
> +#define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1)
> +
> +enum {
> +	TCA_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0,
> +	TCA_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1,
> +	__TCA_DUALPI2_DROP_EARLY_MAX,
> +};
> +#define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1)
> +
> +enum {
> +	TCA_DUALPI2_ECN_MASK_L4S_ECT = 1,
> +	TCA_DUALPI2_ECN_MASK_CLA_ECT = 2,
> +	TCA_DUALPI2_ECN_MASK_ANY_ECT = 3,
> +	__TCA_DUALPI2_ECN_MASK_MAX,
> +};
> +#define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1)
> +
> +enum {
> +	TCA_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0,
> +	TCA_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1,
> +	__TCA_DUALPI2_SPLIT_GSO_MAX,
> +};
> +#define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1)
> +
> +enum {
> +	TCA_DUALPI2_UNSPEC,
> +	TCA_DUALPI2_LIMIT,		/* Packets */
> +	TCA_DUALPI2_MEMORY_LIMIT,	/* Bytes */
> +	TCA_DUALPI2_TARGET,		/* us */
> +	TCA_DUALPI2_TUPDATE,		/* us */
> +	TCA_DUALPI2_ALPHA,		/* Hz scaled up by 256 */
> +	TCA_DUALPI2_BETA,		/* HZ scaled up by 256 */

HZ -> Hz for consistency

> +	TCA_DUALPI2_STEP_THRESH,	/* Packets or us */
> +	TCA_DUALPI2_STEP_PACKETS,	/* Whether STEP_THRESH is in packets */
> +	TCA_DUALPI2_MIN_QLEN_STEP,	/* Minimum qlen to apply STEP_THRESH */
> +	TCA_DUALPI2_COUPLING,		/* Coupling factor between queues */
> +	TCA_DUALPI2_DROP_OVERLOAD,	/* Whether to drop on overload */
> +	TCA_DUALPI2_DROP_EARLY,		/* Whether to drop on enqueue */
> +	TCA_DUALPI2_C_PROTECTION,	/* Percentage */
> +	TCA_DUALPI2_ECN_MASK,		/* L4S queue classification mask */
> +	TCA_DUALPI2_SPLIT_GSO,		/* Split GSO packets at enqueue */
> +	TCA_DUALPI2_PAD,
> +	__TCA_DUALPI2_MAX
> +};
> +
> +#define TCA_DUALPI2_MAX   (__TCA_DUALPI2_MAX - 1)
> +
> +struct tc_dualpi2_xstats {
> +	__u32 prob;		/* current probability */
> +	__u32 delay_c;		/* current delay in C queue */
> +	__u32 delay_l;		/* current delay in L queue */
> +	__u32 packets_in_c;	/* number of packets enqueued in C queue */
> +	__u32 packets_in_l;	/* number of packets enqueued in L queue */
> +	__u32 maxq;		/* maximum queue size */
> +	__u32 ecn_mark;		/* packets marked with ecn*/

Missing space before */

> +	__u32 step_marks;	/* ECN marks due to the step AQM */
> +	__s32 credit;		/* current c_protection credit */
> +	__u32 memory_used;	/* Memory used of both queues */

used of both -> used by both

> +	__u32 max_memory_used;	/* Maximum used memory */
> +	__u32 memory_limit;	/* Memory limit of both queues */
> +};
> +
[clip]
> +.BI coupling_factor " NUMBER"
> +Set the coupling rate factor between Classic and L4S. Defaults to
> +.I 2
> +.PD
> +.TP
> +.BI l4s_ect | any_ect
> +Configures the ECT classifier. Packets whose ECT codepoint matches this are sent to the L-queue, where they receive a scalable marking. Defaults to
> +.I l4s_ect
> +, i.e., the L4S identifier ECT(1). Setting this to
> +.I any_ect
> +causes all packets whose ECN field is not zero to be sent to the L-queue. This enables it to be backward compatible with, e.g., DCTCP. Note DCTCP should only be used for intra-DC traffic with very low RTTs and AQM delay targets bigger than those RTTs, separated from Internet traffic (also if Prague compliant CC), as it does not support all Prague requirements that make sure that a congestion control can work well with the range of RTTs on the Internet.
> +.PD
> +.TP
> +.BI step_thresh " TIME | PACKETS"
> +Set the step threshold for the L-queue. This will cause packets with a sojourn time exceeding the threshold to always be marked. This value can either be specified using time units (i.e., us, ms, s), or in packets (p, pkt, packet(s)). A value without units is assumed to be in time (us). If defining the step in packets, be sure to disable GRO on the ingress interfaces. Defaults to
> +.I 1ms
> +.PD
> +.TP
> +.BI min_qlen_step " PACKETS"
> +Incoming packets enqueued to the L-queue may apply the step threshold when the queue length of the L-queue exceeds this value. Default to
> +.I 0
> +packets. This means that every enqueued packets to the L-queue with a sojourn time exceed the step threshold will be marked.

exceed -> exceeds

> +.PD
> +.TP
> +.B drop_on_overload | overflow
> +Control the overload strategy.
> +.I drop_on_overload
> +preserves the delay in the L-queue by dropping in both queues on overload.
> +.I overflow
> +sacrifices delay to avoid losses, eventually resulting in a taildrop behavior once the
> +.I limit
> +is reached. Defaults to
> +.I drop_on_overload
> +.PD
> +.TP
> +.B drop_enqueue | drop_dequeue
> +Decide when packets are PI-based dropped or marked. The
> +.I step_thresh
> +based L4S marking is always at dequeue. Defaults to
> +.I drop_dequeue
> +.PD
> +.TP
> +.BI classic_protection " PERCENTAGE
> +Protects the C-queue from unresponsive traffic in the L-queue. This bounds the maximal scheduling delay in the C-queue to be
> +.I (100 - PERCENTAGE)
> +times greater than the one in the L-queue. Defaults to
> +.I 10
> +.TP
> +.BI typical_rtt " TIME"
> +.PD 0
> +.TP
> +.PD
> +.BI max_rtt " TIME"
> +Specify the maximum round trip time (RTT) and/or the typical RTT of the traffic that will be controlled by DUALPI2. These values are specified using time units (i.e., us, ms, s). A value without units is assumed to be in us. If either
> +.I max_rtt
> +or
[clip]
> +		} else if (strcmp(*argv, "max_rtt") == 0) {
> +			NEXT_ARG();
> +			if (get_time(&rtt_max, *argv)) {
> +				fprintf(stderr, "Illegal \"rtt_max\"\n");
> +				return -1;
> +			}
> +		} else if (strcmp(*argv, "typical_rtt") == 0) {
> +			NEXT_ARG();
> +			if (get_time(&rtt_typ, *argv)) {
> +				fprintf(stderr, "Illegal \"rtt_typical\"\n");

Use typical_rtt, not rtt_typical, so the error message matches the option name.

> +				return -1;
> +			}
> +		} else if (strcmp(*argv, "help") == 0) {
> +			explain();
> +			return -1;
> +		} else {
> +			fprintf(stderr, "What is \"%s\"?\n", *argv);
> +			explain();
> +			return -1;
> +		}
> +		--argc;
> +		++argv;
> +	}
> +

Reviewed-by: Alok Tiwari <alok.a.tiwari@...cle.com>

Thanks,
Alok


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ