Message-Id: <20241015102940.26157-45-chia-yu.chang@nokia-bell-labs.com>
Date: Tue, 15 Oct 2024 12:29:40 +0200
From: chia-yu.chang@...ia-bell-labs.com
To: netdev@...r.kernel.org, ij@...nel.org, ncardwell@...gle.com,
        koen.de_schepper@...ia-bell-labs.com, g.white@...leLabs.com,
        ingemar.s.johansson@...csson.com, mirja.kuehlewind@...csson.com,
        cheshire@...le.com, rs.ietf@....at, Jason_Livingood@...cast.com,
        vidhi_goel@...le.com
Cc: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>,
        Olivier Tilmans <olivier.tilmans@...ia.com>,
        Bob Briscoe <research@...briscoe.net>
Subject: [PATCH net-next 44/44] tcp: Add the TCP Prague congestion control module

From: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>

TCP Prague evolved from DCTCP and was adapted for use over the public
internet, removing all reasons why the network would need to build a
queue in a bottleneck link to control the rate of this congestion
control. As such, it needs to implement the performance and safety
requirements listed in Appendix A of IETF RFC 9331:
https://datatracker.ietf.org/doc/html/rfc9331

This version enhances DCTCP by:
* RTT independence
* Fractional window and increased alpha resolution
* Updated integer arithmetic and fixed-point scaling
* Only Additive Increase for ACKs of non-marked packets
* Pacing/TSO sizing
* Pacing below minimum congestion window of 2
* +/- 3% pacing variations per RTT
* Enforce the use of ECT_1, Accurate ECN and ECN++

All of the above improvements make Prague very rate fair and RTT
independent below 25ms, assure full or close-to-full link utilization
on a stable network link, and allow the network to control the rate
down to 100kbps without the need to drop packets or build a queue.
For RTTs from 0us to 25ms and link rates higher than 100kbps, the
resulting rate equation is very close to:
      r [Mbps] = 1/p - 1
Above 25ms, a correction factor of 25ms/RTT needs to be applied
(reducing the rate in proportion to the higher RTT).
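
For example, p = 0.5 yields r = 1Mbps and p = 0.01 yields r = 99Mbps,
while squeezing a flow down to 100kbps requires about 91% marks. On a
100ms RTT path, the 25ms/RTT correction divides these rates by 4.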

Co-developed-by: Olivier Tilmans <olivier.tilmans@...ia.com>
Signed-off-by: Olivier Tilmans <olivier.tilmans@...ia.com>
Co-developed-by: Koen De Schepper <koen.de_schepper@...ia-bell-labs.com>
Signed-off-by: Koen De Schepper <koen.de_schepper@...ia-bell-labs.com>
Signed-off-by: Bob Briscoe <research@...briscoe.net>
Signed-off-by: Ilpo Järvinen <ij@...nel.org>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
---
 include/uapi/linux/inet_diag.h |  13 +
 net/ipv4/Kconfig               |  37 ++
 net/ipv4/Makefile              |   1 +
 net/ipv4/tcp_prague.c          | 866 +++++++++++++++++++++++++++++++++
 4 files changed, 917 insertions(+)
 create mode 100644 net/ipv4/tcp_prague.c

diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 86bb2e8b17c9..69d144ba3627 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -161,6 +161,7 @@ enum {
 	INET_DIAG_SK_BPF_STORAGES,
 	INET_DIAG_CGROUP_ID,
 	INET_DIAG_SOCKOPT,
+	INET_DIAG_PRAGUEINFO,
 	__INET_DIAG_MAX,
 };
 
@@ -231,9 +232,21 @@ struct tcp_bbr_info {
 	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
 };
 
+/* INET_DIAG_PRAGUEINFO */
+
+struct tcp_prague_info {
+	__u64	prague_alpha;
+	__u64	prague_frac_cwnd;
+	__u64	prague_rate_bytes;
+	__u32	prague_max_burst;
+	__u32	prague_round;
+	__u32	prague_rtt_target;
+};
+
 union tcp_cc_info {
 	struct tcpvegas_info	vegas;
 	struct tcp_dctcp_info	dctcp;
+	struct tcp_prague_info	prague;
 	struct tcp_bbr_info	bbr;
 };
 #endif /* _UAPI_INET_DIAG_H_ */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d2c97f8e9ef..f55438c70579 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -679,6 +679,39 @@ config TCP_CONG_BBR
 	  AQM schemes that do not provide a delay signal. It requires the fq
 	  ("Fair Queue") pacing packet scheduler.
 
+config TCP_CONG_PRAGUE
+	tristate "TCP Prague"
+	default n
+	help
+	  TCP Prague is an enhancement of DCTCP that makes DCTCP-style
+	  congestion control deployable on the general Internet.
+	  TCP Prague evolved from DCTCP and was adapted for use over the
+	  public internet, removing all reasons why the network would need
+	  to build a queue in a bottleneck link to control the rate of this
+	  congestion control. As such, it needs to implement the performance
+	  and safety requirements listed in Appendix A of IETF RFC 9331:
+	  https://datatracker.ietf.org/doc/html/rfc9331
+
+	  This version enhances DCTCP by:
+	  * RTT independence
+	  * Fractional window and increased alpha resolution
+	  * Updated integer arithmetic and fixed-point scaling
+	  * Only Additive Increase for ACKs of non-marked packets
+	  * Pacing/TSO sizing
+	  * Pacing below minimum congestion window of 2
+	  * +/- 3% pacing variations per RTT
+	  * Enforce the use of ECT_1, Accurate ECN and ECN++
+
+	  All of the above improvements make Prague very rate fair and RTT
+	  independent below 25ms, assure full or close-to-full link
+	  utilization on a stable network link, and allow the network to
+	  control the rate down to 100kbps without the need to drop packets
+	  or build a queue. For RTTs from 0us to 25ms and link rates higher
+	  than 100kbps, the resulting rate equation is very close to:
+	  r [Mbps] = 1/p - 1
+	  Above 25ms, a correction factor of 25ms/RTT needs to be applied
+	  (reducing the rate in proportion to the higher RTT).
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -716,6 +749,9 @@ choice
 	config DEFAULT_BBR
 		bool "BBR" if TCP_CONG_BBR=y
 
+	config DEFAULT_PRAGUE
+		bool "Prague" if TCP_CONG_PRAGUE=y
+
 	config DEFAULT_RENO
 		bool "Reno"
 endchoice
@@ -740,6 +776,7 @@ config DEFAULT_TCP_CONG
 	default "dctcp" if DEFAULT_DCTCP
 	default "cdg" if DEFAULT_CDG
 	default "bbr" if DEFAULT_BBR
+	default "prague" if DEFAULT_PRAGUE
 	default "cubic"
 
 config TCP_SIGPOOL
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ec36d2ec059e..47b1304ffa09 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_TCP_CONG_PRAGUE) += tcp_prague.o
 obj-$(CONFIG_TCP_SIGPOOL) += tcp_sigpool.o
 obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
 obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c
new file mode 100644
index 000000000000..6db9386b13af
--- /dev/null
+++ b/net/ipv4/tcp_prague.c
@@ -0,0 +1,866 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Nokia
+ *
+ * Author: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
+ * Author: Olivier Tilmans <olivier.tilmans@...ia.com>
+ * Author: Koen De Schepper <koen.de_schepper@...ia-bell-labs.com>
+ * Author: Bob Briscoe <research@...briscoe.net>
+ *
+ * TCP Prague congestion control.
+ *
+ * This congestion-control, part of the L4S architecture, achieves low loss,
+ * low latency and scalable throughput when used in combination with AQMs such
+ * as DualPI2, CurvyRED, or even fq_codel with a low ce_threshold for the
+ * L4S flows.
+ *
+ * TCP Prague evolved from DCTCP and was adapted for use over the public
+ * internet, removing all reasons why the network would need to build a
+ * queue in a bottleneck link to control the rate of this congestion
+ * control. As such, it needs to implement the performance and safety
+ * requirements listed in Appendix A of IETF RFC 9331:
+ * https://datatracker.ietf.org/doc/html/rfc9331
+ *
+ * Notable changes from DCTCP:
+ *
+ * 1/ RTT independence:
+ * Below a minimum target RTT, Prague will operate as if it was experiencing
+ * that target RTT (default=25ms). This enables short RTT flows to co-exist
+ * with long RTT ones (e.g., Edge-DC flows competing vs intercontinental
+ * internet traffic) without causing starvation or saturating the ECN signal,
+ * without the need for Diffserv or bandwidth reservation. It also makes the
+ * lower RTT flows more resilient to the inertia of higher RTT flows.
+ *
+ * This is achieved by scaling cwnd growth during Additive Increase, thus
+ * leaving room for higher RTT flows to grab a larger bandwidth share while at
+ * the same time relieving the pressure on bottleneck link hence lowering the
+ * overall marking probability.
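+ * For instance, a flow with a 5ms RTT and the default 25ms target grows its
+ * additive increase per ACK at roughly (5/25)^2 = 1/25 of the unscaled
+ * amount (see prague_rate_scaled_ai_ack_increase() below).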
+ *
+ * Given that this slows short RTT flows, this behavior only makes sense for
+ * long-running flows that actually need to share the link--as opposed to,
+ * e.g., RPC traffic. To that end, flows become RTT independent
+ * DEFAULT_RTT_TRANSITION rounds (default = 4) after leaving slow start.
+ *
+ * 2/ Fractional window and increased alpha resolution:
+ * To support slower and more gradual increase of the window, a fractional
+ * window is kept and manipulated from which the socket congestion window is
+ * derived (rounded up to the next integer and capped to at least 2).
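+ * Concretely, the fractional window is kept in units of 2^-20 MSS (see
+ * CWND_UNIT below); e.g., a fractional window of 2.5 MSS yields a socket
+ * cwnd of 3 (rounded up, floored at 2).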
+ *
+ * The resolution of alpha has been increased to ensure that a low amount of
+ * marks over high-BDP paths can be accurately taken into account in the
+ * computation.
+ *
+ * Orthogonally, the value of alpha that is kept in the connection state is
+ * stored upscaled, in order to preserve its remainder over the course of its
+ * updates (similarly to how tp->srtt_us is maintained, as opposed to
+ * dctcp->alpha).
+ *
+ * 3/ Updated integer arithmetic and fixed-point scaling
+ * In order to operate with a permanent, (very) low marking probability and
+ * a much larger RTT range, the arithmetic has been updated to track decimal
+ * precision with unbiased rounding, alongside avoiding capping the integer
+ * parts. This improves the precision, avoiding avalanche effects as
+ * remainders are carried over to the next operations, and improves
+ * responsiveness, as the AQM at the bottleneck can effectively control the
+ * operation of the flow without a drastic marking probability increase.
+ *
+ * 4/ Only Additive Increase for ACKs of non-marked packets
+ * DCTCP disabled the increase for a full RTT when marks were received.
+ * Given that an L4S AQM may apply CE marks on every ACK (e.g., from the PI2
+ * part of dualpi2), instead of the occasional full RTT of marks that a step
+ * AQM would cause, Prague will increase every RTT, but proportionally to
+ * the non-marked packets. The total increase over an RTT is thus
+ * proportional to (1-p)/p. The cwnd is updated for every ACK that reports
+ * non-marked data on the receiver, regardless of the congestion status of
+ * the connection (i.e., it is expected to spend most of its time in
+ * TCP_CA_CWR when used over dualpi2). Note that this is only valid for CE
+ * marks. For loss (so being in TCP_CA_LOSS state) the increase is still
+ * disabled for one RTT.
+ *
+ * See https://arxiv.org/abs/1904.07605 for more details around saturation.
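+ * As a quick sanity check of the (1-p)/p proportionality: at p = 0.5 the
+ * per-RTT increase is proportional to (1-0.5)/0.5 = 1, while at p = 0.01
+ * it is 99x larger, consistent with the steady-state rate equation below.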
+ *
+ * 5/ Pacing/TSO sizing
+ * Prague aims to keep queuing delay as low as possible. To that end, it is in
+ * its best interest to pace outgoing segments (i.e., to smooth its traffic),
+ * as well as impose a maximal GSO burst size to avoid instantaneous queue
+ * buildups in the bottleneck link. The GSO burst size is limited so that it
+ * creates at most ~250us of latency, assuming the current transmission rate
+ * is the bottleneck rate. For this functionality to be active, the "fq"
+ * qdisc needs to be active on the network interfaces that carry Prague
+ * flows.
+ * Note this is the "fq" qdisc, not the default "fq_codel" qdisc.
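+ * As a rough worked example of the default sizing: with prague_burst_shift
+ * = 12, a burst covers 1/4096s ~= 244us of data, so at a 100Mbps pacing
+ * rate (12.5MB/s) with a 1500B MTU this allows 12500000 / 1500 / 4096 ~= 2
+ * segments per GSO burst.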
+ *
+ * 6/ Pacing below minimum congestion window of 2
+ * Prague will further reduce the pacing rate based on the fractional window
+ * below 2 MTUs. This is needed for very low RTT networks to be able to
+ * control flows to low rates without the need for the network to buffer the
+ * 2 packets in flight per active flow. The rate can go down to 100kbps on
+ * any RTT. Below 1Mbps, the packet size will be reduced to make sure we
+ * can still send 2 packets per 25ms, down to 150 bytes at 100kbps.
+ * The actual blocking congestion window will still be 2, but as long as
+ * ACKs come in, the pacing rate will throttle the sending. The fractional
+ * window is also always rounded up to the next integer when assigned to
+ * the blocking congestion window, which makes the pacing rate the limiting
+ * mechanism most of the time. As the fractional window is updated on every
+ * ACK, the pacing rate increases smoothly, guaranteeing a non-stepwise
+ * rate increase when the congestion window has a low integer value.
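+ * The 150-byte floor follows from that budget: at 100kbps (12500B/s), one
+ * 25ms interval carries 12500 * 0.025 ~= 312 bytes, i.e., 2 packets of at
+ * most ~156 bytes each, hence a minimum MSS of 150.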
+ *
+ * 7/ +/- 3% pacing variations per RTT
+ * During the first half of every RTT (or of 25ms if the RTT is smaller),
+ * the pacing rate is increased by 3%; during the second half it is
+ * decreased by 3%. This triggers a stable amount of marks every RTT on a
+ * STEP marking AQM when the link is very stable. It avoids the undesired
+ * on/off marking scheme of DCTCP (one RTT of 100% marks and several RTTs
+ * of no marks), which leads to larger rate variations and to rate and RTT
+ * unfairness due to DCTCP's different rate to marking probability
+ * proportionality:
+ *     r ~ 1/p^2
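+ * The 3% figure comes from the default rate offset of 4 in 1/128 units:
+ * 4/128 ~= 3.1% of the current pacing rate, added during the first half of
+ * the virtual RTT and subtracted during the second half (see
+ * prague_update_pacing_rate() below).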
+ *
+ * 8/ Enforce the use of ECT_1, Accurate ECN and ECN++
+ * As per RFC 9331, Prague needs to use ECT_1, Accurate ECN and ECN++
+ * (also ECT_1 on non-data packets like SYN, pure ACKs, ...). Independent
+ * of the other sysctl configs of the kernel, setting the Prague CC on a
+ * socket will overrule the system-wide configuration. This
+ * also means that using Prague selectively on a system does not require
+ * any system-wide changes (except using the FQ qdisc on the NICs).
+ *
+ * All of the above improvements make Prague very rate fair and RTT
+ * independent below 25ms, and assure full or close-to-full link utilization
+ * on a stable network link. It allows the network to control the rate down
+ * to 100kbps without the need to drop packets or build a queue. For RTTs
+ * from 0us to 25ms and link rates higher than 100kbps, the resulting
+ * rate equation is very close to:
+ *     r [Mbps] = 1/p - 1
+ * or, the other way around, the marking probability p that a flow needs in
+ * order to get squeezed down to r Mbps:
+ *     p = 1 / (r + 1)
+ * So p = 0.5 results in a rate of 1Mbps; equivalently, 1Mbps needs 50%
+ * marks, 99Mbps needs 1% marks, 100kbps needs 91% marks, etc.
+ * For RTTs above 25ms, a correction factor should be taken into account:
+ *     r [Mbps] = (1/p - 1) * 25ms / RTT
+ * with RTT and 25ms expressed in the same unit.
+ */
+
+#define pr_fmt(fmt) "TCP-Prague " fmt
+
+#include <linux/inet_diag.h>
+#include <linux/inet.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <net/tcp.h>
+
+#define MIN_CWND_RTT		2U
+#define MIN_CWND_VIRT		2U
+#define MIN_MSS			150U
+#define MINIMUM_RATE		12500ULL	/* Minimum rate: 100kbps */
+#define PRAGUE_ALPHA_BITS	24U
+#define PRAGUE_MAX_ALPHA	BIT_ULL(PRAGUE_ALPHA_BITS)
+#define CWND_UNIT		20U		/* Fixed-point shift of frac_cwnd */
+#define ONE_CWND		BIT_ULL(CWND_UNIT)
+#define PRAGUE_SHIFT_G		4		/* EWMA gain g = 1/2^4 */
+#define DEFAULT_RTT_TRANSITION	4
+#define MAX_SCALED_RTT		(100 * USEC_PER_MSEC)
+#define MTU_SYS			1500UL
+#define RATE_OFFSET		4
+#define OFFSET_UNIT		7
+#define HSRTT_SHIFT		7
+#define RTT2SEC_SHIFT		23		/* srtt units (us << 3) to sec (approx.) */
+
+static u32 prague_burst_shift __read_mostly = 12; /* 1/2^12 sec ~= .25ms */
+MODULE_PARM_DESC(prague_burst_shift,
+		 "maximal GSO burst duration as a base-2 negative exponent");
+module_param(prague_burst_shift, uint, 0644);
+
+static u32 prague_max_tso_segs __read_mostly; /* 0 (default): no extra cap */
+MODULE_PARM_DESC(prague_max_tso_segs, "Maximum TSO/GSO segments");
+module_param(prague_max_tso_segs, uint, 0644);
+
+static u32 prague_rtt_target __read_mostly = 25 * USEC_PER_MSEC;
+MODULE_PARM_DESC(prague_rtt_target, "RTT scaling target");
+module_param(prague_rtt_target, uint, 0644);
+
+static u32 prague_rtt_transition __read_mostly = DEFAULT_RTT_TRANSITION;
+MODULE_PARM_DESC(prague_rtt_transition,
+		 "Number of post-SS rounds before transitioning to RTT independence.");
+module_param(prague_rtt_transition, uint, 0644);
+
+static u32 prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */
+MODULE_PARM_DESC(prague_rate_offset,
+		 "Pacing rate offset in 1/128 units at each half of RTT_virt");
+module_param(prague_rate_offset, uint, 0644);
+
+static u32 prague_alpha_mode __read_mostly; /* Defaults to 0 */
+MODULE_PARM_DESC(prague_alpha_mode,
+		 "TCP Prague SS mode (0: Half cwnd at 1st mark; 1: Init alpha 1)");
+module_param(prague_alpha_mode, uint, 0644);
+
+static u32 prague_cwnd_mode __read_mostly; /* Defaults to 0 */
+MODULE_PARM_DESC(prague_cwnd_mode,
+		 "TCP Prague mode (0: FracWin-base; 1: Rate-base; 2: Switch)");
+module_param(prague_cwnd_mode, uint, 0644);
+
+static u32 prague_cwnd_transit __read_mostly = 4;
+MODULE_PARM_DESC(prague_cwnd_transit,
+		 "CWND mode switching point in terms of # of MTU_SYS");
+module_param(prague_cwnd_transit, uint, 0644);
+
+struct prague {
+	u64 cwr_stamp;
+	u64 alpha_stamp;	/* EWMA update timestamp */
+	u64 upscaled_alpha;	/* Congestion-estimate EWMA */
+	u64 ai_ack_increase;	/* AI increase per non-CE ACKed MSS */
+	u32 mtu_cache;
+	u64 hsrtt_us;
+	u64 frac_cwnd;		/* internal fractional cwnd */
+	u64 rate_bytes;		/* internal pacing rate in bytes */
+	u64 loss_rate_bytes;
+	u32 loss_cwnd;
+	u32 max_tso_burst;
+	u32 old_delivered;	/* tp->delivered at round start */
+	u32 old_delivered_ce;	/* tp->delivered_ce at round start */
+	u32 next_seq;		/* tp->snd_nxt at round start */
+	u32 round;		/* Round count since last slow-start exit */
+	u8  saw_ce:1,		/* Is there an AQM on the path? */
+	    cwnd_mode:1,	/* CWND operating mode */
+	    in_loss:1;		/* In cwnd reduction caused by loss */
+};
+
+/* Fallback struct ops if we fail to negotiate AccECN */
+static struct tcp_congestion_ops prague_reno;
+
+static void __prague_connection_id(struct sock *sk, char *str, size_t len)
+{
+	u16 dport = ntohs(inet_sk(sk)->inet_dport);
+	u16 sport = ntohs(inet_sk(sk)->inet_sport);
+
+	if (sk->sk_family == AF_INET)
+		snprintf(str, len, "%pI4:%u-%pI4:%u", &sk->sk_rcv_saddr, sport,
+			 &sk->sk_daddr, dport);
+	else if (sk->sk_family == AF_INET6)
+		snprintf(str, len, "[%pI6c]:%u-[%pI6c]:%u",
+			 &sk->sk_v6_rcv_saddr, sport, &sk->sk_v6_daddr, dport);
+}
+
+#define LOG(sk, fmt, ...) do {						\
+	char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0};		\
+	__prague_connection_id(sk, __tmp, sizeof(__tmp));		\
+	/* pr_fmt expects the connection ID */				\
+	pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__);		\
+} while (0)
+
+static struct prague *prague_ca(struct sock *sk)
+{
+	return (struct prague *)inet_csk_ca(sk);
+}
+
+static bool prague_is_rtt_indep(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return !tcp_in_slow_start(tp) &&
+		ca->round >= prague_rtt_transition;
+}
+
+static bool prague_e2e_rtt_elapsed(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return !before(tp->snd_una, ca->next_seq);
+}
+
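+/* Target RTT, expressed in tp->srtt_us units (i.e., usec << 3) */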
+static u32 prague_target_rtt(struct sock *sk)
+{
+	return prague_rtt_target << 3;
+}
+
+static u32 prague_elapsed_since_alpha_update(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return tcp_stamp_us_delta(tp->tcp_mstamp, ca->alpha_stamp);
+}
+
+static bool prague_target_rtt_elapsed(struct sock *sk)
+{
+	return (prague_target_rtt(sk) >> 3) <=
+		prague_elapsed_since_alpha_update(sk);
+}
+
+/* RTT independence on a step AQM requires the competing flows to converge to the
+ * same alpha, i.e., the EWMA update frequency might no longer be "once every RTT"
+ */
+static bool prague_should_update_ewma(struct sock *sk)
+{
+	return prague_e2e_rtt_elapsed(sk) &&
+		(!prague_is_rtt_indep(sk) || prague_target_rtt_elapsed(sk));
+}
+
+static u64 prague_unscaled_ai_ack_increase(struct sock *sk)
+{
+	return 1 << CWND_UNIT;
+}
+
+static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt)
+{
+	u64 increase;
+	u64 divisor;
+	u64 target;
+
+	target = prague_target_rtt(sk);
+	if (rtt >= target)
+		return prague_unscaled_ai_ack_increase(sk);
+	/* Scale increase to:
+	 * - Grow by 1MSS/target RTT
+	 * - Take into account the rate ratio of doing cwnd += 1MSS
+	 *
+	 * Overflows if e2e RTT is > 100ms, hence the cap
+	 */
+	increase = (u64)rtt << CWND_UNIT;
+	increase *= rtt;
+	divisor = target * target;
+	increase = DIV64_U64_ROUND_CLOSEST(increase, divisor);
+	return increase;
+}
+
+static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return min_t(u32, max_t(u32, MIN_CWND_RTT,
+		     (ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), tp->snd_cwnd_clamp);
+}
+
+static u64 prague_virtual_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return max_t(u32, prague_target_rtt(sk), tp->srtt_us);
+}
+
+static u64 prague_pacing_rate_to_max_mtu(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 cwnd_bytes;
+
+	if (prague_is_rtt_indep(sk) && ca->cwnd_mode == 1) {
+		cwnd_bytes = mul_u64_u64_shr(ca->rate_bytes, prague_virtual_rtt(sk),
+					     RTT2SEC_SHIFT);
+	} else {
+		u64 target = prague_target_rtt(sk);
+		u64 scaled_cwnd = ca->frac_cwnd;
+		u64 rtt = tp->srtt_us;
+
+		if (rtt < target)
+			scaled_cwnd = div64_u64(scaled_cwnd * target, rtt);
+		cwnd_bytes = mul_u64_u64_shr(scaled_cwnd, tcp_mss_to_mtu(sk, tp->mss_cache),
+					     CWND_UNIT);
+	}
+	return DIV_U64_ROUND_UP(cwnd_bytes, MIN_CWND_VIRT);
+}
+
+static bool prague_half_virtual_rtt_elapsed(struct sock *sk)
+{
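+	/* >> 3 unscales the srtt units; the extra >> 1 takes half of it */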
+	return (prague_virtual_rtt(sk) >> (3 + 1)) <=
+		prague_elapsed_since_alpha_update(sk);
+}
+
+static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 rtt;
+	u64 mtu;
+
+	mtu = tcp_mss_to_mtu(sk, tp->mss_cache);
+	rtt = (ca->hsrtt_us >> HSRTT_SHIFT) ?: tp->srtt_us;
+
+	return DIV_U64_ROUND_UP(mul_u64_u64_shr(ca->rate_bytes, rtt,
+						RTT2SEC_SHIFT - CWND_UNIT), mtu);
+}
+
+static u64 prague_frac_cwnd_to_pacing_rate(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 rate;
+
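+	/* (USEC_PER_SEC << 3) cancels the << 3 scaling of tp->srtt_us, so
+	 * this computes mtu * cwnd / rtt in bytes per second.
+	 */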
+	rate = (u64)((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache);
+	if (tp->srtt_us)
+		rate = div64_u64(rate, tp->srtt_us);
+	return max_t(u64, mul_u64_u64_shr(rate, ca->frac_cwnd, CWND_UNIT),
+		     MINIMUM_RATE);
+}
+
+static u32 prague_valid_mtu(struct sock *sk, u32 mtu)
+{
+	struct prague *ca = prague_ca(sk);
+
+	return max_t(u32, min_t(u32, ca->mtu_cache, mtu), tcp_mss_to_mtu(sk, MIN_MSS));
+}
+
+/* RTT independence will scale the classical 1/W per ACK increase. */
+static void prague_ai_ack_increase(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 increase;
+	u32 rtt;
+
+	rtt = tp->srtt_us;
+	if (ca->round < prague_rtt_transition ||
+	    !rtt || rtt > (MAX_SCALED_RTT << 3)) {
+		increase = prague_unscaled_ai_ack_increase(sk);
+		goto exit;
+	}
+
+	increase = prague_rate_scaled_ai_ack_increase(sk, rtt);
+
+exit:
+	WRITE_ONCE(ca->ai_ack_increase, increase);
+}
+
+static void prague_update_pacing_rate(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 rate_offset = RATE_OFFSET;
+	u64 rate, burst, offset;
+
+	if (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) - 1))
+		rate_offset = prague_rate_offset;
+
+	if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) {
+		rate = ca->rate_bytes << 1;
+	} else {
+		offset = mul_u64_u64_shr(rate_offset, ca->rate_bytes, OFFSET_UNIT);
+		if (prague_half_virtual_rtt_elapsed(sk))
+			rate = ca->rate_bytes - offset;
+		else
+			rate = ca->rate_bytes + offset;
+	}
+
+	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+	burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache));
+
+	WRITE_ONCE(prague_ca(sk)->max_tso_burst,
+		   max_t(u32, 1, burst >> prague_burst_shift));
+	WRITE_ONCE(sk->sk_pacing_rate, rate);
+}
+
+static void prague_new_round(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->next_seq = tp->snd_nxt;
+	ca->old_delivered_ce = tp->delivered_ce;
+	ca->old_delivered = tp->delivered;
+	if (!tcp_in_slow_start(tp)) {
+		++ca->round;
+		if (!ca->round)
+			ca->round = prague_rtt_transition;
+	}
+	prague_ai_ack_increase(sk);
+}
+
+static void prague_cwnd_changed(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->snd_cwnd_stamp = tcp_jiffies32;
+	prague_ai_ack_increase(sk);
+}
+
+static void prague_update_alpha(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 ecn_segs, alpha, mtu, mtu_used;
+
+	/* Do not update alpha before we have proof that there's an AQM on
+	 * the path.
+	 */
+	if (unlikely(!ca->saw_ce))
+		goto skip;
+
+	alpha = ca->upscaled_alpha;
+	ecn_segs = tp->delivered_ce - ca->old_delivered_ce;
+	/* We diverge from the original EWMA, i.e.,
+	 * alpha = (1 - g) * alpha + g * F
+	 * by working with (and storing)
+	 * upscaled_alpha = alpha * (1/g) [recall that 0<g<1]
+	 *
+	 * This enables to carry alpha's residual value to the next EWMA round.
+	 *
+	 * We first compute F, the fraction of ecn segments.
+	 */
+	if (ecn_segs) {
+		u32 acked_segs = tp->delivered - ca->old_delivered;
+
+		ecn_segs <<= PRAGUE_ALPHA_BITS;
+		ecn_segs = div_u64(ecn_segs, max(1U, acked_segs));
+	}
+	alpha = alpha - (alpha >> PRAGUE_SHIFT_G) + ecn_segs;
+	ca->alpha_stamp = tp->tcp_mstamp;
+
+	WRITE_ONCE(ca->upscaled_alpha,
+		   min(PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G, alpha));
+
+	if (prague_is_rtt_indep(sk) && !ca->in_loss) {
+		mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache);
+		mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk));
+		if (mtu_used != mtu) {
+			ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu);
+			tp->mss_cache_set_by_ca = true;
+			tcp_sync_mss(sk, mtu);
+
+			u64 new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk);
+
+			if (tcp_snd_cwnd(tp) != new_cwnd) {
+				tcp_snd_cwnd_set(tp, new_cwnd);
+				tp->snd_ssthresh = div_u64(tp->snd_ssthresh * mtu_used, mtu);
+				prague_cwnd_changed(sk);
+			}
+		}
+	}
+skip:
+	ca->hsrtt_us = ca->hsrtt_us + tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT);
+	prague_new_round(sk);
+}
+
+static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 increase;
+	u64 new_cwnd;
+	u64 mtu_used;
+	u64 divisor;
+	s64 acked;
+
+	acked = rs->acked_sacked;
+	if (rs->ece_delta) {
+		if (rs->ece_delta > acked)
+			LOG(sk, "Received %u marks for %lld acks at %u",
+			    rs->ece_delta, acked, tp->snd_una);
+		if (unlikely(!ca->saw_ce) && !prague_alpha_mode)
+			ca->frac_cwnd = (ca->frac_cwnd + 1U) >> 1;
+		ca->saw_ce = 1;
+		acked -= rs->ece_delta;
+	}
+
+	if (acked <= 0 || ca->in_loss || tp->app_limited)
+		goto adjust;
+
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		ca->frac_cwnd = (u64)tcp_snd_cwnd(tp) << CWND_UNIT;
+		if (!acked) {
+			prague_cwnd_changed(sk);
+			return;
+		}
+	}
+
+	if (prague_is_rtt_indep(sk) && ca->cwnd_mode == 1) {
+		mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache);
+		increase = div_u64(((u64)(acked * MTU_SYS)) << RTT2SEC_SHIFT,
+				   prague_virtual_rtt(sk));
+		divisor = mtu_used << RTT2SEC_SHIFT;
+		new_cwnd = DIV64_U64_ROUND_UP(ca->rate_bytes * prague_virtual_rtt(sk), divisor);
+		if (likely(new_cwnd))
+			ca->rate_bytes += DIV_U64_ROUND_CLOSEST(increase, new_cwnd);
+		ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked,
+				      prague_pacing_rate_to_frac_cwnd(sk));
+	} else {
+		increase = acked * ca->ai_ack_increase;
+		new_cwnd = ca->frac_cwnd;
+		if (likely(new_cwnd))
+			increase = DIV64_U64_ROUND_CLOSEST((increase << CWND_UNIT), new_cwnd);
+		increase = div_u64(increase * MTU_SYS, tcp_mss_to_mtu(sk, tp->mss_cache));
+		ca->frac_cwnd += max_t(u64, acked, increase);
+
+		u64 rate = prague_frac_cwnd_to_pacing_rate(sk);
+
+		ca->rate_bytes = max_t(u64, ca->rate_bytes + acked, rate);
+	}
+
+adjust:
+	new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk);
+	if (tcp_snd_cwnd(tp) > new_cwnd) {
+		/* Step-wise cwnd decrement */
+		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
+		tp->snd_ssthresh = tcp_snd_cwnd(tp);
+		prague_cwnd_changed(sk);
+	} else if (tcp_snd_cwnd(tp) < new_cwnd) {
+		/* Step-wise cwnd increment */
+		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+		prague_cwnd_changed(sk);
+	}
+}
+
+static void prague_ca_open(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+
+	ca->in_loss = 0;
+}
+
+static void prague_enter_loss(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->loss_cwnd = tcp_snd_cwnd(tp);
+	ca->loss_rate_bytes = ca->rate_bytes;
+	if (prague_is_rtt_indep(sk) && ca->cwnd_mode == 1) {
+		ca->rate_bytes -= (ca->rate_bytes >> 1);
+		ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk);
+	} else {
+		ca->frac_cwnd -= (ca->frac_cwnd >> 1);
+		ca->rate_bytes = prague_frac_cwnd_to_pacing_rate(sk);
+	}
+	ca->in_loss = 1;
+}
+
+static void prague_enter_cwr(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u64 reduction;
+	u64 alpha;
+
+	if (prague_is_rtt_indep(sk) &&
+	    (prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, ca->cwr_stamp))
+		return;
+	ca->cwr_stamp = tp->tcp_mstamp;
+	alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G;
+
+	if (prague_is_rtt_indep(sk) && ca->cwnd_mode == 1) {
+		reduction = mul_u64_u64_shr(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1);
+		ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE);
+		ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk);
+	} else {
+		reduction = (alpha * (ca->frac_cwnd) +
+			     /* Unbias the rounding by adding 1/2 */
+			     PRAGUE_MAX_ALPHA) >>
+			     (PRAGUE_ALPHA_BITS + 1U);
+		ca->frac_cwnd -= reduction;
+		ca->rate_bytes = prague_frac_cwnd_to_pacing_rate(sk);
+	}
+}
+
+static void prague_state(struct sock *sk, u8 new_state)
+{
+	if (new_state == inet_csk(sk)->icsk_ca_state)
+		return;
+
+	switch (new_state) {
+	case TCP_CA_Recovery:
+		prague_enter_loss(sk);
+		break;
+	case TCP_CA_CWR:
+		prague_enter_cwr(sk);
+		break;
+	case TCP_CA_Open:
+		prague_ca_open(sk);
+		break;
+	}
+}
+
+static void prague_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+	if (ev == CA_EVENT_LOSS)
+		prague_enter_loss(sk);
+}
+
+static u32 prague_cwnd_undo(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* We may have made some progress since then, account for it. */
+	ca->in_loss = 0;
+	ca->rate_bytes = max(ca->rate_bytes, ca->loss_rate_bytes);
+	ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk);
+	return max(ca->loss_cwnd, tcp_snd_cwnd(tp));
+}
+
+static void prague_cong_control(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	prague_update_cwnd(sk, rs);
+	if (prague_should_update_ewma(sk))
+		prague_update_alpha(sk);
+	prague_update_pacing_rate(sk);
+	if (prague_cwnd_mode > 1) {
+		u64 cwnd_bytes = tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache);
+		u64 cwnd_bytes_transit = prague_cwnd_transit * MTU_SYS;
+
+		if (likely(ca->saw_ce) && cwnd_bytes <= cwnd_bytes_transit)
+			ca->cwnd_mode = 1;
+		else if (unlikely(!ca->saw_ce) || cwnd_bytes > cwnd_bytes_transit)
+			ca->cwnd_mode = 0;
+	}
+}
+
+static u32 prague_ssthresh(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	return tp->snd_ssthresh;
+}
+
+static u32 prague_tso_segs(struct sock *sk, unsigned int mss_now)
+{
+	u32 tso_segs = prague_ca(sk)->max_tso_burst;
+
+	if (prague_max_tso_segs)
+		tso_segs = min(tso_segs, prague_max_tso_segs);
+
+	return tso_segs;
+}
+
+static size_t prague_get_info(struct sock *sk, u32 ext, int *attr,
+			      union tcp_cc_info *info)
+{
+	const struct prague *ca = prague_ca(sk);
+
+	if (ext & (1 << (INET_DIAG_PRAGUEINFO - 1)) ||
+	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
+		memset(&info->prague, 0, sizeof(info->prague));
+		if (inet_csk(sk)->icsk_ca_ops != &prague_reno) {
+			info->prague.prague_alpha =
+				ca->upscaled_alpha >> PRAGUE_SHIFT_G;
+			info->prague.prague_max_burst = ca->max_tso_burst;
+			info->prague.prague_round = ca->round;
+			info->prague.prague_rate_bytes =
+				READ_ONCE(ca->rate_bytes);
+			info->prague.prague_frac_cwnd =
+				READ_ONCE(ca->frac_cwnd);
+			info->prague.prague_rtt_target =
+				prague_target_rtt(sk);
+		}
+		*attr = INET_DIAG_PRAGUEINFO;
+		return sizeof(info->prague);
+	}
+	return 0;
+}
+
+static void prague_release(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NEEDED, SK_PACING_NONE);
+	tp->ecn_flags &= ~TCP_ECN_ECT_1;
+	if (!tcp_ecn_mode_any(tp))
+		/* We forced the use of ECN, but failed to negotiate it */
+		INET_ECN_dontxmit(sk);
+
+	LOG(sk, "Released [delivered_ce=%u,received_ce=%u]",
+	    tp->delivered_ce, tp->received_ce);
+}
+
+static void prague_init(struct sock *sk)
+{
+	struct prague *ca = prague_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_ecn_mode_any(tp) &&
+	    sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) {
+		prague_release(sk);
+		LOG(sk, "Switching to pure reno [ecn_status=%u,sk_state=%u]",
+		    tcp_ecn_mode_any(tp), sk->sk_state);
+		inet_csk(sk)->icsk_ca_ops = &prague_reno;
+		return;
+	}
+
+	tp->ecn_flags |= TCP_ECN_ECT_1;
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+	/* If we have an initial RTT estimate, ensure we have an initial pacing
+	 * rate to use if net.ipv4.tcp_pace_iw is set.
+	 */
+	ca->alpha_stamp = tp->tcp_mstamp;
+	if (!prague_alpha_mode)
+		ca->upscaled_alpha = 0;
+	else
+		ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G;
+	ca->frac_cwnd = (u64)tcp_snd_cwnd(tp) << CWND_UNIT;
+	ca->max_tso_burst = 1;
+
+	/* rate initialization */
+	if (tp->srtt_us) {
+		ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) *
+					 tcp_mss_to_mtu(sk, tp->mss_cache),
+					 tp->srtt_us);
+		ca->rate_bytes = max_t(u64, ca->rate_bytes * tcp_snd_cwnd(tp), MINIMUM_RATE);
+	} else {
+		ca->rate_bytes = MINIMUM_RATE;
+	}
+	prague_update_pacing_rate(sk);
+	ca->loss_rate_bytes = 0;
+	ca->round = 0;
+	ca->saw_ce = !!tp->delivered_ce;
+
+	ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache) ?: MTU_SYS;
+	/* Default to 1us (in srtt_us units) when we have no RTT sample yet */
+	ca->hsrtt_us = tp->srtt_us ? (((u64)tp->srtt_us) << HSRTT_SHIFT) : (1 << (HSRTT_SHIFT + 3));
+	ca->cwnd_mode = (prague_cwnd_mode <= 1) ? prague_cwnd_mode : 0;
+
+	prague_new_round(sk);
+}
+
+static struct tcp_congestion_ops prague __read_mostly = {
+	.init		= prague_init,
+	.release	= prague_release,
+	.cong_control	= prague_cong_control,
+	.cwnd_event	= prague_cwnd_event,
+	.ssthresh	= prague_ssthresh,
+	.undo_cwnd	= prague_cwnd_undo,
+	.set_state	= prague_state,
+	.get_info	= prague_get_info,
+	.tso_segs	= prague_tso_segs,
+	.flags		= TCP_CONG_NEEDS_ECN |
+			  TCP_CONG_NEEDS_ACCECN |
+			  TCP_CONG_NO_FALLBACK_RFC3168 |
+			  TCP_CONG_NON_RESTRICTED,
+	.owner		= THIS_MODULE,
+	.name		= "prague",
+};
+
+static struct tcp_congestion_ops prague_reno __read_mostly = {
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.undo_cwnd	= tcp_reno_undo_cwnd,
+	.get_info	= prague_get_info,
+	.owner		= THIS_MODULE,
+	.name		= "prague-reno",
+};
+
+static int __init prague_register(void)
+{
+	BUILD_BUG_ON(sizeof(struct prague) > ICSK_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&prague);
+}
+
+static void __exit prague_unregister(void)
+{
+	tcp_unregister_congestion_control(&prague);
+}
+
+module_init(prague_register);
+module_exit(prague_unregister);
+
+MODULE_DESCRIPTION("TCP Prague");
+MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>");
+MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@...ia.com>");
+MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@...ia-bell-labs.com>");
+MODULE_AUTHOR("Bob Briscoe <research@...briscoe.net>");
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.7");
-- 
2.34.1

