Message-ID: <aYSZzlHXT7wBJu_e@volt-roccet-vm>
Date: Thu, 5 Feb 2026 14:23:26 +0100
From: Tim Fuechsel <t.fuechsel@....de>
To: "David S. Miller" <davem@...emloft.net>,
	David Ahern <dsahern@...nel.org>,
	Eric Dumazet <edumazet@...gle.com>,
	Jakub Kicinski <kuba@...nel.org>, Paolo Abeni <pabeni@...hat.com>,
	Simon Horman <horms@...nel.org>,
	Neal Cardwell <ncardwell@...gle.com>,
	Kuniyuki Iwashima <kuniyu@...gle.com>, linux-kernel@...r.kernel.org,
	netdev@...r.kernel.org, lukas.prause@....uni-hannover.de,
	t.fuechsel@....de
Subject: [RFC] tcp: Add TCP ROCCET congestion control module.

TCP ROCCET is an extension of TCP CUBIC that improves its overall
performance. Due to its mode of operation, CUBIC causes bufferbloat
while probing for the available throughput of a network path. This is
particularly a problem with the large buffers found in mobile networks.
A more detailed description and analysis of this problem caused by TCP
CUBIC can be found in [1]. TCP ROCCET addresses it by adding two
additional metrics for detecting congestion (queueing and bufferbloat)
on a network path. TCP ROCCET achieves better performance than CUBIC
and BBRv3 by maintaining similar throughput while reducing latency.
In addition, TCP ROCCET does not exhibit fairness issues when sharing
a link with TCP CUBIC or BBRv3. A paper evaluating the performance and
operation of TCP ROCCET has been peer-reviewed and will be presented at
the WONS 2026 conference. A draft of this paper can be found at [2],
and a short testing recipe follows the references below.

[1] https://doi.org/10.1109/VTC2023-Fall60731.2023.10333357
[2] https://arxiv.org/abs/2510.25281
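
For testing, the module can be selected like any other congestion
control once built (a minimal recipe, assuming CONFIG_TCP_CONG_ROCCET=m
and root privileges):

    modprobe tcp_roccet
    sysctl -w net.ipv4.tcp_congestion_control=roccet

or per socket, using the standard TCP_CONGESTION socket option:

    setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "roccet", strlen("roccet"));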

Signed-off-by: Lukas Prause <lukas.prause@....uni-hannover.de>
Signed-off-by: Tim Fuechsel <t.fuechsel@....de>
---
 net/ipv4/Kconfig      |  12 +
 net/ipv4/Makefile     |   1 +
 net/ipv4/tcp_roccet.c | 686 ++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_roccet.h |  52 ++++
 4 files changed, 751 insertions(+)
 create mode 100644 net/ipv4/tcp_roccet.c
 create mode 100644 net/ipv4/tcp_roccet.h

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index b71c22475c51..781a0db37309 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -664,6 +664,18 @@ config TCP_CONG_CDG
 	    delay gradients." In Networking 2011. Preprint:
 	    http://caia.swin.edu.au/cv/dahayes/content/networking2011-cdg-preprint.pdf
 
+config TCP_CONG_ROCCET
+	tristate "ROCCET TCP"
+	default n
+	help
+	  TCP ROCCET is a sender-side only modification of the TCP CUBIC
+	  congestion control that optimizes TCP performance. Especially in
+	  networks with large buffers (wireless and cellular networks),
+	  TCP ROCCET improves performance by maintaining throughput similar
+	  to CUBIC while reducing latency.
+
+	  For more information, see: https://arxiv.org/abs/2510.25281
+
 config TCP_CONG_BBR
 	tristate "BBR TCP"
 	default n
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ec36d2ec059e..35fa62b6d07f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_ROCCET) += tcp_roccet.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/tcp_roccet.c b/net/ipv4/tcp_roccet.c
new file mode 100644
index 000000000000..998a97bcb03e
--- /dev/null
+++ b/net/ipv4/tcp_roccet.c
@@ -0,0 +1,686 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP ROCCET: An RTT-Oriented CUBIC Congestion Control
+ * Extension for 5G and Beyond Networks
+ *
+ * TCP ROCCET is a new TCP congestion control algorithm
+ * suited for current 5G NR and beyond cellular networks.
+ * It extends the kernel's default congestion control, CUBIC,
+ * improves its performance, and additionally resolves
+ * unwanted side effects of CUBIC's implementation.
+ * ROCCET uses its own slow start, called LAUNCH, in which
+ * loss is not considered a congestion event.
+ * The congestion avoidance phase, called ORBITER, uses
+ * CUBIC's window growth function and adds congestion events
+ * based on RTT and ACK rate.
+ *
+ * A peer-reviewed paper on TCP ROCCET will be presented at the
+ * WONS 2026 conference. A draft of the paper is available at:
+ *	https://arxiv.org/abs/2510.25281
+ *
+ * Further information about CUBIC:
+ * TCP CUBIC: Binary Increase Congestion control for TCP v2.3
+ * Home page:
+ *	http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
+ * This is from the implementation of CUBIC TCP in
+ * Sangtae Ha, Injong Rhee and Lisong Xu,
+ *  "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
+ *  in ACM SIGOPS Operating System Review, July 2008.
+ * Available from:
+ *	http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
+ *
+ * CUBIC integrates a new slow start algorithm, called HyStart.
+ * The details of HyStart are presented in
+ *  Sangtae Ha and Injong Rhee,
+ *  "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
+ * Available from:
+ *  http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
+ *
+ * All testing results are available from:
+ * http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
+ *
+ * Unless ROCCET is enabled and the congestion window is large,
+ * this behaves the same as the original Reno.
+ */
+
+#include "tcp_roccet.h"
+#include "linux/printk.h"
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/math64.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Scale factor beta calculation (max_cwnd = snd_cwnd * beta) */
+#define BICTCP_BETA_SCALE 1024
+
+#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
+
+/* Alpha value for the srRTT, multiplied by 100.
+ * Here 20 represents a value of 0.2.
+ */
+#define ROCCET_ALPHA_TIMES_100 20
+
+/* The number of seconds ROCCET retains a minRTT sample.
+ * Only used when "calculate_min_rtt" is enabled.
+ */
+#define ROCCET_RTT_LOOKBACK_S 10
+
+/* Parameters specific to the ROCCET algorithm */
+static int sr_rtt_upper_bound __read_mostly = 100;
+static int ack_rate_diff_ss __read_mostly = 10;
+static int ack_rate_diff_ca __read_mostly = 200;
+static bool calculate_min_rtt __read_mostly;
+static bool ignore_loss __read_mostly;
+static int roccet_min_rtt_interpolation_factor __read_mostly = 70;
+
+module_param(sr_rtt_upper_bound, int, 0644);
+MODULE_PARM_DESC(sr_rtt_upper_bound, "ROCCET's upper bound for srRTT.");
+module_param(ack_rate_diff_ss, int, 0644);
+MODULE_PARM_DESC(ack_rate_diff_ss,
+		 "ROCCET's threshold to exit slow start if ACK-rate defer by given amount of segments.");
+module_param(ack_rate_diff_ca, int, 0644);
+MODULE_PARM_DESC(ack_rate_diff_ca,
+		 "ROCCET's threshold for ack-rate and cum_cwnd, in percentage of the current cwnd.");
+module_param(calculate_min_rtt, bool, 0644);
+MODULE_PARM_DESC(calculate_min_rtt,
+		 "Calculate min RTT if no lower RTT occurs after 10 sec.");
+module_param(ignore_loss, bool, 0644);
+MODULE_PARM_DESC(ignore_loss, "Ignore loss as a congestion event.");
+module_param(roccet_min_rtt_interpolation_factor, int, 0644);
+MODULE_PARM_DESC(roccet_min_rtt_interpolation_factor,
+		 "ROCCET factor for interpolating the current RTT with the last minRTT (minRTT = (factor * currRTT + (100-factor) * minRTT) / 100)");
+
+static bool fast_convergence __read_mostly = true;
+static int beta __read_mostly = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh __read_mostly;
+static int bic_scale __read_mostly = 41;
+static bool tcp_friendliness __read_mostly = true;
+
+static u32 cube_rtt_scale __read_mostly;
+static u32 beta_scale __read_mostly;
+static u64 cube_factor __read_mostly;
+
+/* Note parameters that are used for precomputing scale factors are read-only */
+module_param(fast_convergence, bool, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(bic_scale, int, 0444);
+MODULE_PARM_DESC(bic_scale,
+		 "scale (scaled by 1024) value for bic function (bic_scale/1024)");
+module_param(tcp_friendliness, bool, 0644);
+MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
+
+static inline void roccettcp_reset(struct roccettcp *ca)
+{
+	memset(ca, 0, offsetof(struct roccettcp, curr_rtt));
+	ca->bw_limit.sum_cwnd = 1;
+	ca->bw_limit.sum_acked = 1;
+	ca->bw_limit.next_check = 0;
+	ca->curr_min_rtt_timed.rtt = ~0U;
+	ca->curr_min_rtt_timed.time = ~0U;
+	ca->ece_srrtt = 0;
+	ca->ece_cwnd = 2;
+}
+
+static inline void update_min_rtt(struct sock *sk)
+{
+	struct roccettcp *ca = inet_csk_ca(sk);
+	u32 now = jiffies_to_usecs(tcp_jiffies32);
+
+	if (now - ca->curr_min_rtt_timed.time >
+		    ROCCET_RTT_LOOKBACK_S * USEC_PER_SEC &&
+	    calculate_min_rtt) {
+		u32 new_min_rtt = max(ca->curr_rtt, 1U);
+		u32 old_min_rtt = ca->curr_min_rtt_timed.rtt;
+
+		u32 interpolated_min_rtt =
+			(new_min_rtt * roccet_min_rtt_interpolation_factor +
+			 old_min_rtt *
+				 (100 - roccet_min_rtt_interpolation_factor)) /
+			100;
+
+		ca->curr_min_rtt_timed.rtt = interpolated_min_rtt;
+		ca->curr_min_rtt_timed.time = now;
+	}
+
+	/* Check if new lower min RTT was found. If so, set it directly */
+	if (ca->curr_rtt < ca->curr_min_rtt_timed.rtt) {
+		ca->curr_min_rtt_timed.rtt = max(ca->curr_rtt, 1U);
+		ca->curr_min_rtt_timed.time = now;
+	}
+}
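+
+/* Illustrative example of the interpolation in update_min_rtt() above,
+ * with the default roccet_min_rtt_interpolation_factor of 70:
+ * a minRTT of 20ms that has been stale for more than 10s and a currRTT
+ * of 50ms yield (70 * 50 + 30 * 20) / 100 = 41ms as the new minRTT.
+ */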
+
+/* Return difference between last and current ack rate.
+ */
+static inline int get_ack_rate_diff(struct roccettcp *ca)
+{
+	return ca->ack_rate.last_rate - ca->ack_rate.curr_rate;
+}
+
+/* Update the ACK rate, sampled over 100ms intervals. */
+static inline void update_ack_rate(struct sock *sk)
+{
+	struct roccettcp *ca = inet_csk_ca(sk);
+	u32 now = jiffies_to_usecs(tcp_jiffies32);
+	u32 interval = USEC_PER_MSEC * 100;
+
+	if ((u32)(now - ca->ack_rate.last_rate_time) >= interval) {
+		ca->ack_rate.last_rate_time = now;
+		ca->ack_rate.last_rate = ca->ack_rate.curr_rate;
+		ca->ack_rate.curr_rate = ca->ack_rate.cnt;
+		ca->ack_rate.cnt = 0;
+	} else {
+		ca->ack_rate.cnt += 1;
+	}
+}
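+
+/* Illustrative note on update_ack_rate() above: once 100ms have
+ * elapsed, the window rolls over; curr_rate becomes the number of ACKs
+ * counted during the previous window and cnt restarts at 0, so a
+ * "rate" here is a per-100ms ACK count rather than a normalized rate.
+ */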
+
+/* Compute srRTT.
+ */
+static inline void update_srrtt(struct sock *sk)
+{
+	struct roccettcp *ca = inet_csk_ca(sk);
+	u32 rrtt;
+
+	if (ca->curr_min_rtt_timed.rtt == 0)
+		return;
+
+	/* Calculate the new rRTT (scaled by 100):
+	 * 100 * ((sRTT - sRTT_min) / sRTT_min)
+	 */
+	rrtt = (100 * (ca->curr_rtt - ca->curr_min_rtt_timed.rtt)) /
+	       ca->curr_min_rtt_timed.rtt;
+
+	/* (1 - alpha) * srRTT + alpha * rRTT */
+	ca->curr_srrtt = ((100 - ROCCET_ALPHA_TIMES_100) * ca->curr_srrtt +
+			  ROCCET_ALPHA_TIMES_100 * rrtt) /
+			 100;
+}
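+
+/* Worked example (illustrative): minRTT = 20ms, sRTT = 30ms
+ *   => rRTT = 100 * (30 - 20) / 20 = 50
+ * and with alpha = 0.2 and a previous srRTT of 40:
+ *   => srRTT = (80 * 40 + 20 * 50) / 100 = 42
+ */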
+
+__bpf_kfunc static void roccettcp_init(struct sock *sk)
+{
+	struct roccettcp *ca = inet_csk_ca(sk);
+
+	roccettcp_reset(ca);
+
+	if (initial_ssthresh)
+		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+
+	/* Initial roccet parameters */
+	ca->roccet_last_event_time_us = 0;
+	ca->ack_rate.last_rate = 0;
+	ca->ack_rate.last_rate_time = 0;
+	ca->ack_rate.curr_rate = 0;
+	ca->ack_rate.cnt = 0;
+}
+
+__bpf_kfunc static void roccettcp_cwnd_event(struct sock *sk,
+					     enum tcp_ca_event event)
+{
+	if (event == CA_EVENT_TX_START) {
+		struct roccettcp *ca = inet_csk_ca(sk);
+		u32 now = tcp_jiffies32;
+		s32 delta;
+
+		delta = now - tcp_sk(sk)->lsndtime;
+
+		/* We were application limited (idle) for a while.
+		 * Shift epoch_start to keep cwnd growth to cubic curve.
+		 */
+		if (ca->epoch_start && delta > 0) {
+			ca->epoch_start += delta;
+			if (after(ca->epoch_start, now))
+				ca->epoch_start = now;
+		}
+		return;
+	}
+}
+
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
+ */
+static u32 cubic_root(u64 a)
+{
+	u32 x, b, shift;
+	/* cbrt(x) MSB values for x MSB values in [0..63].
+	 * Precomputed then refined by hand - Willy Tarreau
+	 *
+	 * For x in [0..63],
+	 *   v = cbrt(x << 18) - 1
+	 *   cbrt(x) = (v[x] + 10) >> 6
+	 */
+	static const u8 v[] = {
+		/* 0x00 */ 0,	54,  54,  54,  118, 118, 118, 118,
+		/* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156,
+		/* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179,
+		/* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199,
+		/* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215,
+		/* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229,
+		/* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242,
+		/* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254,
+	};
+
+	b = fls64(a);
+	if (b < 7) {
+		/* a in [0..63] */
+		return ((u32)v[(u32)a] + 35) >> 6;
+	}
+
+	b = ((b * 84) >> 8) - 1;
+	shift = (a >> (b * 3));
+
+	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;
+
+	/* Newton-Raphson iteration
+	 *                         2
+	 * x    = ( 2 * x  +  a / x  ) / 3
+	 *  k+1          k         k
+	 */
+	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
+	x = ((x * 341) >> 10);
+	return x;
+}
+
+/* Compute congestion window to use.
+ */
+static inline void bictcp_update(struct roccettcp *ca, u32 cwnd, u32 acked)
+{
+	u32 delta, bic_target, max_cnt;
+	u64 offs, t;
+
+	ca->ack_cnt += acked; /* count the number of ACKed packets */
+
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
+		return;
+
+	/* The CUBIC function can update ca->cnt at most once per jiffy.
+	 * On all cwnd reduction events, ca->epoch_start is set to 0,
+	 * which will force a recalculation of ca->cnt.
+	 */
+	if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
+		goto tcp_friendliness;
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_jiffies32;
+
+	if (ca->epoch_start == 0) {
+		ca->epoch_start = tcp_jiffies32; /* record beginning */
+		ca->ack_cnt = acked; /* start counting */
+		ca->tcp_cwnd = cwnd; /* sync with cubic */
+
+		if (ca->last_max_cwnd <= cwnd) {
+			ca->bic_K = 0;
+			ca->bic_origin_point = cwnd;
+		} else {
+			/* Compute new K based on
+			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+			 */
+			ca->bic_K = cubic_root(cube_factor *
+					       (ca->last_max_cwnd - cwnd));
+			ca->bic_origin_point = ca->last_max_cwnd;
+		}
+	}
+
+	/* cubic function - calc */
+	/* calculate c * time^3 / rtt,
+	 *  while considering overflow in calculation of time^3
+	 * (so time^3 is done by using 64 bit)
+	 * and without the support of division of 64bit numbers
+	 * (so all divisions are done by using 32 bit)
+	 *  also NOTE the unit of those variables
+	 *	  time  = (t - K) / 2^bictcp_HZ
+	 *	  c = bic_scale >> 10
+	 * rtt  = (srtt >> 3) / HZ
+	 * !!! The following code does not have overflow problems,
+	 * if the cwnd < 1 million packets !!!
+	 */
+
+	t = (s32)(tcp_jiffies32 - ca->epoch_start);
+	t += usecs_to_jiffies(ca->delay_min);
+
+	/* change the unit from HZ to bictcp_HZ */
+	t <<= BICTCP_HZ;
+	do_div(t, HZ);
+
+	if (t < ca->bic_K) /* t - K */
+		offs = ca->bic_K - t;
+	else
+		offs = t - ca->bic_K;
+
+	/* c/rtt * (t-K)^3 */
+	delta = (cube_rtt_scale * offs * offs * offs) >> (10 + 3 * BICTCP_HZ);
+	if (t < ca->bic_K) /* below origin*/
+		bic_target = ca->bic_origin_point - delta;
+	else /* above origin*/
+		bic_target = ca->bic_origin_point + delta;
+
+	/* cubic function - calc bictcp_cnt*/
+	if (bic_target > cwnd)
+		ca->cnt = cwnd / (bic_target - cwnd);
+	else
+		ca->cnt = 100 * cwnd; /* very small increment*/
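+
+	/* ca->cnt is consumed via tcp_cong_avoid_ai(): the cwnd grows by
+	 * one segment per ca->cnt ACKed segments. Illustrative example:
+	 * cwnd = 100 and bic_target = 110 give cnt = 100 / 10 = 10, so
+	 * the cubic target is reached within roughly one RTT.
+	 */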
+
+	/* The initial growth of cubic function may be too conservative
+	 * when the available bandwidth is still unknown.
+	 */
+	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
+		ca->cnt = 20; /* increase cwnd 5% per RTT */
+
+tcp_friendliness:
+	/* TCP Friendly */
+	if (tcp_friendliness) {
+		u32 scale = beta_scale;
+
+		delta = (cwnd * scale) >> 3;
+		while (ca->ack_cnt > delta) { /* update tcp cwnd */
+			ca->ack_cnt -= delta;
+			ca->tcp_cwnd++;
+		}
+
+		if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
+			delta = ca->tcp_cwnd - cwnd;
+			max_cnt = cwnd / delta;
+			if (ca->cnt > max_cnt)
+				ca->cnt = max_cnt;
+		}
+	}
+
+	/* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+	 * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+	 */
+	ca->cnt = max(ca->cnt, 2U);
+}
+
+__bpf_kfunc static void roccettcp_cong_avoid(struct sock *sk, u32 ack,
+					     u32 acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct roccettcp *ca = inet_csk_ca(sk);
+
+	u32 now = jiffies_to_usecs(tcp_jiffies32);
+	u32 bw_limit_detect = 0;
+	u32 roccet_xj;
+	u32 jitter;
+
+	if (ca->last_rtt > ca->curr_rtt)
+		jitter = ca->last_rtt - ca->curr_rtt;
+	else
+		jitter = ca->curr_rtt - ca->last_rtt;
+
+	/* Update roccet parameters */
+	update_ack_rate(sk);
+	update_min_rtt(sk);
+	update_srrtt(sk);
+
+	/* Reset ECE handling if we already have more bandwidth
+	 * than we received the last ECE.
+	 */
+	if (ca->ece_srrtt > 0) {
+		if (tcp_snd_cwnd(tp) >= ca->ece_cwnd)
+			ca->ece_srrtt = 0;
+	}
+
+	/* ROCCET drain:
+	 * do not increase the cwnd for 100ms after a ROCCET congestion event.
+	 */
+	if (now - ca->roccet_last_event_time_us <= 100 * USEC_PER_MSEC)
+		return;
+
+	/* LAUNCH: detect an exit point for TCP slow start in networks
+	 * with large buffers of multiple BDPs, as in cellular networks
+	 * (5G, ...). Also exit LAUNCH if the cwnd is too large for the
+	 * application-layer data rate.
+	 */
+
+	if ((tcp_in_slow_start(tp) && ca->curr_srrtt > sr_rtt_upper_bound &&
+	     get_ack_rate_diff(ca) >= ack_rate_diff_ss) ||
+		(!tcp_is_cwnd_limited(sk) && tcp_in_slow_start(tp))) {
+		ca->epoch_start = 0;
+
+		/* Handle the initial slow start; this is where we observe
+		 * most of the problems.
+		 */
+		if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
+			tp->snd_ssthresh = tcp_snd_cwnd(tp) / 2;
+			tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) / 2);
+		} else {
+			tp->snd_ssthresh =
+				tcp_snd_cwnd(tp) - (tcp_snd_cwnd(tp) / 3);
+			tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) -
+						     (tcp_snd_cwnd(tp) / 3));
+		}
+		ca->roccet_last_event_time_us = now;
+		return;
+	}
+
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
+	}
+
+	if (ca->bw_limit.next_check == 0)
+		ca->bw_limit.next_check = now + 5 * ca->curr_rtt;
+
+	ca->bw_limit.sum_cwnd += tcp_snd_cwnd(tp);
+	ca->bw_limit.sum_acked += acked;
+
+	if (ca->bw_limit.next_check < now) {
+		/* We sent more data than was ACKed during the last 5 RTTs */
+		if ((ca->bw_limit.sum_cwnd * 100) / ca->bw_limit.sum_acked >=
+		    ack_rate_diff_ca)
+			bw_limit_detect = 1;
+
+		/* reset struct and set next end of period */
+		ca->bw_limit.sum_cwnd = 1;
+
+		/* set to 1 to avoid division by zero */
+		ca->bw_limit.sum_acked = 1;
+		ca->bw_limit.next_check = now + 5 * ca->curr_rtt;
+	}
+
+	/* Respect the connection's jitter and add it on top of the upper
+	 * bound for the srRTT.
+	 */
+	roccet_xj = ((jitter * 100) / ca->curr_min_rtt_timed.rtt) +
+		    sr_rtt_upper_bound;
+	if (roccet_xj < sr_rtt_upper_bound)
+		roccet_xj = sr_rtt_upper_bound;
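+
+	/* Illustrative example: jitter = 5ms, minRTT = 20ms and the
+	 * default sr_rtt_upper_bound of 100 give
+	 * roccet_xj = (5 * 100) / 20 + 100 = 125.
+	 */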
+
+	/* A nonzero ece_srrtt means we recently received an ECE bit;
+	 * respect the srRTT recorded at that point.
+	 */
+	if (ca->ece_srrtt < roccet_xj && ca->ece_srrtt > 0)
+		roccet_xj = ca->ece_srrtt;
+
+	if (ca->curr_srrtt > roccet_xj && (bw_limit_detect || ca->ece_srrtt > 0)) {
+		ca->epoch_start = 0;
+		ca->roccet_last_event_time_us = now;
+		ca->cnt = 100 * tcp_snd_cwnd(tp);
+
+		/* Set Wmax if cwnd is larger than the old Wmax */
+		if (tcp_snd_cwnd(tp) > ca->last_max_cwnd)
+			ca->last_max_cwnd = tcp_snd_cwnd(tp);
+
+		tcp_snd_cwnd_set(tp, min(tp->snd_cwnd_clamp,
+					 max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U)));
+		tp->snd_ssthresh = tcp_snd_cwnd(tp);
+		return;
+	}
+
+	/* Return if the cwnd is not fully utilized. In mobile networks such
+	 * as 5G, this early return can freeze the cwnd at an excessively
+	 * high value: slow start or HyStart massively overshoot the
+	 * available bandwidth and leave the cwnd too high, so it can never
+	 * be fully utilized because the sending rate is limited by the
+	 * link capacity.
+	 */
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	bictcp_update(ca, tcp_snd_cwnd(tp), acked);
+	tcp_cong_avoid_ai(tp, max(1U, ca->cnt), acked);
+}
+
+__bpf_kfunc static u32 roccettcp_recalc_ssthresh(struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct roccettcp *ca = inet_csk_ca(sk);
+
+	if (ignore_loss)
+		return tcp_snd_cwnd(tp);
+
+	/* Don't exit slow start if loss occurs. */
+	if (tcp_in_slow_start(tp))
+		return tcp_snd_cwnd(tp);
+
+	ca->epoch_start = 0; /* end of epoch */
+
+	/* Wmax and fast convergence */
+	if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
+		ca->last_max_cwnd =
+			(tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) /
+			(2 * BICTCP_BETA_SCALE);
+	else
+		ca->last_max_cwnd = tcp_snd_cwnd(tp);
+
+	return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
+}
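+
+/* Example for roccettcp_recalc_ssthresh() above (illustrative):
+ * cwnd = 100 at the loss event
+ *   => ssthresh = 100 * 717 / 1024 = 70
+ * and with fast convergence (cwnd below the previous Wmax):
+ *   => last_max_cwnd = 100 * (1024 + 717) / 2048 = 85
+ */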
+
+__bpf_kfunc static void roccettcp_state(struct sock *sk, u8 new_state)
+{
+	struct roccettcp *ca = inet_csk_ca(sk);
+
+	if (new_state == TCP_CA_Loss)
+		roccettcp_reset(ca);
+}
+
+__bpf_kfunc static void roccettcp_acked(struct sock *sk,
+					const struct ack_sample *sample)
+{
+	struct roccettcp *ca = inet_csk_ca(sk);
+	u32 delay;
+
+	/* Some calls are for duplicates without timestamps */
+	if (sample->rtt_us < 0)
+		return;
+
+	/* Discard delay samples right after fast recovery */
+	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
+		return;
+
+	delay = sample->rtt_us;
+	if (delay == 0)
+		delay = 1;
+
+	/* First call, or the link delay decreased */
+	if (ca->delay_min == 0 || ca->delay_min > delay)
+		ca->delay_min = delay;
+
+	/* Record a valid RTT sample for ROCCET */
+	if (sample->rtt_us > 0) {
+		ca->last_rtt = ca->curr_rtt;
+		ca->curr_rtt = sample->rtt_us;
+	}
+}
+
+__bpf_kfunc static void roccet_in_ack_event(struct sock *sk, u32 flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct roccettcp *ca = inet_csk_ca(sk);
+
+	/* Handle ECE bit.
+	 * Processing of ECE events is done in roccettcp_cong_avoid()
+	 */
+	if (flags & CA_ACK_ECE) {
+		ca->ece_srrtt = ca->curr_srrtt;
+		ca->ece_cwnd = tcp_snd_cwnd(tp);
+	}
+}
+
+static struct tcp_congestion_ops roccet_tcp __read_mostly = {
+	.init = roccettcp_init,
+	.ssthresh = roccettcp_recalc_ssthresh,
+	.cong_avoid = roccettcp_cong_avoid,
+	.set_state = roccettcp_state,
+	.undo_cwnd = tcp_reno_undo_cwnd,
+	.cwnd_event = roccettcp_cwnd_event,
+	.pkts_acked = roccettcp_acked,
+	.in_ack_event = roccet_in_ack_event,
+	.owner = THIS_MODULE,
+	.name = "roccet",
+};
+
+BTF_KFUNCS_START(tcp_roccet_check_kfunc_ids)
+BTF_ID_FLAGS(func, roccettcp_init)
+BTF_ID_FLAGS(func, roccettcp_recalc_ssthresh)
+BTF_ID_FLAGS(func, roccettcp_cong_avoid)
+BTF_ID_FLAGS(func, roccettcp_state)
+BTF_ID_FLAGS(func, roccettcp_cwnd_event)
+BTF_ID_FLAGS(func, roccettcp_acked)
+BTF_KFUNCS_END(tcp_roccet_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_roccet_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &tcp_roccet_check_kfunc_ids,
+};
+
+static int __init roccettcp_register(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct roccettcp) > ICSK_CA_PRIV_SIZE);
+
+	/* Precompute a bunch of the scaling factors that are used per-packet
+	 * based on SRTT of 100ms
+	 */
+
+	beta_scale =
+		8 * (BICTCP_BETA_SCALE + beta) / 3 / (BICTCP_BETA_SCALE - beta);
+
+	cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
+
+	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
+	 * the unit of K is bictcp_HZ=2^10, not HZ
+	 *
+	 *  c = bic_scale >> 10
+	 *  rtt = 100ms
+	 *
+	 * the following code has been designed and tested for
+	 * cwnd < 1 million packets
+	 * RTT < 100 seconds
+ * HZ < 100,000,000 (corresponding to a 10 ns jiffy)
+	 */
+
+	/* 1/c * srtt * 2^(3*bictcp_HZ) */
+	cube_factor = 1ull << (10 + 3 * BICTCP_HZ); /* 2^40 */
+
+	/* divide by bic_scale and by constant Srtt (100ms) */
+	do_div(cube_factor, bic_scale * 10);
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					&tcp_roccet_kfunc_set);
+	if (ret < 0)
+		return ret;
+	return tcp_register_congestion_control(&roccet_tcp);
+}
+
+static void __exit roccettcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&roccet_tcp);
+}
+
+module_init(roccettcp_register);
+module_exit(roccettcp_unregister);
+
+MODULE_AUTHOR("Lukas Prause, Tim Fuechsel");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ROCCET TCP");
+MODULE_VERSION("1.0");
diff --git a/net/ipv4/tcp_roccet.h b/net/ipv4/tcp_roccet.h
new file mode 100644
index 000000000000..5168d57efec5
--- /dev/null
+++ b/net/ipv4/tcp_roccet.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * TCP ROCCET congestion control interface
+ */
+#ifndef __TCP_ROCCET_H
+#define __TCP_ROCCET_H 1
+
+#include <linux/math64.h>
+
+struct ack_rate {
+	u16 last_rate;	/* Last ACK-rate */
+	u32 last_rate_time;	/* Timestamp of the last ACK-rate */
+	u16 curr_rate;	/* Current ACK-rate */
+	u16 cnt;	/* Used for counting acks */
+};
+
+struct bandwidth_limit_detected {
+	u32 sum_cwnd;	/* sum of cwnd during time interval */
+	u32 sum_acked;	/* sum of received acks during time interval */
+	u32 next_check;	/* end/upper bound of time interval */
+};
+
+struct timed_rtt {
+	u32 time;	/* Time of recording */
+	u32 rtt;	/* Measured RTT */
+};
+
+/* Based on the bictcp struct, with additions specific to the ROCCET algorithm */
+struct roccettcp {
+	u32 cnt;		/* increase cwnd by 1 after ACKs */
+	u32 last_max_cwnd;	/* last maximum snd_cwnd */
+	u32 last_cwnd;		/* last snd_cwnd */
+	u32 last_time;		/* time when updated last_cwnd */
+	u32 bic_origin_point;	/* origin point of bic function */
+	u32 bic_K;		/* time to origin point from the beginning of the current epoch */
+	u32 delay_min;		/* min delay (usec) */
+	u32 epoch_start;	/* beginning of an epoch */
+	u32 ack_cnt;		/* number of acks */
+	u32 tcp_cwnd;		/* estimated tcp cwnd */
+	u32 curr_rtt;		/* minimum rtt of current round */
+
+	u32 roccet_last_event_time_us;	/* last time ROCCET was triggered */
+	u32 ece_cwnd;		/* cwnd when a ECE bit was received */
+	u32 ece_srrtt;		/* srRTT when the ECE was received */
+	struct timed_rtt curr_min_rtt_timed;	/* observed minRTT with the timestamp */
+	u32 curr_srrtt;		/* srRTT calculated based on the latest ACK */
+	struct ack_rate ack_rate;	/* last and the current ACK rate */
+	struct bandwidth_limit_detected bw_limit;
+	u32 last_rtt;		/* Used for jitter calculation */
+};
+
+#endif /* __TCP_ROCCET_H */
-- 
2.43.0

