[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20170728195919.10099-6-natale.patriciello@gmail.com>
Date: Fri, 28 Jul 2017 21:59:19 +0200
From: Natale Patriciello <natale.patriciello@...il.com>
To: "David S . Miller" <davem@...emloft.net>,
Alexey Kuznetsov <kuznet@....inr.ac.ru>,
James Morris <jmorris@...ei.org>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
Patrick McHardy <kaber@...sh.net>
Cc: netdev <netdev@...r.kernel.org>,
Ahmed Said <ahmed.said@...roma2.it>,
Natale Patriciello <natale.patriciello@...il.com>,
Francesco Zampognaro <zampognaro@....uniroma2.it>,
Cesare Roseti <roseti@....uniroma2.it>
Subject: [RFC PATCH v1 5/5] wave: Added basic version of TCP Wave
TCP Wave (TCPW) replaces the window-based transmission paradigm of the
standard TCP with a burst-based transmission, the ACK-clock scheduling
with a self-managed timer and the RTT-based congestion control loop
with an Ack-based Capacity and Congestion Estimation (ACCE) module. In
non-technical words, it sends data down the stack when its internal
timer expires, and the timing of the received ACKs contributes to
updating this timer regularly.
It is the first TCP congestion control that uses the timing constraint
developed in the Linux kernel.
Signed-off-by: Natale Patriciello <natale.patriciello@...il.com>
Tested-by: Ahmed Said <ahmed.said@...roma2.it>
---
MAINTAINERS | 6 +
net/ipv4/Kconfig | 16 +
net/ipv4/Makefile | 1 +
net/ipv4/tcp_output.c | 4 +-
net/ipv4/tcp_wave.c | 914 ++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 940 insertions(+), 1 deletion(-)
create mode 100644 net/ipv4/tcp_wave.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 767e9d202adf..39c57bdc417d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12427,6 +12427,12 @@ W: http://tcp-lp-mod.sourceforge.net/
S: Maintained
F: net/ipv4/tcp_lp.c
+TCP WAVE MODULE
+M: "Natale Patriciello" <natale.patriciello@...il.com>
+W: http://tcp-lp-mod.sourceforge.net/
+S: Maintained
+F: net/ipv4/tcp_wave.c
+
TDA10071 MEDIA DRIVER
M: Antti Palosaari <crope@....fi>
L: linux-media@...r.kernel.org
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 91a2557942fa..de23b3a04b98 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -492,6 +492,18 @@ config TCP_CONG_BIC
increase provides TCP friendliness.
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+config TCP_CONG_WAVE
+ tristate "Wave TCP"
+ default m
+ ---help---
+ TCP Wave (TCPW) replaces the window-based transmission paradigm of the
+ standard TCP with a burst-based transmission, the ACK-clock scheduling
+ with a self-managed timer and the RTT-based congestion control loop with
+ an Ack-based Capacity and Congestion Estimation (ACCE) module. In
+ non-technical words, it sends data down the stack when its internal
+ timer expires, and the timing of the received ACKs contributes to
+ updating this timer regularly.
+
config TCP_CONG_CUBIC
tristate "CUBIC TCP"
default y
@@ -690,6 +702,9 @@ choice
config DEFAULT_CUBIC
bool "Cubic" if TCP_CONG_CUBIC=y
+ config DEFAULT_WAVE
+ bool "Wave" if TCP_CONG_WAVE=y
+
config DEFAULT_HTCP
bool "Htcp" if TCP_CONG_HTCP=y
@@ -729,6 +744,7 @@ config DEFAULT_TCP_CONG
string
default "bic" if DEFAULT_BIC
default "cubic" if DEFAULT_CUBIC
+ default "wave" if DEFAULT_WAVE
default "htcp" if DEFAULT_HTCP
default "hybla" if DEFAULT_HYBLA
default "vegas" if DEFAULT_VEGAS
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f83de23a30e7..c5b3ae3cf5b1 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_WAVE) += tcp_wave.o
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index febce533c0a0..616daf46b3df 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2522,7 +2522,9 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
struct sk_buff *skb = tcp_send_head(sk);
- BUG_ON(!skb || skb->len < mss_now);
+ /* Nothing queued, or less than one full MSS: there is nothing useful to send */
+ if (!skb || skb->len < mss_now)
+ return;
tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}
diff --git a/net/ipv4/tcp_wave.c b/net/ipv4/tcp_wave.c
new file mode 100644
index 000000000000..079df4d223e2
--- /dev/null
+++ b/net/ipv4/tcp_wave.c
@@ -0,0 +1,914 @@
+/*
+ * TCP Wave
+ *
+ * Copyright 2017 Natale Patriciello <natale.patriciello@...il.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+
+/* Compile-time switch for the tracing emitted through DBG(); remove this
+ * define to silence it.
+ */
+#define WAVE_DEBUG 1
+
+#ifdef WAVE_DEBUG
+#define DBG(msg ...) printk(KERN_DEBUG "WAVE_DEBUG: " msg)
+#else
+/* no_printk() prints nothing, but it is declared printf-like, so format
+ * strings keep being checked against their arguments when tracing is off.
+ */
+#define DBG(msg ...) no_printk(KERN_DEBUG "WAVE_DEBUG: " msg)
+#endif
+
+/* Tunables. Each one is exported as a module parameter with mode 0644. */
+
+/* Size of a burst, in segments, used when the connection starts */
+static uint init_burst __read_mostly = 10;
+module_param(init_burst, uint, 0644);
+MODULE_PARM_DESC(init_burst, "initial burst (segments)");
+
+/* Lower bound, in segments, applied to every burst */
+static uint min_burst __read_mostly = 3;
+module_param(min_burst, uint, 0644);
+MODULE_PARM_DESC(min_burst, "minimum burst (segments)");
+
+/* Starting (and maximum) value of the transmission timer, in ms */
+static uint init_timer_ms __read_mostly = 500;
+module_param(init_timer_ms, uint, 0644);
+MODULE_PARM_DESC(init_timer_ms, "initial timer (ms)");
+
+/* delta-RTT threshold, in ms, that selects adjustment vs tracking mode */
+static uint beta_ms __read_mostly = 150;
+module_param(beta_ms, uint, 0644);
+MODULE_PARM_DESC(beta_ms, "beta parameter (ms)");
+
+/* Fixed-point scale (as a shift, and as a unit) of the weighted averages */
+#define AVG_SCALE 20
+#define AVG_UNIT (1 << AVG_SCALE)
+
+/* Bandwidth scale, taken from BBR */
+#define BW_SCALE 24
+#define BW_UNIT (1 << BW_SCALE)
+
+/* Bits stored in wavetcp::flags */
+
+/* init has been called for this socket */
+#define FLAG_INIT (1 << 0)
+/* the sender side has started (set on TX_START) */
+#define FLAG_START (1 << 1)
+/* the next sent size must be recorded as a burst */
+#define FLAG_SAVE (1 << 2)
+
+/* One node of the per-socket history of sent bursts */
+struct wavetcp_burst_hist {
+	u16 size;		/* segments sent in this burst */
+	struct list_head list;	/* linkage into the history list */
+};
+
+/* True when every bit of @value is set in *@flags */
+static __always_inline bool test_flag(u8 value, const u8 *flags)
+{
+	/* no bit of @value may be missing from *@flags */
+	return !(~*flags & value);
+}
+
+/* Turn on the bits of @value in *@flags */
+static __always_inline void set_flag(u8 value, u8 *flags)
+{
+	*flags = *flags | value;
+}
+
+/* Turn off the bits of @value in *@flags */
+static __always_inline void clear_flag(u8 value, u8 *flags)
+{
+	*flags = *flags & ~value;
+}
+
+/* TCP Wave per-socket private state (lives in the congestion-control
+ * private area of the socket; its size is checked at module load).
+ */
+struct wavetcp {
+	/* FLAG_* bits */
+	u8 flags;
+	/* Current transmission timer (us) */
+	u32 tx_timer;
+	/* Current burst size (segments) */
+	u16 burst;
+	/* Signed difference between the burst size and the segments actually
+	 * sent. It is compared against 0, so it must be an explicitly signed
+	 * type: plain char is unsigned on several architectures. s16 fills
+	 * the padding that followed the former char, so the struct size does
+	 * not change.
+	 */
+	s16 delta_segments;
+	/* Segments acked in the current round */
+	u16 pkts_acked;
+	/* Right-shift applied to the RTT by the blind-search heuristic */
+	u8 heuristic_scale;
+	/* Last accepted ack_train_disp value */
+	u32 previous_ack_train_disp;
+	/* Timestamp of the first ACK of the round */
+	u32 first_ack_time;
+	/* Interval (us) sampled with the first ACK of the round */
+	u32 backup_first_ack_time;
+	/* First RTT of the round */
+	u32 first_rtt;
+	/* Minimum RTT seen */
+	u32 min_rtt;
+	/* Average RTT of the previous round */
+	u32 avg_rtt;
+	/* Maximum RTT seen */
+	u32 max_rtt;
+	/* Rounds to wait before the timer may be updated again */
+	u8 stab_factor;
+	/* Memory cache the burst-history nodes are taken from */
+	struct kmem_cache *cache;
+	/* Head (sentinel) of the burst history */
+	struct wavetcp_burst_hist *history;
+	/* TCP source port, used only in debug output */
+	u16 sport;
+};
+
+/* Set up Wave for a socket (the ->init hook of the congestion ops).
+ *
+ * ssthresh is forced to 0 (see wavetcp_recalc_ssthresh) and the initial cwnd
+ * is set to the initial burst: at the first timer expiration the cwnd is
+ * then large enough to push init_burst segments out.
+ *
+ * The hook cannot report failure. If an allocation fails, FLAG_INIT is left
+ * clear, which makes the hooks that test it return early.
+ */
+static void wavetcp_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct wavetcp *ca = inet_csk_ca(sk);
+
+	/* Not initialised until every allocation below has succeeded */
+	ca->flags = 0;
+	ca->history = NULL;
+	ca->cache = NULL;
+
+	ca->sport = ntohs(inet_sk(sk)->inet_sport);
+
+	DBG("%u sport: %u [%s]\n", tcp_time_stamp, ca->sport,
+	    __func__);
+
+	/* Setting the initial Cwnd to 0 will not call the TX_START event */
+	tp->snd_ssthresh = 0;
+	tp->snd_cwnd = init_burst;
+
+	ca->burst = init_burst;
+	ca->delta_segments = init_burst;
+	ca->tx_timer = init_timer_ms * USEC_PER_MSEC;
+	ca->pkts_acked = 0;
+	ca->first_ack_time = 0;
+	ca->backup_first_ack_time = 0;
+	ca->heuristic_scale = 0;
+	ca->first_rtt = 0;
+	ca->min_rtt = -1; /* a lot of time */
+	ca->avg_rtt = 0;
+	ca->max_rtt = 0;
+	ca->stab_factor = 0;
+	ca->previous_ack_train_disp = 0;
+
+	/* Zeroed, so that the sentinel's size reads as 0 ("no burst").
+	 * GFP_ATOMIC because this hook may be reached from packet processing,
+	 * where sleeping is not allowed (NOTE(review): confirm against the
+	 * callers of ->init).
+	 */
+	ca->history = kzalloc(sizeof(*ca->history), GFP_ATOMIC);
+	if (!ca->history)
+		return;
+
+	/* Init the history of bwnd */
+	INIT_LIST_HEAD(&ca->history->list);
+
+	/* Init our cache pool for the bwnd history.
+	 * NOTE(review): this creates one kmem_cache per socket; a single
+	 * module-wide cache created at module load would avoid that.
+	 */
+	ca->cache = KMEM_CACHE(wavetcp_burst_hist, 0);
+	if (!ca->cache) {
+		kfree(ca->history);
+		ca->history = NULL;
+		return;
+	}
+
+	/* FLAG_INIT is what lets the other hooks run; FLAG_SAVE asks for the
+	 * first sent size to be recorded as a burst.
+	 */
+	ca->flags = FLAG_INIT | FLAG_SAVE;
+}
+
+/* Free everything wavetcp_init allocated for this socket (the ->release
+ * hook). Does nothing if the socket was never initialised.
+ */
+static void wavetcp_release(struct sock *sk)
+{
+	struct wavetcp *ca = inet_csk_ca(sk);
+	struct wavetcp_burst_hist *cur, *next;
+
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return;
+
+	DBG("%u sport: %u [%s]\n", tcp_time_stamp, ca->sport,
+	    __func__);
+
+	/* Check the head before walking it, not after */
+	if (ca->history) {
+		list_for_each_entry_safe(cur, next, &ca->history->list, list) {
+			list_del(&cur->list);
+			kmem_cache_free(ca->cache, cur);
+		}
+
+		kfree(ca->history);
+		ca->history = NULL;
+	}
+
+	/* Thanks for the cache, we don't need it anymore */
+	kmem_cache_destroy(ca->cache);
+	ca->cache = NULL;
+
+	/* Make any later hook invocation bail out instead of touching the
+	 * freed history.
+	 */
+	clear_flag(FLAG_INIT, &ca->flags);
+}
+
+/* Dump the size of every burst currently held in the history */
+static void wavetcp_print_history(struct wavetcp *ca)
+{
+	struct wavetcp_burst_hist *entry;
+
+	list_for_each_entry(entry, &ca->history->list, list) {
+		DBG("[%s] %u\n", __func__, entry->size);
+	}
+}
+
+/* The ->ssthresh hook. Always answers 0: with a zero slow-start threshold
+ * the connection is meant to stay in congestion avoidance forever.
+ */
+static u32 wavetcp_recalc_ssthresh(struct sock *sk)
+{
+	const u32 ssthresh = 0;
+
+	DBG("%u [%s]\n", tcp_time_stamp, __func__);
+
+	return ssthresh;
+}
+
+/* The ->set_state hook: only the transition to TCP_CA_Open is acted upon;
+ * every other state is just logged.
+ */
+static void wavetcp_state(struct sock *sk, u8 new_state)
+{
+	struct wavetcp *ca = inet_csk_ca(sk);
+
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return;
+
+	if (new_state != TCP_CA_Open) {
+		DBG("%u sport: %u [%s] set state %u, ignored\n",
+		    tcp_time_stamp, ca->sport, __func__, new_state);
+		return;
+	}
+
+	DBG("%u sport: %u [%s] set CA_Open\n", tcp_time_stamp,
+	    ca->sport, __func__);
+
+	/* We have fully recovered, so reset some variables */
+	ca->delta_segments = 0;
+}
+
+/* The ->undo_cwnd hook. Undo is not implemented yet: the current cwnd is
+ * returned unchanged, i.e. we stick to the decision made earlier.
+ */
+static u32 wavetcp_undo_cwnd(struct sock *sk)
+{
+	DBG("%u [%s]\n", tcp_time_stamp, __func__);
+
+	return tcp_sk(sk)->snd_cwnd;
+}
+
+/* Append the size of a sent burst to the tail of the burst history.
+ *
+ * @ca:    Wave private state; ca->cache and ca->history must be set up.
+ * @burst: number of segments sent (stored in a u16).
+ *
+ * Called from wavetcp_segment_sent, so the allocation must not sleep. If no
+ * memory is available the burst is simply not recorded, instead of taking
+ * the whole machine down.
+ */
+static void wavetcp_insert_burst(struct wavetcp *ca, u32 burst)
+{
+	struct wavetcp_burst_hist *cur;
+
+	DBG("%u sport: %u [%s] adding %u segment in the history of burst\n",
+	    tcp_time_stamp, ca->sport, __func__, burst);
+
+	/* Take the memory from the pre-allocated pool */
+	cur = kmem_cache_alloc(ca->cache, GFP_ATOMIC);
+	if (!cur) {
+		DBG("%u sport: %u [%s] WARNING! no memory, burst not saved\n",
+		    tcp_time_stamp, ca->sport, __func__);
+		return;
+	}
+
+	cur->size = burst;
+	list_add_tail(&cur->list, &ca->history->list);
+}
+
+/* The ->cwnd_event hook: CA_EVENT_TX_START raises FLAG_START, every other
+ * event is just logged.
+ */
+static void wavetcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+	struct wavetcp *ca = inet_csk_ca(sk);
+
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return;
+
+	if (event != CA_EVENT_TX_START) {
+		DBG("%u sport: %u [%s] got event %u, ignored\n",
+		    tcp_time_stamp, ca->sport, __func__, event);
+		return;
+	}
+
+	/* first transmit when no packets in flight */
+	DBG("%u sport: %u [%s] TX_START\n", tcp_time_stamp,
+	    ca->sport, __func__);
+
+	set_flag(FLAG_START, &ca->flags);
+}
+
+/* Adjustment mode: derive the stability factor from avg_rtt / tx_timer,
+ * then restart from the initial timer, with avg_rtt set to the maximum RTT
+ * seen and min_rtt reset.
+ *
+ * @ca:        Wave private state.
+ * @delta_rtt: not used by this function.
+ */
+static __always_inline void wavetcp_adj_mode(struct wavetcp *ca,
+					     unsigned long delta_rtt)
+{
+	u32 ratio = 0;
+
+	/* tx_timer is 0 when init_timer_ms has been set to 0 */
+	if (ca->tx_timer != 0)
+		ratio = ca->avg_rtt / ca->tx_timer;
+
+	/* stab_factor is a u8: saturate instead of wrapping around */
+	ca->stab_factor = min_t(u32, ratio, U8_MAX);
+
+	ca->min_rtt = -1; /* a lot of time */
+	ca->avg_rtt = ca->max_rtt;
+	ca->tx_timer = init_timer_ms * USEC_PER_MSEC;
+
+	DBG("%u sport: %u [%s] stab_factor %u, timer %u us, avg_rtt %u us\n",
+	    tcp_time_stamp, ca->sport, __func__, ca->stab_factor,
+	    ca->tx_timer, ca->avg_rtt);
+}
+
+/* Tracking mode: the new timer is ack_train_disp plus half of delta_rtt.
+ * The timer is left untouched when no dispersion is available, and it is
+ * never left at 0.
+ */
+static __always_inline void wavetcp_tracking_mode(struct wavetcp *ca,
+						  u32 ack_train_disp,
+						  u64 delta_rtt)
+{
+	u64 timer_us;
+
+	if (!ack_train_disp) {
+		DBG("%u sport: %u [%s] ack_train_disp is 0. Impossible to do tracking.\n",
+		    tcp_time_stamp, ca->sport, __func__);
+		return;
+	}
+
+	timer_us = ack_train_disp + (delta_rtt / 2);
+	ca->tx_timer = timer_us;
+
+	if (!ca->tx_timer) {
+		DBG("%u sport: %u [%s] WARNING: tx timer is 0"
+		    ", forcefully set it to 1000 us\n",
+		    tcp_time_stamp, ca->sport, __func__);
+		ca->tx_timer = 1000;
+	}
+
+	DBG("%u sport: %u [%s] tx timer is %u us\n",
+	    tcp_time_stamp, ca->sport, __func__,
+	    ca->tx_timer);
+}
+
+/* The weight a is:
+ *
+ * a = (first_rtt - min_rtt) / first_rtt
+ *
+ * returned scaled by AVG_UNIT.
+ *
+ * Returns 0 when first_rtt is 0 or when min_rtt is not below first_rtt: the
+ * unsigned subtraction would otherwise wrap around and yield a huge weight.
+ * div_u64() is used because a plain 64-bit division does not link on 32-bit
+ * kernels.
+ */
+static __always_inline u64 wavetcp_compute_weight(u32 first_rtt,
+						  u32 min_rtt)
+{
+	u64 diff;
+
+	if (first_rtt == 0 || min_rtt >= first_rtt)
+		return 0;
+
+	diff = first_rtt - min_rtt;
+	diff = diff * AVG_UNIT;
+
+	return div_u64(diff, first_rtt);
+}
+
+/* Estimate ack_train_disp when no measured value is available.
+ *
+ * The heuristic takes the interval sampled with the first ACK of the round
+ * (ca->backup_first_ack_time) and the interval of this ACK
+ * (rs->interval_us), and uses their absolute difference as ack_train_disp.
+ *
+ * If the two samples are the same (e.g., one ACK per burst) we fall back to
+ * rs->interval_us itself, lowered exponentially each time we end up here
+ * without a valid sample (divided by 1, by 2, by 4, and so on). The
+ * increased transmission rate, if it is beyond the capacity of the
+ * bottleneck, will be compensated by a higher delta_rtt, and so limited by
+ * the adjustment algorithm. This is a blind search, but we do not have any
+ * valid sample...
+ *
+ * @ca:    Wave private state; previous_ack_train_disp is expected to be 0.
+ * @rs:    rate sample of the ACK that closed the burst.
+ * @burst: not used by this function.
+ *
+ * Returns the estimate in us, or 0 when rs->interval_us is not positive.
+ */
+static u32 heuristic_ack_train_disp(struct wavetcp *ca, const struct rate_sample *rs,
+				    u32 burst)
+{
+	u32 ack_train_disp;
+	u32 backup_interval;
+
+	/* Broken caller contract: worth a warning, not a panic */
+	WARN_ON_ONCE(ca->previous_ack_train_disp != 0);
+
+	if (rs->interval_us <= 0) {
+		DBG("%u sport: %u [%s] WARNING is not possible "
+		    "to heuristically calculate ack_train_disp, returning 0."
+		    "Delivered %u, interval_us %li\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    rs->delivered, rs->interval_us);
+		return 0;
+	}
+
+	if (rs->interval_us >= ca->backup_first_ack_time) {
+		/* first heuristic */
+		backup_interval = rs->interval_us - ca->backup_first_ack_time;
+	} else {
+		/* this branch avoids an overflow. However, reaching this
+		 * point means that the ACK train is not aligned with the
+		 * sent burst.
+		 */
+		backup_interval = ca->backup_first_ack_time - rs->interval_us;
+	}
+
+	if (backup_interval != 0) {
+		DBG("%u sport: %u [%s] we got the first ack with"
+		    " interval %u us, the last (this) with interval %li us."
+		    " Doing a substraction and setting ack_train_disp"
+		    " to %u us\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    ca->backup_first_ack_time, rs->interval_us,
+		    backup_interval);
+		return backup_interval;
+	}
+
+	/* Blind search */
+	ack_train_disp = rs->interval_us >> ca->heuristic_scale;
+
+	/* Shifting a long by its width or more is undefined: stop growing
+	 * the scale before that point (the result is 0 long before anyway).
+	 */
+	if (ca->heuristic_scale < BITS_PER_LONG - 1)
+		++ca->heuristic_scale;
+
+	DBG("%u sport: %u [%s] we received one BIG ack."
+	    " Doing an heuristic with scale %u, interval_us"
+	    " %li us, and setting ack_train_disp to %u us\n",
+	    tcp_time_stamp, ca->sport, __func__,
+	    ca->heuristic_scale, rs->interval_us, ack_train_disp);
+
+	return ack_train_disp;
+}
+
+/* Compute the ACK train dispersion (us) of the round that just ended.
+ *
+ * The measured value is the time elapsed since the first ACK of the round.
+ * With no measured value and no previous value, the heuristic is used.
+ * Otherwise:
+ *  - no previous value: the measurement becomes the previous value;
+ *  - measurement above the previous value: it is filtered as
+ *      (1 - alpha) * previous + alpha * measured,
+ *    with alpha = delta_rtt / beta (AVG_UNIT fixed point);
+ *  - measurement equal to 0: the previous value is used;
+ *  - anything else: the measurement replaces the previous value.
+ *
+ * @ca:        Wave private state.
+ * @rs:        rate sample, handed to the heuristic.
+ * @burst:     size of the terminated burst, handed to the heuristic.
+ * @delta_rtt: avg_rtt - min_rtt, in us.
+ */
+static u32 calculate_ack_train_disp(struct wavetcp *ca,
+				    const struct rate_sample *rs,
+				    u32 burst, u64 delta_rtt)
+{
+	u32 ack_train_disp = jiffies_to_usecs(tcp_time_stamp - ca->first_ack_time);
+
+	if (ca->previous_ack_train_disp == 0 && ack_train_disp == 0) {
+		/* We received a cumulative ACK just after we sent the data, so
+		 * the dispersion would be close to zero, OR the connection
+		 * is so fast that tcp_time_stamp is not good enough to measure
+		 * time. Moreover, we don't have any valid sample from the past;
+		 * in this case, we use an heuristic to calculate
+		 * ack_train_disp.
+		 */
+		return heuristic_ack_train_disp(ca, rs, burst);
+	}
+
+	DBG("%u sport: %u [%s] using measured ack_train_disp %u",
+	    tcp_time_stamp, ca->sport, __func__, ack_train_disp);
+
+	/* resetting the heuristic scale because we have a real sample */
+	ca->heuristic_scale = 0;
+
+	if (ca->previous_ack_train_disp == 0) {
+		/* initialize the value */
+		ca->previous_ack_train_disp = ack_train_disp;
+	} else if (ack_train_disp > ca->previous_ack_train_disp) {
+		/* filter the measured value */
+		u32 beta_us = beta_ms * 1000;
+		u64 alpha = 0;
+		u64 left;
+		u64 right;
+
+		/* beta_ms is a writable module parameter and may be 0;
+		 * div_u64() keeps the 64-bit division usable on 32-bit kernels.
+		 */
+		if (beta_us != 0)
+			alpha = div_u64(delta_rtt * AVG_UNIT, beta_us);
+
+		/* keep (AVG_UNIT - alpha) from wrapping around */
+		alpha = min_t(u64, alpha, AVG_UNIT);
+
+		left = ((AVG_UNIT - alpha) * ca->previous_ack_train_disp) >> AVG_SCALE;
+		right = (alpha * ack_train_disp) >> AVG_SCALE;
+		DBG("%u sport: %u [%s] AVG_UNIT %i delta_rtt %llu beta %i alpha %llu "
+		    "rcv_ack_train_disp %u prv_ack_train_disp %u left %llu right %llu\n",
+		    tcp_time_stamp, ca->sport, __func__, AVG_UNIT, delta_rtt,
+		    beta_ms, alpha, ack_train_disp, ca->previous_ack_train_disp,
+		    left, right);
+
+		ack_train_disp = (u32)left + (u32)right;
+
+		DBG("%u sport: %u [%s] filtered_ack_train_disp %u (u32)left %u (u32)right %u\n",
+		    tcp_time_stamp, ca->sport, __func__, ack_train_disp,
+		    (u32)left, (u32)right);
+
+	} else if (ack_train_disp == 0) {
+		/* Use the plain previous value */
+		ack_train_disp = ca->previous_ack_train_disp;
+	} else {
+		/* In all other cases, update the previous value */
+		ca->previous_ack_train_disp = ack_train_disp;
+	}
+
+	DBG("%u sport: %u [%s] previous_ack_train_disp %u us, final ack_train_disp %u us\n",
+	    tcp_time_stamp, ca->sport, __func__,
+	    ca->previous_ack_train_disp, ack_train_disp);
+
+	return ack_train_disp;
+}
+
+/* Update the average RTT with the samples of the terminated round and
+ * return delta_rtt = avg_rtt - min_rtt (us).
+ *
+ * If the round produced no RTT sample, first_rtt is replaced by the current
+ * average.
+ *
+ * The update is
+ *
+ *   AvgRtt = a * AvgRtt + (1 - a) * Rtt
+ *
+ * with Rtt = first_rtt and a = (first_rtt - min_rtt) / first_rtt, computed
+ * by wavetcp_compute_weight and scaled by AVG_UNIT (2^AVG_SCALE). The two
+ * products are scaled back separately and then summed, to avoid a
+ * (possible) overflow.
+ *
+ * Why is the very first average just min_rtt? With no previous average:
+ *
+ *   avg_rtt_0 = (1 - a) * first_rtt
+ *             = (1 - (1 - (min_rtt / first_rtt))) * first_rtt
+ *             = min_rtt
+ *
+ * Returns 0 when the average ends up below the minimum (e.g. min_rtt has
+ * been reset and no sample arrived since): that is a missed measurement,
+ * not a reason to stop the machine.
+ */
+static u64 calculate_delta_rtt(struct wavetcp *ca)
+{
+	if (ca->first_rtt == 0) {
+		ca->first_rtt = ca->avg_rtt;
+		DBG("%u sport: %u [%s] It was impossible to get any rtt "
+		    "in the train. Using the average value %u\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    ca->first_rtt);
+	}
+
+	if (ca->avg_rtt == 0) {
+		ca->avg_rtt = ca->min_rtt;
+	} else if (ca->first_rtt > 0) {
+		u64 a;
+		u64 left;
+		u64 right;
+
+		a = wavetcp_compute_weight(ca->first_rtt, ca->min_rtt);
+
+		DBG("%u sport: %u [%s] init. avg %u us, first %u us, "
+		    "min %u us, a (shifted) %llu",
+		    tcp_time_stamp, ca->sport, __func__,
+		    ca->avg_rtt, ca->first_rtt, ca->min_rtt, a);
+
+		left = (a * ca->avg_rtt) >> AVG_SCALE;
+		right = ((AVG_UNIT - a) * ca->first_rtt) >> AVG_SCALE;
+
+		ca->avg_rtt = (u32)left + (u32)right;
+	} else {
+		DBG("%u sport: %u [%s] first_rtt is 0. It is impossible "
+		    "to calculate the average RTT. Using the old value.\n",
+		    tcp_time_stamp, ca->sport, __func__);
+	}
+
+	DBG("%u sport: %u [%s] final avg %u\n",
+	    tcp_time_stamp, ca->sport, __func__, ca->avg_rtt);
+
+	/* We clearly missed a measurement if this happens */
+	if (ca->avg_rtt < ca->min_rtt) {
+		DBG("%u sport: %u [%s] WARNING: avg %u below min %u, "
+		    "using a delta of 0\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    ca->avg_rtt, ca->min_rtt);
+		return 0;
+	}
+
+	return ca->avg_rtt - ca->min_rtt;
+}
+
+/* A whole burst has been acknowledged: refresh the RTT statistics and,
+ * unless the stability factor asks to wait, update the transmission timer
+ * in adjustment mode (delta_rtt above beta) or in tracking mode.
+ *
+ * @sk:    the socket.
+ * @rs:    rate sample of the ACK that closed the burst.
+ * @burst: size (segments) of the burst that terminated.
+ */
+static void wavetcp_round_terminated(struct sock *sk, const struct rate_sample *rs,
+				     u32 burst)
+{
+	u64 delta_rtt;
+	struct wavetcp *ca = inet_csk_ca(sk);
+
+	DBG("%u sport: %u [%s] reached the burst size %u\n",
+	    tcp_time_stamp, ca->sport, __func__, burst);
+
+	/* Both values are 32-bit timestamps, so they must be compared with
+	 * 32-bit wrap-around arithmetic; widening them to unsigned long gives
+	 * a false positive on 64-bit when the 32-bit counter wraps. 0 means
+	 * "not set". An inconsistent timestamp only invalidates this round.
+	 */
+	if (ca->first_ack_time != 0 &&
+	    after(ca->first_ack_time, tcp_time_stamp)) {
+		DBG("%u sport: %u [%s] WARNING: first ack time %u is in the "
+		    "future, skipping the round\n",
+		    tcp_time_stamp, ca->sport, __func__, ca->first_ack_time);
+		return;
+	}
+
+	delta_rtt = calculate_delta_rtt(ca);
+	DBG("%u sport: %u [%s] delta rtt %llu us\n",
+	    tcp_time_stamp, ca->sport, __func__, delta_rtt);
+
+	/* If we have to wait, let's wait */
+	if (ca->stab_factor > 0) {
+		--ca->stab_factor;
+		DBG("%u sport: %u [%s] avoiding update for stability reasons\n",
+		    tcp_time_stamp, ca->sport, __func__);
+		return;
+	}
+
+	DBG("%u sport: %u [%s] drtt %llu\n",
+	    tcp_time_stamp, ca->sport, __func__, delta_rtt);
+
+	/* delta_rtt is in us, beta_ms in ms */
+	if (delta_rtt > beta_ms * 1000)
+		wavetcp_adj_mode(ca, delta_rtt);
+	else
+		wavetcp_tracking_mode(ca, calculate_ack_train_disp(ca, rs,
+								   burst,
+								   delta_rtt),
+				      delta_rtt);
+}
+
+/* The ->cong_control hook: train management.
+ *
+ * Accumulates the segments acked in this round and, for every recorded burst
+ * that is now fully acknowledged, closes the round
+ * (wavetcp_round_terminated) and drops the burst from the history. The
+ * per-round variables are reset before returning, except on the early
+ * returns taken while the oldest burst is still incomplete.
+ */
+static void wavetcp_cong_control(struct sock *sk, const struct rate_sample *rs)
+{
+	struct wavetcp *ca = inet_csk_ca(sk);
+	struct wavetcp_burst_hist *tmp;
+
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return;
+
+	if (ca->backup_first_ack_time == 0 && rs->interval_us > 0)
+		ca->backup_first_ack_time = rs->interval_us;
+
+	/* Oldest recorded burst, or NULL when the history is empty. The list
+	 * head itself is never treated as a burst.
+	 */
+	tmp = list_first_entry_or_null(&ca->history->list,
+				       struct wavetcp_burst_hist, list);
+
+	if (!tmp || tmp->size == 0) {
+		/* No burst in memory. Most likely we sent some segments out of
+		 * the allowed window (e.g., loss probe)
+		 */
+		DBG("%u sport: %u [%s] WARNING! empty burst\n",
+		    tcp_time_stamp, ca->sport, __func__);
+		wavetcp_print_history(ca);
+		goto reset;
+	}
+
+	DBG("%u sport: %u [%s] prior_delivered %u, delivered %i, interval_us %li, "
+	    "rtt_us %li, losses %i, ack_sack %u, prior_in_flight %u, is_app %i,"
+	    " is_retrans %i\n", tcp_time_stamp, ca->sport, __func__,
+	    rs->prior_delivered, rs->delivered, rs->interval_us, rs->rtt_us,
+	    rs->losses, rs->acked_sacked, rs->prior_in_flight,
+	    rs->is_app_limited, rs->is_retrans);
+
+	/* Train management.*/
+	ca->pkts_acked += rs->acked_sacked;
+
+	if (ca->pkts_acked < tmp->size)
+		return;
+
+	/* Stop as soon as the history runs dry: going on would unlink the
+	 * list head and hand it to kmem_cache_free().
+	 */
+	while (tmp && ca->pkts_acked >= tmp->size) {
+		/* Usually the burst end is also reflected in the rs->delivered
+		 * variable. If this is not the case, and such variable is
+		 * behind just for 1 segment, then do this experimental thing
+		 * to realign the burst with the rs->delivered variable.
+		 * In the majority of cases, we went out of alignment because
+		 * of a tail loss probe.
+		 */
+		if (rs->delivered + 1 == tmp->size) {
+			DBG("%u sport: %u [%s] highly experimental:"
+			    " ignore 1 pkt. pkts_acked %u, delivered %u,"
+			    " burst %u\n", tcp_time_stamp, ca->sport, __func__,
+			    ca->pkts_acked, rs->delivered, tmp->size);
+			ca->pkts_acked--;
+			return;
+		}
+		wavetcp_round_terminated(sk, rs, tmp->size);
+
+		ca->pkts_acked -= tmp->size;
+
+		/* Delete the burst from the history */
+		list_del(&tmp->list);
+		kmem_cache_free(ca->cache, tmp);
+
+		/* Take next burst */
+		tmp = list_first_entry_or_null(&ca->history->list,
+					       struct wavetcp_burst_hist,
+					       list);
+
+		/* If we cycle, inside wavetcp_round_terminated we will take the
+		 * Linux path instead of the wave path.. first_rtt will not be
+		 * read, so don't waste a cycle to set it
+		 */
+		ca->first_ack_time = tcp_time_stamp;
+		ca->backup_first_ack_time = 0;
+	}
+
+reset:
+	/* Reset the variables needed for the beginning of the next round*/
+	ca->first_ack_time = 0;
+	ca->backup_first_ack_time = 0;
+	ca->first_rtt = 0;
+	DBG("%u sport: %u [%s] resetting RTT values for next round\n",
+	    tcp_time_stamp, ca->sport, __func__);
+}
+
+/* RTT part of ACCE: stamp the first ACK of the round and, when the ACK
+ * carries a valid (positive) RTT sample, record it as the first RTT of the
+ * round if none is set and fold it into the minimum and maximum RTT.
+ *
+ * @ca:         Wave private state.
+ * @rtt_us:     RTT sample in us; values <= 0 are ignored.
+ * @pkts_acked: not used by this function.
+ */
+static void wavetcp_acce(struct wavetcp *ca, s32 rtt_us, u32 pkts_acked)
+{
+	if (!ca->first_ack_time) {
+		ca->first_ack_time = tcp_time_stamp;
+		DBG("%u sport: %u [%s] first ack of the train\n",
+		    tcp_time_stamp, ca->sport, __func__);
+	}
+
+	/* Nothing else to learn from an ACK without a usable sample */
+	if (rtt_us <= 0)
+		return;
+
+	if (!ca->first_rtt) {
+		ca->first_rtt = rtt_us;
+
+		DBG("%u sport: %u [%s] first measurement rtt %i\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    ca->first_rtt);
+	}
+
+	/* Track the extremes we have seen */
+	if (ca->min_rtt > rtt_us) {
+		ca->min_rtt = rtt_us;
+		DBG("%u sport: %u [%s] min rtt %u\n", tcp_time_stamp,
+		    ca->sport, __func__, rtt_us);
+	}
+
+	if (ca->max_rtt < rtt_us)
+		ca->max_rtt = rtt_us;
+}
+
+/* The ->pkts_acked hook, run for the ACKs we receive. The SYN-ACK reaches
+ * this function too, but it is ignored thanks to the FLAG_INIT check.
+ *
+ * After the RTT bookkeeping, the cwnd is closed down to the number of
+ * segments in flight: nothing may leave before the timer expires, and
+ * without this we would end up with cwnd - in_flight > 0.
+ */
+static void wavetcp_acked(struct sock *sk, const struct ack_sample *sample)
+{
+	struct wavetcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 acked = sample->pkts_acked;
+
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return;
+
+	DBG("%u sport: %u [%s] pkts_acked %u, rtt_us %i, in_flight %u "
+	    ", cwnd %u, seq ack %u\n",
+	    tcp_time_stamp, ca->sport, __func__, acked,
+	    sample->rtt_us, sample->in_flight, tp->snd_cwnd, tp->snd_una);
+
+	/* ACCE has two parts: the RTT one, done here, and the train
+	 * management. pkts_acked may be 0 while an RTT value is present
+	 * (SACK lets the TCP identify which segment was ACKed), so
+	 * wavetcp_acce is entered in any case.
+	 */
+	wavetcp_acce(ca, sample->rtt_us, acked);
+
+	if (acked > tp->snd_cwnd) {
+		/* We sent some scattered segments, so the burst segments and
+		 * the ACK we get is not aligned.
+		 */
+		DBG("%u sport: %u [%s] delta_seg %i\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    ca->delta_segments);
+
+		ca->delta_segments += acked - tp->snd_cwnd;
+	}
+
+	DBG("%u sport: %u [%s] snd_cwnd %u pkts_acked %u delta %i\n",
+	    tcp_time_stamp, ca->sport, __func__, tp->snd_cwnd,
+	    acked, ca->delta_segments);
+
+	/* Brutally set the cwnd in order to not let segment out */
+	tp->snd_cwnd = tcp_packets_in_flight(tp);
+
+	DBG("%u sport: %u [%s] new window %u in_flight %u delta %i\n",
+	    tcp_time_stamp, ca->sport, __func__, tp->snd_cwnd,
+	    tcp_packets_in_flight(tp), ca->delta_segments);
+}
+
+/* The TCP informs us that the timer is expired (or has never been set). We
+ * can infer the latter by the FLAG_START flag: if it is not set, don't
+ * increase the cwnd, because it is at its default value (init_burst) and we
+ * still have to transmit the first burst.
+ *
+ * Otherwise the cwnd is opened by one burst, reduced by what was sent in
+ * excess in the previous round and never smaller than min_burst.
+ */
+static void wavetcp_timer_expired(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct wavetcp *ca = inet_csk_ca(sk);
+	u32 current_burst = ca->burst;
+
+	/* Not (or no longer) initialised: nothing to do, and no reason to
+	 * stop the machine.
+	 */
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return;
+
+	if (!test_flag(FLAG_START, &ca->flags)) {
+		DBG("%u sport: %u [%s] returning because of !FLAG_START, leaving cwnd %u\n",
+		    tcp_time_stamp, ca->sport, __func__, tp->snd_cwnd);
+		return;
+	}
+
+	DBG("%u sport: %u [%s] starting with delta %u current_burst %u\n",
+	    tcp_time_stamp, ca->sport, __func__, ca->delta_segments,
+	    current_burst);
+
+	if (ca->delta_segments < 0) {
+		/* In the previous round, we sent more than the allowed burst,
+		 * so reduce the current burst. The reduction saturates at 0:
+		 * a plain unsigned addition would wrap around to a huge
+		 * burst when the excess is larger than the burst itself.
+		 */
+		u32 excess = -(int)ca->delta_segments;
+
+		if (excess >= current_burst)
+			current_burst = 0;
+		else
+			current_burst -= excess;
+
+		/* Right now, we should send "current_burst" segments out */
+
+		if (tcp_packets_in_flight(tp) > tp->snd_cwnd) {
+			/* For some reasons (e.g., tcp loss probe)
+			 * we sent something outside the allowed window.
+			 * Add the amount of segments into the burst, in order
+			 * to effectively send the previous "current_burst"
+			 * segments, but without touching delta_segments.
+			 */
+			u32 diff = tcp_packets_in_flight(tp) - tp->snd_cwnd;
+
+			current_burst += diff;
+			DBG("%u sport: %u [%s] adding %u to balance "
+			    "segments sent out of window", tcp_time_stamp,
+			    ca->sport, __func__, diff);
+		}
+	}
+
+	ca->delta_segments = current_burst;
+	DBG("%u sport: %u [%s] setting delta_seg %u current burst %u\n",
+	    tcp_time_stamp, ca->sport, __func__,
+	    ca->delta_segments, current_burst);
+
+	if (current_burst < min_burst) {
+		DBG("%u sport: %u [%s] WARNING !! not min_burst",
+		    tcp_time_stamp, ca->sport, __func__);
+		ca->delta_segments += min_burst - current_burst;
+		current_burst = min_burst;
+	}
+
+	tp->snd_cwnd += current_burst;
+	/* The next sent size is the burst to record in the history */
+	set_flag(FLAG_SAVE, &ca->flags);
+
+	/* Report what was really added to the window, not the nominal burst */
+	DBG("%u sport: %u [%s], increased window of %u segments, "
+	    "total %u, delta %i, in_flight %u\n",
+	    tcp_time_stamp, ca->sport, __func__, current_burst,
+	    tp->snd_cwnd, ca->delta_segments, tcp_packets_in_flight(tp));
+
+	if (tp->snd_cwnd - tcp_packets_in_flight(tp) > current_burst) {
+		DBG("%u sport: %u [%s] WARNING! "
+		    " cwnd %u, in_flight %u, current burst %u\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    tp->snd_cwnd, tcp_packets_in_flight(tp),
+		    current_burst);
+	}
+}
+
+/* The TCP is asking for a timer value in jiffies. This will be subject to
+ * change for a realtime timer in the future.
+ *
+ * Returns the current transmission timer, capped at the initial timer
+ * (init_timer_ms). When the socket is not initialised the initial timer is
+ * returned, rather than stopping the machine.
+ */
+static unsigned long wavetcp_get_timer(struct sock *sk)
+{
+	struct wavetcp *ca = inet_csk_ca(sk);
+	u32 timer;
+
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return usecs_to_jiffies(init_timer_ms * USEC_PER_MSEC);
+
+	timer = min_t(unsigned long, ca->tx_timer, init_timer_ms * USEC_PER_MSEC);
+
+	DBG("%u sport: %u [%s] returning timer of %u us\n",
+	    tcp_time_stamp, ca->sport, __func__, timer);
+
+	return usecs_to_jiffies(timer);
+}
+
+/* The TCP tells us that @sent segments have just been sent.
+ *
+ * If a burst is waiting to be recorded (FLAG_SAVE) and something was sent,
+ * its size goes into the history. delta_segments is decreased by what was
+ * sent and, when less than a full burst went out, the cwnd is shrunk by the
+ * unused part.
+ *
+ * Like every other hook, this one does nothing until init has succeeded for
+ * the socket: before that, the history and the cache do not exist.
+ */
+static void wavetcp_segment_sent(struct sock *sk, u32 sent)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct wavetcp *ca = inet_csk_ca(sk);
+
+	if (!test_flag(FLAG_INIT, &ca->flags))
+		return;
+
+	if (test_flag(FLAG_SAVE, &ca->flags) && sent > 0) {
+		wavetcp_insert_burst(ca, sent);
+		clear_flag(FLAG_SAVE, &ca->flags);
+	} else {
+		DBG("%u sport: %u [%s] not saving burst, sent %u\n",
+		    tcp_time_stamp, ca->sport, __func__, sent);
+	}
+
+	if (sent > ca->burst) {
+		DBG("%u sport: %u [%s] WARNING! sent %u, burst %u"
+		    " cwnd %u delta_seg %i\n, TSO very probable",
+		    tcp_time_stamp, ca->sport, __func__, sent,
+		    ca->burst, tp->snd_cwnd, ca->delta_segments);
+	}
+
+	ca->delta_segments -= sent;
+
+	if (ca->delta_segments >= 0 &&
+	    ca->burst > sent &&
+	    tcp_packets_in_flight(tp) <= tp->snd_cwnd) {
+		/* Reduce the cwnd accordingly, because we didn't send enough
+		 * to cover it (we are app limited probably)
+		 */
+		u32 diff = ca->burst - sent;
+
+		if (tp->snd_cwnd >= diff)
+			tp->snd_cwnd -= diff;
+		else
+			tp->snd_cwnd = 0;
+		DBG("%u sport: %u [%s] reducing cwnd by %u, value %u\n",
+		    tcp_time_stamp, ca->sport, __func__,
+		    ca->burst - sent, tp->snd_cwnd);
+	}
+}
+
+/* The no_data_to_transmit hook: only traces the call */
+static void wavetcp_no_data(struct sock *sk)
+{
+	DBG("%u [%s]\n", tcp_time_stamp, __func__);
+}
+
+/* The ->sndbuf_expand hook: always answers with a fixed factor of 10 */
+static u32 wavetcp_sndbuf_expand(struct sock *sk)
+{
+	const u32 factor = 10;
+
+	return factor;
+}
+
+/* The congestion-control operations registered under the name "wave" */
+static struct tcp_congestion_ops wave_cong_tcp __read_mostly = {
+	.name = "wave",
+	.owner = THIS_MODULE,
+
+	/* socket life cycle */
+	.init = wavetcp_init,
+	.release = wavetcp_release,
+
+	/* window-related hooks */
+	.ssthresh = wavetcp_recalc_ssthresh,
+	.undo_cwnd = wavetcp_undo_cwnd,
+	.sndbuf_expand = wavetcp_sndbuf_expand,
+	.cong_control = wavetcp_cong_control,
+
+	/* notifications */
+	.set_state = wavetcp_state,
+	.cwnd_event = wavetcp_cwnd_event,
+	.pkts_acked = wavetcp_acked,
+
+	/* transmission-timer hooks */
+	.get_send_timer_exp_time = wavetcp_get_timer,
+	.send_timer_expired = wavetcp_timer_expired,
+	.no_data_to_transmit = wavetcp_no_data,
+	.segment_sent = wavetcp_segment_sent,
+};
+
+/* Module entry point: register the "wave" congestion control. The build
+ * fails if struct wavetcp does not fit in ICSK_CA_PRIV_SIZE bytes.
+ */
+static int __init wavetcp_register(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct wavetcp) > ICSK_CA_PRIV_SIZE);
+
+	err = tcp_register_congestion_control(&wave_cong_tcp);
+
+	return err;
+}
+
+/* Module exit point: unregister the "wave" congestion control */
+static void __exit wavetcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&wave_cong_tcp);
+}
+
+/* Module metadata */
+MODULE_AUTHOR("Natale Patriciello");
+MODULE_DESCRIPTION("WAVE TCP");
+MODULE_VERSION("0.1");
+MODULE_LICENSE("GPL");
+
+/* Load / unload entry points */
+module_init(wavetcp_register);
+module_exit(wavetcp_unregister);
--
2.13.2
Powered by blists - more mailing lists