Date:	Sun, 17 May 2015 22:31:10 +0200
From:	Kenneth Klette Jonassen <kennetkl@....uio.no>
To:	netdev@...r.kernel.org
Cc:	Kenneth Klette Jonassen <kennetkl@....uio.no>,
	David Hayes <davihay@....uio.no>,
	Yuchung Cheng <ycheng@...gle.com>,
	Andreas Petlund <apetlund@...ula.no>
Subject: [RFC PATCH net-next] tcp: add CDG congestion control

This is a request for comments.

CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
the TCP sender in order to [1]:

  o Use the delay gradient as a congestion signal.
  o Back off with an average probability that is independent of the RTT.
  o Compete with flows that use loss-based congestion control.
  o Tolerate packet loss unrelated to congestion.
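
For reference, the backoff in [1] is probabilistic: once per RTT, a
positive delay gradient g triggers a backoff with probability

    P(backoff) = 1 - e^(-g/G)

where G is the paper's scaling parameter. Since g is measured in
microseconds here, the patch expresses G through backoff_factor = 1000/G:
the module draws a uniform 32-bit value and backs off when it exceeds
nexp_u32(g * backoff_factor).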

Its FreeBSD implementation was presented to the ICCRG at IETF 84 in July
2012; slides are available at: http://www.ietf.org/proceedings/84/iccrg.html

Our Linux implementation tries to honor the original design without major
changes. However, we encourage others to experiment with enhancements, e.g.
switching to the Hybrid Slow Start (HyStart) algorithm. We decided to
disable the loss tolerance heuristic by default due to concerns about its
safety outside closed environments.
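
For experimentation in such environments, the heuristics can be toggled
via module parameters; use_shadow and use_tolerance are also writable at
runtime, e.g.:

  modprobe tcp_cdg use_tolerance=1
  echo cdg > /proc/sys/net/ipv4/tcp_congestion_control
  echo 0 > /sys/module/tcp_cdg/parameters/use_shadow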

Running the experiment scenarios in [1] suggests that our implementation
achieves higher goodput than FreeBSD 10.0 senders, although it also
induces more queueing for a given backoff factor. We do not yet fully
understand which variables account for these differences.

There is a potential misbehavior during slow start: if we back off due to
delay gradient indications, we might overshoot more than New Reno would
have, because PRR paces sending toward the gradient backoff's target. A
delay gradient backoff only reduces cwnd by a factor of 0.7, and subsequent
losses (which would have reduced it by a factor of 0.5) are ignored until
we exit recovery.
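
As a concrete illustration: with cwnd = 100 at the time of a gradient
backoff, ssthresh becomes 70 (backoff_beta = 0.7) and PRR paces sending
toward that target; New Reno would have reduced toward 50 on a subsequent
loss, but CDG ignores that additional congestion signal until recovery
exits.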

[1] D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
    delay gradients." In Networking 2011, pages 328-341. Springer, 2011.

Cc: David Hayes <davihay@....uio.no>
Cc: Yuchung Cheng <ycheng@...gle.com>
Cc: Andreas Petlund <apetlund@...ula.no>
Signed-off-by: Kenneth Klette Jonassen <kennetkl@....uio.no>

---

[1] http://caia.swin.edu.au/cv/dahayes/content/networking2011-cdg-preprint.pdf
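
As a quick user-space sketch of the backoff probability (hypothetical and
not part of the patch; cdg_should_backoff is an illustrative name), using
drand48() as the uniform source and compiling with -lm:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* Mirror of the kernel test "prandom_u32() > nexp_u32(grad * factor)":
 * back off with probability 1 - exp(-grad * factor * 1e-6), where grad
 * is the smoothed per-RTT gradient in microseconds and factor = 1000/G.
 */
static int cdg_should_backoff(long grad_us, unsigned int factor)
{
	if (grad_us <= 0)
		return 0;
	return drand48() > exp(-(double)grad_us * factor * 1e-6);
}

int main(void)
{
	long g;
	int i, n;

	srand48(1);
	/* With the default backoff_factor = 333 (G = 3), a sustained
	 * gradient of ~2000 us per RTT backs off roughly every other RTT.
	 */
	for (g = 500; g <= 4000; g += 500) {
		for (n = 0, i = 0; i < 100000; i++)
			n += cdg_should_backoff(g, 333);
		printf("grad = %4ld us -> P(backoff) ~ %.3f (exact %.3f)\n",
		       g, n / 100000.0, 1.0 - exp(-g * 333e-6));
	}
	return 0;
}
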
---
 net/ipv4/Kconfig   |  20 +++
 net/ipv4/Makefile  |   1 +
 net/ipv4/tcp_cdg.c | 378 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 399 insertions(+)
 create mode 100644 net/ipv4/tcp_cdg.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index d83071d..d2b966c 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -615,6 +615,22 @@ config TCP_CONG_DCTCP
 	For further details see:
 	  http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
 
+config TCP_CONG_CDG
+	tristate "CAIA Delay-Gradient (CDG)"
+	default n
+	---help---
+	CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+	the TCP sender in order to:
+
+	  o Use the delay gradient as a congestion signal.
+	  o Back off with an average probability that is independent of the RTT.
+	  o Compete with flows that use loss-based congestion control.
+	  o Tolerate packet loss unrelated to congestion.
+
+	For further details see:
+	  D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+	  delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -646,6 +662,9 @@ choice
 	config DEFAULT_DCTCP
 		bool "DCTCP" if TCP_CONG_DCTCP=y
 
+	config DEFAULT_CDG
+		bool "CDG" if TCP_CONG_CDG=y
+
 	config DEFAULT_RENO
 		bool "Reno"
 endchoice
@@ -668,6 +687,7 @@ config DEFAULT_TCP_CONG
 	default "veno" if DEFAULT_VENO
 	default "reno" if DEFAULT_RENO
 	default "dctcp" if DEFAULT_DCTCP
+	default "cdg" if DEFAULT_CDG
 	default "cubic"
 
 config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b36236d..efc43f3 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
 obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
new file mode 100644
index 0000000..28abe1f
--- /dev/null
+++ b/net/ipv4/tcp_cdg.c
@@ -0,0 +1,378 @@
+/*
+ * CAIA Delay-Gradient (CDG) congestion control
+ *
+ * This implementation is based on the paper:
+ *   D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ *   delay gradients." In Networking 2011, pages 328-341. Springer, 2011.
+ *
+ * For background traffic, disable coexistence heuristics using parameters
+ * use_shadow=0 ineffective_thresh=0.
+ *
+ * Notable differences from paper and FreeBSD patch:
+ *   o Add toggle for shadow window mechanism. Suggested by David Hayes.
+ *   o Add toggle for non-congestion loss tolerance.
+ *   o Scaling parameter G is changed to a backoff factor;
+ *     conversion is given by: backoff_factor = 1000/G.
+ *   o Clear shadow window on ambiguity between full and empty queue.
+ *   o Limit shadow window when application limited.
+ *   o Re-initialize shadow window on cwnd restart.
+ *   o Infer full queue on ECN signal.
+ *   o More accurate e^-x.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+static int window __read_mostly = 8;
+static bool use_shadow __read_mostly = true;
+static bool use_tolerance __read_mostly;
+static uint backoff_beta __read_mostly = 0.70 * 1024;
+static uint backoff_factor __read_mostly = 333;
+static uint ineffective_thresh __read_mostly = 5;
+static uint ineffective_hold __read_mostly = 5;
+
+module_param(window, int, 0444);
+MODULE_PARM_DESC(window, "moving average window width (power of two)");
+module_param(use_shadow, bool, 0644);
+MODULE_PARM_DESC(use_shadow, "use shadow window heuristic");
+module_param(use_tolerance, bool, 0644);
+MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic");
+module_param(backoff_beta, uint, 0444);
+MODULE_PARM_DESC(backoff_beta, "backoff beta (1-1024)");
+module_param(backoff_factor, uint, 0444);
+MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor (1-65535)");
+module_param(ineffective_thresh, uint, 0644);
+MODULE_PARM_DESC(ineffective_thresh, "ineffective backoff threshold");
+module_param(ineffective_hold, uint, 0644);
+MODULE_PARM_DESC(ineffective_hold, "ineffective backoff hold time");
+
+struct minmax {
+	union {
+		struct {
+			s32 min;
+			s32 max;
+		};
+		u64 v64;
+	};
+};
+
+enum cdg_state {
+	CDG_UNKNOWN = 0,
+	CDG_FULL    = 0,
+	CDG_NONFULL = 1,
+};
+
+struct cdg {
+	struct minmax rtt;
+	struct minmax rtt_prev;
+	struct minmax gsum;
+	struct minmax *gradients;
+	enum cdg_state state;
+	u32 rtt_seq;
+	u32 shadow_wnd;
+	u32 loss_cwnd;
+	uint tail;
+	uint backoff_cnt;
+	uint delack;
+	bool ecn_ce;
+};
+
+/**
+ * nexp_u32 - negative base-e exponential
+ * @ux: x in units of micro
+ *
+ * Returns exp(ux * -1e-6) * U32_MAX.
+ */
+static u32 __pure nexp_u32(u32 ux)
+{
+	static const u16 v[] = {
+		/* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */
+		65535,
+		65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422,
+		61378, 57484, 50423, 38795, 22965, 8047,  987,   14,
+	};
+	u64 res;
+	u32 msb = ux >> 8;
+	int i;
+
+	/* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */
+	if (msb > U16_MAX)
+		return 0;
+
+	/* Scale first eight bits linearly: */
+	res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000);
+
+	/* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */
+	for (i = 1; msb; i++, msb >>= 1) {
+		u64 y = v[i & -(msb & 1)] + 1ULL;
+
+		res = (res * y) >> 16;
+	}
+
+	return (u32)res;
+}
+
+static s32 tcp_cdg_grad(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	s32 grad = 0;
+
+	if (ca->rtt_prev.v64) {
+		s32 gmin = ca->rtt.min - ca->rtt_prev.min;
+		s32 gmax = ca->rtt.max - ca->rtt_prev.max;
+		s32 gmin_s;
+		s32 gmax_s;
+
+		ca->gsum.min += gmin - ca->gradients[ca->tail].min;
+		ca->gsum.max += gmax - ca->gradients[ca->tail].max;
+		ca->gradients[ca->tail].min = gmin;
+		ca->gradients[ca->tail].max = gmax;
+		ca->tail = (ca->tail + 1) & (window - 1);
+
+		/* We keep sums to ignore gradients during CWR;
+		 * smoothed gradients otherwise simplify to:
+		 * (rtt_latest - rtt_oldest) / window.
+		 */
+		gmin_s = DIV_ROUND_CLOSEST(ca->gsum.min, window);
+		gmax_s = DIV_ROUND_CLOSEST(ca->gsum.max, window);
+
+		/* Only use smoothed gradients in CA: */
+		if (tp->snd_cwnd > tp->snd_ssthresh) {
+			grad = gmin_s > 0 ? gmin_s : gmax_s;
+		} else {
+			/* Prefer unsmoothed gradients in slow start. */
+			grad = gmin > 0 ? gmin : gmin_s;
+			if (grad < 0)
+				grad = gmax > 0 ? gmax : gmax_s;
+		}
+
+		if (gmin_s > 0 && gmax_s <= 0)
+			ca->state = CDG_FULL;
+		else if ((gmin_s > 0 && gmax_s > 0) || gmax_s < 0)
+			ca->state = CDG_NONFULL;
+
+		/* Empty queue: */
+		if (gmin_s >= 0 && gmax_s < 0)
+			ca->shadow_wnd = 0;
+		/* Backoff was effectual: */
+		if (gmin_s < 0 || gmax_s < 0)
+			ca->backoff_cnt = 0;
+	}
+
+	ca->rtt_prev = ca->rtt;
+	ca->rtt.v64 = 0;
+	return grad;
+}
+
+static int tcp_cdg_backoff(struct sock *sk, s32 grad)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (grad <= 0 || prandom_u32() <= nexp_u32(grad * backoff_factor))
+		return 0;
+
+	ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd);
+	ca->backoff_cnt++;
+	if (ca->backoff_cnt > ineffective_thresh && ineffective_thresh) {
+		if (ca->backoff_cnt >= (ineffective_thresh + ineffective_hold))
+			ca->backoff_cnt = 0;
+		return 0;
+	}
+
+	/* reset TLP and prohibit cwnd undo: */
+	tp->tlp_high_seq = 0;
+	tp->prior_ssthresh = 0;
+
+	/* set PRR target and enter CWR: */
+	tp->snd_ssthresh = max(2U, (tp->snd_cwnd * backoff_beta) >> 10U);
+	tp->prr_delivered = 0;
+	tp->prr_out = 0;
+	tp->prior_cwnd = tp->snd_cwnd;
+	tp->high_seq = tp->snd_nxt;
+
+	tcp_set_ca_state(sk, TCP_CA_CWR);
+	return 1;
+}
+
+/* Not called in CWR or Recovery state. */
+static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (unlikely(!ca->gradients))
+		return tcp_reno_cong_avoid(sk, ack, acked);
+
+	/* Measure filtered gradients once per RTT: */
+	if (after(ack, ca->rtt_seq) && ca->rtt.v64) {
+		s32 grad = tcp_cdg_grad(sk);
+
+		ca->rtt_seq = tp->snd_nxt;
+
+		if (tcp_cdg_backoff(sk, grad))
+			return;
+	} else if (tp->snd_cwnd > tp->snd_ssthresh) {
+		/* In CA, synchronize cwnd growth with delay gradients. */
+		return;
+	}
+
+	if (!tcp_is_cwnd_limited(sk)) {
+		ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd);
+		return;
+	}
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp, acked);
+	else if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+		tp->snd_cwnd++;
+
+	if (ca->shadow_wnd && ca->shadow_wnd < tp->snd_cwnd_clamp)
+		ca->shadow_wnd = max(tp->snd_cwnd, ca->shadow_wnd + 1);
+}
+
+static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+
+	if (rtt_us <= 0)
+		return;
+
+	/* A heuristic for filtering delayed ACKs, adapted from:
+	 * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support
+	 * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
+	 *
+	 * Assume num_acked == 0 indicates RTT measurement from SACK.
+	 */
+	if (num_acked == 1 && ca->delack) {
+		/* A delayed ACK is only used for the minimum if it is
+	 * provably lower than an existing non-zero minimum.
+		 */
+		ca->rtt.min = min(ca->rtt.min, rtt_us);
+		ca->delack--;
+		return;
+	} else if (num_acked > 1 && ca->delack < 5) {
+		ca->delack++;
+	}
+
+	ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
+	ca->rtt.max = max(ca->rtt.max, rtt_us);
+}
+
+static u32 tcp_cdg_ssthresh(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (unlikely(!ca->gradients))
+		return tcp_reno_ssthresh(sk);
+
+	ca->loss_cwnd = tp->snd_cwnd;
+
+	if (use_tolerance && ca->state == CDG_NONFULL && !ca->ecn_ce)
+		return tp->snd_cwnd;
+
+	ca->shadow_wnd >>= 1;
+	if (!use_shadow)
+		return max(2U, tp->snd_cwnd >> 1);
+	return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1);
+}
+
+static u32 tcp_cdg_undo_cwnd(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+
+	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct minmax *gradients;
+
+	switch (ev) {
+	case CA_EVENT_ECN_NO_CE:
+		ca->ecn_ce = false;
+		break;
+	case CA_EVENT_ECN_IS_CE:
+		ca->ecn_ce = true;
+		ca->state = CDG_UNKNOWN;
+		break;
+	case CA_EVENT_CWND_RESTART:
+		gradients = ca->gradients;
+		if (gradients)
+			memset(gradients, 0, window * sizeof(gradients[0]));
+		memset(ca, 0, sizeof(*ca));
+
+		ca->gradients = gradients;
+		ca->rtt_seq = tp->snd_nxt;
+		ca->shadow_wnd = tp->snd_cwnd;
+		break;
+	case CA_EVENT_COMPLETE_CWR:
+		ca->state = CDG_UNKNOWN;
+		ca->rtt_seq = tp->snd_nxt;
+		ca->rtt_prev = ca->rtt;
+		ca->rtt.v64 = 0;
+		break;
+	default:
+		break;
+	}
+}
+
+static void tcp_cdg_init(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* May fail. Not allocating from emergency pools. */
+	ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), GFP_NOWAIT);
+	ca->shadow_wnd = tp->snd_cwnd;
+	ca->rtt_seq = tp->snd_nxt;
+}
+
+static void tcp_cdg_release(struct sock *sk)
+{
+	struct cdg *ca = inet_csk_ca(sk);
+
+	kfree(ca->gradients);
+}
+
+struct tcp_congestion_ops tcp_cdg __read_mostly = {
+	.cong_avoid = tcp_cdg_cong_avoid,
+	.cwnd_event = tcp_cdg_cwnd_event,
+	.pkts_acked = tcp_cdg_acked,
+	.undo_cwnd = tcp_cdg_undo_cwnd,
+	.ssthresh = tcp_cdg_ssthresh,
+	.release = tcp_cdg_release,
+	.init = tcp_cdg_init,
+	.owner = THIS_MODULE,
+	.name = "cdg",
+};
+
+static int __init tcp_cdg_register(void)
+{
+	window = clamp(window, 1, 256);
+	backoff_beta = clamp(backoff_beta, 1U, 1024U);
+	backoff_factor = clamp(backoff_factor, 1U, 65535U);
+
+	if (!is_power_of_2(window))
+		return -EINVAL;
+
+	BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE);
+	tcp_register_congestion_control(&tcp_cdg);
+	return 0;
+}
+
+static void __exit tcp_cdg_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_cdg);
+}
+
+module_init(tcp_cdg_register);
+module_exit(tcp_cdg_unregister);
+MODULE_AUTHOR("Kenneth Klette Jonassen");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP CDG");
-- 
2.1.0