Message-ID: <CALx6S35pRq_Z32R3xWTj4PpQJ4YkC-MzxzeYb8+vH=cfTibwcQ@mail.gmail.com>
Date: Fri, 3 Jul 2015 10:10:54 -0700
From: Tom Herbert <tom@...bertland.com>
To: Lawrence Brakmo <brakmo@...com>
Cc: netdev <netdev@...r.kernel.org>, Kernel Team <kernel-team@...com>
Subject: Re: [RFC PATCH net-next] tcp: add NV congestion control
On Thu, Jul 2, 2015 at 6:21 PM, Lawrence Brakmo <brakmo@...com> wrote:
> This is a request for comments.
>
> TCP-NV (New Vegas) is a major update to TCP-Vegas. An earlier version of
> NV was presented at 2010's LPC (slides). It is a delay-based
> congestion avoidance algorithm for the data center. This version has been
> tested within a 10G rack where the HW RTTs are 20-50us.
>
> A description of TCP-NV, including implementation and experimental
> results, can be found at:
> http://www.brakmo.org/networking/tcp-nv/TCPNV.html
>
> The current version exposes many module parameters to support
> experimentation.
>
> Signed-off-by: Lawrence Brakmo <lawrence@...kmo.org>
> ---
> include/linux/skbuff.h | 2 +-
> include/linux/tcp.h | 4 +
> include/net/tcp.h | 5 +-
> net/ipv4/Kconfig | 16 ++
> net/ipv4/Makefile | 1 +
> net/ipv4/sysctl_net_ipv4.c | 9 +
> net/ipv4/tcp_input.c | 5 +
> net/ipv4/tcp_nv.c | 477 +++++++++++++++++++++++++++++++++++++++++++++
> net/ipv4/tcp_output.c | 4 +-
> 9 files changed, 520 insertions(+), 3 deletions(-)
> create mode 100644 net/ipv4/tcp_nv.c
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index d6cdd6e..96a131d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -547,7 +547,7 @@ struct sk_buff {
> * want to keep them across layers you have to do a skb_clone()
> * first. This is owned by whoever has the skb queued ATM.
> */
> - char cb[48] __aligned(8);
> + char cb[52] __aligned(8);
>
> unsigned long _skb_refdst;
> void (*destructor)(struct sk_buff *skb);
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 48c3696..05e0da5 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -254,6 +254,10 @@ struct tcp_sock {
> u32 lost_out; /* Lost packets */
> u32 sacked_out; /* SACK'd packets */
> u32 fackets_out; /* FACK'd packets */
> + u32 ack_in_flight; /* This field is populated when new acks
> + * are received. It contains the number of
> + * bytes in flight when the last packet
> + * acked was sent. Used by tcp-nv. */
>
> /* from STCP, retrans queue hinting */
> struct sk_buff* lost_skb_hint;
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 950cfec..3e385c1 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
> extern int sysctl_tcp_min_tso_segs;
> extern int sysctl_tcp_autocorking;
> extern int sysctl_tcp_invalid_ratelimit;
> +extern int sysctl_tcp_nv_enable;
>
> extern atomic_long_t tcp_memory_allocated;
> extern struct percpu_counter tcp_sockets_allocated;
> @@ -720,12 +721,14 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
> /* This is what the send packet queuing engine uses to pass
> * TCP per-packet control information to the transmission code.
> * We also store the host-order sequence numbers in here too.
> - * This is 44 bytes if IPV6 is enabled.
> + * This is 48 bytes if IPV6 is enabled.
> * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
> */
> struct tcp_skb_cb {
> __u32 seq; /* Starting sequence number */
> __u32 end_seq; /* SEQ + FIN + SYN + datalen */
> + __u32 in_flight; /* bytes in flight when this packet
> + * was sent. */
> union {
> /* Note : tcp_tw_isn is used in input path only
> * (isn chosen by tcp_timewait_state_process())
> diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
> index 6fb3c90..c21f85d 100644
> --- a/net/ipv4/Kconfig
> +++ b/net/ipv4/Kconfig
> @@ -539,6 +539,22 @@ config TCP_CONG_VEGAS
> window. TCP Vegas should provide less packet loss, but it is
> not as aggressive as TCP Reno.
>
> +config TCP_CONG_NV
> + tristate "TCP NV"
> + default m
> + ---help---
> + TCP NV is a follow-up to TCP Vegas. It has been modified to deal with
> + 10G networks and with the measurement noise introduced by LRO, GRO and
> + interrupt coalescence. In addition, it decreases its cwnd
> + multiplicatively instead of linearly.
> +
> + Note that in general congestion avoidance (cwnd decreased when # packets
> + queued grows) cannot coexist with congestion control (cwnd decreased only
> + when there is packet loss) due to fairness issues. One scenario where they
> + can coexist safely is when the CA flows have RTTs << the CC flows' RTTs.
> +
> + For further details see http://www.brakmo.org/networking/tcp-nv/TCPNV.html
> +
> config TCP_CONG_SCALABLE
> tristate "Scalable TCP"
> default n
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index efc43f3..06f335f 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
> obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
> obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
> obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
> +obj-$(CONFIG_TCP_CONG_NV) += tcp_nv.o
> obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
> obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
> obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 433231c..31846d5 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -730,6 +730,15 @@ static struct ctl_table ipv4_table[] = {
> .proc_handler = proc_dointvec_ms_jiffies,
> },
> {
> + .procname = "tcp_nv_enable",
> + .data = &sysctl_tcp_nv_enable,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = proc_dointvec_minmax,
> + .extra1 = &zero,
> + .extra2 = &one,
> + },
> + {
> .procname = "icmp_msgs_per_sec",
> .data = &sysctl_icmp_msgs_per_sec,
> .maxlen = sizeof(int),
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 684f095..2a3c413 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
> int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
> int sysctl_tcp_early_retrans __read_mostly = 3;
> int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
> +int sysctl_tcp_nv_enable __read_mostly = 1;
> +EXPORT_SYMBOL(sysctl_tcp_nv_enable);
>
> #define FLAG_DATA 0x01 /* Incoming frame contained data. */
> #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
> @@ -3063,6 +3065,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
> long ca_rtt_us = -1L;
> struct sk_buff *skb;
> u32 pkts_acked = 0;
> + u32 last_in_flight = 0;
> bool rtt_update;
> int flag = 0;
>
> @@ -3102,6 +3105,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
> if (!first_ackt.v64)
> first_ackt = last_ackt;
>
> + last_in_flight = TCP_SKB_CB(skb)->in_flight;
> reord = min(pkts_acked, reord);
> if (!after(scb->end_seq, tp->high_seq))
> flag |= FLAG_ORIG_SACK_ACKED;
> @@ -3190,6 +3194,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
> tcp_rearm_rto(sk);
> }
>
> + tp->ack_in_flight = last_in_flight;
> if (icsk->icsk_ca_ops->pkts_acked)
> icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
>
> diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
> new file mode 100644
> index 0000000..585f1dd
> --- /dev/null
> +++ b/net/ipv4/tcp_nv.c
> @@ -0,0 +1,477 @@
> +/*
> + * TCP NV: TCP with Congestion Avoidance
> + *
> + * TCP-NV is a successor of TCP-Vegas that has been developed to
> + * deal with the issues that occur in modern networks.
> + * Like TCP-Vegas, TCP-NV supports true congestion avoidance,
> + * the ability to detect congestion before packet losses occur.
> + * When congestion (queue buildup) starts to occur, TCP-NV
> + * predicts what the cwnd size should be for the current
> + * throughput and it reduces the cwnd proportionally to
> + * the difference between the current cwnd and the predicted cwnd.
> + * TCP-NV behaves like Reno when no congestion is detected, or when
> + * recovering from packet losses.
> + *
> + * More information on the design, implementation and experimental
> + * results at http://www.brakmo.org/networking/tcp-nv/TCPNV.html
> + *
> + * TODO:
> + * 1) Add mechanism to deal with reverse congestion.
> + */
> +
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/math64.h>
> +#include <net/tcp.h>
> +#include <linux/inet_diag.h>
> +
> +/* TCP NV parameters */
> +static int nv_pad __read_mostly = 8;
> +static int nv_reset_period __read_mostly = 5;
> +static int nv_min_cwnd = 10;
> +static int nv_dec_eval_min_calls = 100;
> +static int nv_ssthresh_eval_min_calls = 30;
> +static int nv_rtt_min_cnt = 2;
> +static int nv_cong_decrease_mult = 30*128/100;
> +static int nv_ssthresh_factor = 8;
> +static int nv_rtt_factor = 128;
> +static int nv_rtt_cnt_inc_delta = 32; /* dec cwnd for this many RTTs */
> +static int nv_dec_factor = 4; /* actual value is factor/8 */
> +static int nv_loss_dec_factor = 820; /* on loss reduce cwnd by 20% */
> +static int nv_cwnd_growth_factor = 2; /* larger => cwnd grows slower */
> +
> +module_param(nv_pad, int, 0644);
> +MODULE_PARM_DESC(nv_pad, "extra packets above congestion level");
> +module_param(nv_reset_period, int, 0644);
> +MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)");
> +module_param(nv_min_cwnd, int, 0644);
> +MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value"
> + " without losses");
> +module_param(nv_dec_eval_min_calls, int, 0644);
> +MODULE_PARM_DESC(nv_dec_eval_min_calls, "Wait for this many data points "
> + "before declaring congestion (< 256)");
> +module_param(nv_ssthresh_eval_min_calls, int, 0644);
> +MODULE_PARM_DESC(nv_ssthresh_eval_min_calls, "Wait for this many data points "
> + "before declaring congestion during initial slow-start");
> +module_param(nv_rtt_min_cnt, int, 0644);
> +MODULE_PARM_DESC(nv_rtt_min_cnt, "Wait for this many RTTs before declaring"
> + " congestion (<64)");
> +module_param(nv_cong_decrease_mult, int, 0644);
> +MODULE_PARM_DESC(nv_cong_decrease_mult, "Congestion decrease factor");
> +module_param(nv_ssthresh_factor, int, 0644);
> +MODULE_PARM_DESC(nv_ssthresh_factor, "ssthresh factor");
> +module_param(nv_rtt_factor, int, 0644);
> +MODULE_PARM_DESC(nv_rtt_factor, "rtt averaging factor (0-256)");
> +module_param(nv_rtt_cnt_inc_delta, int, 0644);
> +MODULE_PARM_DESC(nv_rtt_cnt_inc_delta, "decrease cwnd for this many RTTs "
> + "every 64-192 RTTs");
> +module_param(nv_dec_factor, int, 0644);
> +MODULE_PARM_DESC(nv_dec_factor, "decrease cwnd every ~192 RTTs by factor/8");
> +module_param(nv_loss_dec_factor, int, 0644);
> +MODULE_PARM_DESC(nv_loss_dec_factor, "on loss new cwnd = cwnd * this / 1024");
> +module_param(nv_cwnd_growth_factor, int, 0644);
> +MODULE_PARM_DESC(nv_cwnd_growth_factor, "larger => cwnd grows slower");
> +
A lot of module parameters... can these be sysctls?
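
For example, following the pattern this patch already uses for
tcp_nv_enable, each knob could get an entry like the below (untested
sketch, procname invented here):

	{
		.procname	= "tcp_nv_pad",
		.data		= &nv_pad,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},

either in ipv4_table[] or in a small table registered from tcp_nv.c
with register_net_sysctl(). That keeps all the knobs in one place
under /proc/sys/net/ipv4/ instead of /sys/module/tcp_nv/parameters/.
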
> +/* TCP NV Parameters */
> +struct tcpnv {
> + unsigned long nv_min_rtt_reset_jiffies; /* when to switch to
> + * nv_min_rtt_new */
> + u32 cnt; /* increase cwnd by 1 after ACKs */
> + u32 loss_cwnd; /* cwnd at last loss */
> + u8 nv_enable:1,
> + nv_allow_cwnd_growth:1, /* whether cwnd can grow */
> + nv_rtt_cnt:6; /* RTTs without making ca decision */
> + u8 nv_rtt_cnt_dec; /* RTTs since last temporary cwnd decrease */
> + u8 nv_eval_call_cnt;/* call count since last eval */
> + u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is
> + * smaller than this. It may grow to handle
> + * TSO, LRO and interrupt coalescence because
> + * with these a small cwnd cannot saturate
> + * the link. Note that this is different from
> + * the nv_min_cwnd module parameter. */
> + u32 nv_last_rtt; /* last rtt */
> + u32 nv_min_rtt; /* active min rtt. Used to determine slope */
> + u32 nv_min_rtt_new; /* min rtt for future use */
> + u32 nv_rtt_max_rate; /* max rate seen during current RTT */
> + u32 nv_rtt_start_seq; /* current RTT ends when packet arrives
> + * acking beyond nv_rtt_start_seq */
> + u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is
> + * used to determine bytes acked since last
> + * call to bictcp_acked */
> + u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */
> +};
> +
> +#define NV_INIT_RTT 0xffffffff
> +#define NV_MIN_CWND 4
> +#define NV_MIN_CWND_GROW 2
> +#define NV_TSO_CWND_BOUND 80
> +
> +static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> +
> + ca->loss_cwnd = 0;
> + ca->nv_no_cong_cnt = 0;
> + ca->cnt = 0;
> + ca->nv_rtt_cnt = 0;
> + ca->nv_rtt_cnt_dec = 0;
> + ca->nv_allow_cwnd_growth = 1;
> + ca->nv_last_rtt = 0;
> + ca->nv_rtt_max_rate = 0;
> + ca->nv_rtt_start_seq = tp->snd_una;
> + ca->nv_eval_call_cnt = 0;
> + ca->nv_last_snd_una = tp->snd_una;
> +}
> +
> +static void tcpnv_init(struct sock *sk)
> +{
> + struct tcpnv *ca = inet_csk_ca(sk);
> +
> + tcpnv_reset(ca, sk);
> +
> + ca->nv_min_rtt_reset_jiffies = jiffies + 2*HZ;
> + ca->nv_min_rtt = NV_INIT_RTT;
> + ca->nv_min_rtt_new = NV_INIT_RTT;
> + ca->nv_enable = sysctl_tcp_nv_enable;
> + ca->nv_min_cwnd = NV_MIN_CWND;
> + if (nv_dec_eval_min_calls > 255)
> + nv_dec_eval_min_calls = 255;
> + if (nv_rtt_min_cnt > 63)
> + nv_rtt_min_cnt = 63;
> +}
> +
> +static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> + struct tcpnv *ca = inet_csk_ca(sk);
> +
> + if (!tcp_is_cwnd_limited(sk))
> + return;
> +
> + /* Only grow cwnd if NV has not detected congestion */
> + if (sysctl_tcp_nv_enable && ca->nv_enable &&
> + !ca->nv_allow_cwnd_growth)
> + return;
> +
> + if (tp->snd_cwnd <= tp->snd_ssthresh) {
> + acked = tcp_slow_start(tp, acked);
> + if (!acked)
> + return;
> + }
> + if (ca->cnt == 0)
> + ca->cnt = tp->snd_cwnd;
> +
> + tcp_cong_avoid_ai(tp, ca->cnt, acked);
> +}
> +
> +static u32 tcpnv_recalc_ssthresh(struct sock *sk)
> +{
> + const struct tcp_sock *tp = tcp_sk(sk);
> + struct tcpnv *ca = inet_csk_ca(sk);
> +
> + ca->loss_cwnd = tp->snd_cwnd;
> + return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
> +}
> +
> +static u32 tcpnv_undo_cwnd(struct sock *sk)
> +{
> + struct tcpnv *ca = inet_csk_ca(sk);
> +
> + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
> +}
> +
> +static void tcpnv_state(struct sock *sk, u8 new_state)
> +{
> + struct tcpnv *ca = inet_csk_ca(sk);
> +
> + if (new_state == TCP_CA_Open) {
> + ca->nv_enable = 1;
> + tcpnv_reset(ca, sk);
> + } else if (new_state == TCP_CA_Loss) {
> + ca->nv_enable = 0;
> + }
> +}
> +
> +/* Do congestion avoidance calculations for TCP-NV
> + */
> +static void tcpnv_acked(struct sock *sk, u32 cnt, s32 rtt_us)
> +{
> + const struct inet_connection_sock *icsk = inet_csk(sk);
> + struct tcp_sock *tp = tcp_sk(sk);
> + struct tcpnv *ca = inet_csk_ca(sk);
> + unsigned long now = jiffies;
> + s64 rate64 = 0;
> + u32 rate, max_win, cwnd_by_slope;
> + u32 avg_rtt;
> + u32 bytes_acked = 0;
> +
> + /* Some calls are for duplicates without timestamps */
> + if (rtt_us < 0)
> + return;
> +
> + /* If not in TCP_CA_Open state, skip. */
> + if (icsk->icsk_ca_state != TCP_CA_Open)
> + return;
> +
> + /* If NV mode is not enabled, behave like Reno */
> + if (!sysctl_tcp_nv_enable || !ca->nv_enable) {
> + ca->nv_allow_cwnd_growth = 1;
> + return;
> + }
> +
> + bytes_acked = tp->snd_una - ca->nv_last_snd_una;
> + ca->nv_last_snd_una = tp->snd_una;
> +
> + if (tp->ack_in_flight == 0)
> + return;
> +
> + /* Calculate moving average of RTT */
> + if (nv_rtt_factor > 0) {
> + if (ca->nv_last_rtt > 0) {
> + avg_rtt = (((u64)rtt_us) * nv_rtt_factor +
> + ((u64)ca->nv_last_rtt)
> + * (256 - nv_rtt_factor)) >> 8;
> + } else {
> + avg_rtt = rtt_us;
> + ca->nv_min_rtt = avg_rtt << 1;
> + }
> + ca->nv_last_rtt = avg_rtt;
> + } else {
> + avg_rtt = rtt_us;
> + }
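
The fixed-point averaging here checks out: avg = (rtt * f + last *
(256 - f)) >> 8, so the default nv_rtt_factor of 128 is a plain mean
of the new sample and the previous average. E.g. rtt_us = 60,
nv_last_rtt = 40: (60*128 + 40*128) >> 8 = 50.
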
> +
> + /* rate in units of 100 bits per second */
> + rate64 = ((u64)tp->ack_in_flight) * 8000000;
> + rate = (u32)div64_u64(rate64, (u64)(avg_rtt*100));
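
Units sanity check on the rate: bytes * 8000000 / (usec * 100) is
indeed 100s of bits per second. E.g. 64KB in flight with avg_rtt =
50us: 65536 * 8000000 / (50 * 100) = 104857600, i.e. ~10.5 Gbps, so a
u32 in these units is good up to roughly 400 Gbps.
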
> +
> + /* Remember the maximum rate seen during this RTT
> + * Note: It may be more than one RTT. This function should be
> + * called at least nv_dec_eval_min_calls times.
> + */
> + if (ca->nv_rtt_max_rate < rate)
> + ca->nv_rtt_max_rate = rate;
> +
> + /* We have valid information, increment counter */
> + if (ca->nv_eval_call_cnt < 255)
> + ca->nv_eval_call_cnt++;
> +
> + /* update min rtt if necessary */
> + if (avg_rtt < ca->nv_min_rtt)
> + ca->nv_min_rtt = avg_rtt;
> +
> + /* update future min_rtt if necessary */
> + if (avg_rtt < ca->nv_min_rtt_new)
> + ca->nv_min_rtt_new = avg_rtt;
> +
> + /* nv_min_rtt is updated with the minimum (possibly averaged) rtt
> + * seen in the last nv_reset_period seconds (i.e. a
> + * warm reset). This new nv_min_rtt will continue to be updated
> + * and be used for another nv_reset_period seconds,
> + * when it is reset again.
> + * In practice we introduce some randomness, so the actual period used
> + * is chosen randomly from the range:
> + * [nv_reset_period*3/4, nv_reset_period*5/4)
> + */
> + if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) {
> + unsigned char rand;
> + ca->nv_min_rtt = ca->nv_min_rtt_new;
> + ca->nv_min_rtt_new = NV_INIT_RTT;
> + get_random_bytes(&rand, 1);
> + ca->nv_min_rtt_reset_jiffies =
> + now + ((nv_reset_period*(384 + rand)*HZ)>>9);
> + /* Every so often we decrease nv_min_cwnd in case previous
> + * value is no longer accurate.
> + */
> + ca->nv_min_cwnd = max(ca->nv_min_cwnd/2, NV_MIN_CWND);
> + }
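
The randomization matches the comment above: rand is 0..255, so
(384 + rand)/512 ranges from 384/512 = 0.75 up to 639/512 ~= 1.25,
i.e. [nv_reset_period*3/4, nv_reset_period*5/4) as advertised.
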
> +
> + /* Once per RTT check if we need to do congestion avoidance */
> + if (before(ca->nv_rtt_start_seq, tp->snd_una)) {
> + ca->nv_rtt_start_seq = tp->snd_nxt;
> + if (ca->nv_rtt_cnt < 63)
> + /* Increase counter for RTTs without CA decision */
> + ca->nv_rtt_cnt++;
> + if (ca->nv_rtt_cnt_dec < 255)
> + /* Increase counter for temporary cwnd decrease */
> + ca->nv_rtt_cnt_dec++;
> +
> + /* If this function is only called once within an RTT
> + * the cwnd is probably too small (in some cases due to
> + * tso, lro or interrupt coalescence), so we increase
> + * nv_min_cwnd.
> + */
> + if (ca->nv_eval_call_cnt == 1 &&
> + bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache &&
> + ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1) &&
> + ca->nv_rtt_cnt_dec < 192) {
> + ca->nv_min_cwnd = min(ca->nv_min_cwnd
> + + NV_MIN_CWND_GROW,
> + NV_TSO_CWND_BOUND + 1);
> + ca->nv_rtt_start_seq = tp->snd_nxt +
> + ca->nv_min_cwnd*tp->mss_cache;
> + ca->nv_eval_call_cnt = 0;
> + ca->nv_allow_cwnd_growth = 1;
> + return;
> + }
> +
> + /* Every 64 to 192 RTTs decrease cwnd to get better min RTT
> + * measurement. In practice we accomplish this by initializing
> + * nv_rtt_cnt_dec randomly from the range [0, 128) and
> + * stopping at 192.
> + * We keep the value low for nv_rtt_cnt_inc_delta RTTs and then
> + * we restore cwnd to its previous value (by setting
> + * ssthresh to the previous value).
> + */
> + if (ca->nv_rtt_cnt_dec == 192) {
> + /* decrease cwnd and ssthresh */
> + tp->snd_cwnd =
> + max((unsigned int)nv_min_cwnd,
> + ((tp->snd_cwnd * nv_dec_factor) >> 3));
> + tp->snd_ssthresh =
> + max(tp->snd_cwnd,
> + ((tp->snd_ssthresh * nv_dec_factor) >> 3));
> + ca->nv_allow_cwnd_growth = 0;
> + return;
> + } else if (ca->nv_rtt_cnt_dec > 192) {
> + if (ca->nv_rtt_cnt_dec - 192 >= nv_rtt_cnt_inc_delta) {
> + /* Restore ssthresh to restore cwnd */
> + unsigned char rand;
> + get_random_bytes(&rand, 1);
> + ca->nv_rtt_cnt_dec = rand >> 1;
> + tp->snd_ssthresh = (tp->snd_ssthresh << 3)
> + / nv_dec_factor;
> + ca->nv_allow_cwnd_growth = 1;
> + ca->nv_no_cong_cnt = 0;
> + }
> + return;
> + }
> +
> + /* Find the ideal cwnd for current rate from slope
> + * slope = 80000.0 * mss / nv_min_rtt
> + * cwnd_by_slope = nv_rtt_max_rate / slope
> + */
> + cwnd_by_slope = (u32)
> + div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt,
> + (u64)(80000 * tp->mss_cache));
> + max_win = cwnd_by_slope + nv_pad;
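
The 80000 constant is consistent with the rate units: since rate =
in_flight * 8000000 / (rtt_us * 100), rate * rtt_us / 80000 gives
bytes back, and dividing by mss converts to packets. Worked example
with made-up numbers: nv_rtt_max_rate = 104857600 (~10.5 Gbps),
nv_min_rtt = 50us, mss = 1448: 104857600 * 50 / (80000 * 1448) ~= 45
packets, which is the 64KB of in-flight data from the earlier example.
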
> +
> + /* If cwnd > max_win, decrease cwnd
> + * if cwnd < max_win, grow cwnd
> + * else leave the same
> + */
> + if (tp->snd_cwnd > max_win) {
> + /* there is congestion, check that it is ok
> + * to make a CA decision
> + * 1. We should have at least nv_dec_eval_min_calls
> + * data points before making a CA decision
> + * 2. We only make a congestion decision after
> + * nv_rtt_min_cnt RTTs
> + */
> + if (ca->nv_rtt_cnt < nv_rtt_min_cnt)
> + return;
> + else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) {
> + if (ca->nv_eval_call_cnt <
> + nv_ssthresh_eval_min_calls)
> + return;
> + } else if (ca->nv_eval_call_cnt <
> + nv_dec_eval_min_calls) {
> + return;
> + }
> +
> + /* We have enough data to determine we are congested */
> + ca->nv_allow_cwnd_growth = 0;
> + tp->snd_ssthresh =
> + (nv_ssthresh_factor * max_win) >> 3;
> + if (tp->snd_cwnd - max_win > 2) {
> + /* gap > 2, we do exponential cwnd decrease */
> + int dec;
> + dec = max(2U, ((tp->snd_cwnd - max_win) *
> + nv_cong_decrease_mult) >> 7);
> + tp->snd_cwnd -= dec;
> + } else if (nv_cong_decrease_mult > 0) {
> + tp->snd_cwnd = max_win;
> + }
> + ca->cnt = tp->snd_cwnd;
> + ca->nv_no_cong_cnt = 0;
> + } else if (tp->snd_cwnd <= max_win - 2) {
> + /* We allow growth of cwnd every RTT since we would
> + * have grown even if we waited (just slower)
> + */
> + ca->nv_allow_cwnd_growth = 1;
> + ca->nv_no_cong_cnt++;
> + if (nv_cwnd_growth_factor > 0 &&
> + ca->nv_no_cong_cnt > nv_cwnd_growth_factor) {
> + ca->cnt = max(ca->cnt >> 1, (u32) 4);
> + ca->nv_no_cong_cnt = 0;
> + }
> + } else {
> + ca->nv_allow_cwnd_growth = 0;
> + }
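
For the decrease path above, the default nv_cong_decrease_mult =
30*128/100 = 38 cuts roughly 30% (38/128) of the gap per decision:
e.g. with snd_cwnd 20 packets above max_win, dec = max(2, (20 * 38)
>> 7) = 5. Repeated decisions close the gap geometrically, which
matches the "exponential cwnd decrease" comment.
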
> +
> + /* update state */
> + ca->nv_eval_call_cnt = 0;
> + ca->nv_rtt_cnt = 0;
> + ca->nv_rtt_max_rate = 0;
> +
> + /* Don't want to make cwnd < nv_min_cwnd
> + * (it wasn't below before; if it is now, it is
> + * because nv decreased it).
> + */
> + if (tp->snd_cwnd < nv_min_cwnd)
> + tp->snd_cwnd = nv_min_cwnd;
> +
> + }
> +}
> +
> +/* Extract info for Tcp socket info provided via netlink */
> +size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr,
> + union tcp_cc_info *info)
> +{
> + const struct tcpnv *ca = inet_csk_ca(sk);
> +
> + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
> + info->vegas.tcpv_enabled = ca->nv_enable
> + && sysctl_tcp_nv_enable;
> + info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt;
> + info->vegas.tcpv_rtt = ca->nv_last_rtt;
> + info->vegas.tcpv_minrtt = ca->nv_min_rtt;
> +
> + *attr = INET_DIAG_VEGASINFO;
> + return sizeof(struct tcpvegas_info);
> + }
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(tcpnv_get_info);
> +
> +static struct tcp_congestion_ops tcpnv __read_mostly = {
> + .init = tcpnv_init,
> + .ssthresh = tcpnv_recalc_ssthresh,
> + .cong_avoid = tcpnv_cong_avoid,
> + .set_state = tcpnv_state,
> + .undo_cwnd = tcpnv_undo_cwnd,
> + .pkts_acked = tcpnv_acked,
> + .get_info = tcpnv_get_info,
> +
> + .owner = THIS_MODULE,
> + .name = "nv",
> +};
> +
> +static int __init tcpnv_register(void)
> +{
> + BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE);
> +
> + return tcp_register_congestion_control(&tcpnv);
> +}
> +
> +static void __exit tcpnv_unregister(void)
> +{
> + tcp_unregister_congestion_control(&tcpnv);
> +}
> +
> +module_init(tcpnv_register);
> +module_exit(tcpnv_unregister);
> +
> +MODULE_AUTHOR("Lawrence Brakmo");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("TCP NV");
> +MODULE_VERSION("1.0");
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index b1c218d..97b02f1 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -923,8 +923,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>
> BUG_ON(!skb || !tcp_skb_pcount(skb));
>
> + tp = tcp_sk(sk);
> if (clone_it) {
> skb_mstamp_get(&skb->skb_mstamp);
> + TCP_SKB_CB(skb)->in_flight = TCP_SKB_CB(skb)->end_seq
> + - tp->snd_una;
>
> if (unlikely(skb_cloned(skb)))
> skb = pskb_copy(skb, gfp_mask);
> @@ -935,7 +938,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
> }
>
> inet = inet_sk(sk);
> - tp = tcp_sk(sk);
> tcb = TCP_SKB_CB(skb);
> memset(&opts, 0, sizeof(opts));
>
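The in_flight plumbing reads fine end to end: each skb is stamped at
transmit time with end_seq - snd_una (bytes outstanding including this
skb), tcp_clean_rtx_queue() copies the stamp of the last acked skb
into tp->ack_in_flight, and tcpnv_acked() uses that as the numerator
of the rate estimate. E.g. snd_una = 1000 and end_seq = 9000 stamps
8000 bytes in flight.
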
> --
> 1.8.1
>