Message-ID: <549070CF.1010506@psc.edu>
Date:	Tue, 16 Dec 2014 12:50:07 -0500
From:	rapier <rapier@....edu>
To:	netdev <netdev@...r.kernel.org>
Subject: [PATCH net-next 1/3] Implementation of RFC 4898 Extended TCP Statistics
 (Web10G)

This patch provides the kernel instrumentation set. While this patch
compiles and runs, it does not include the control and management
capabilities; those are provided in the next patch submission.
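
For context, a hedged userspace sketch (not part of this series) of how the
tcpi_estats_cid field added to struct tcp_info below could be read for an
established connection. It assumes uapi headers from a kernel built with this
patch and with CONFIG_TCP_ESTATS visible where the header is included, since
the field as posted is guarded by that #ifdef:

/*
 * Illustrative only, not part of this patch set: fetch the RFC 4898
 * connection id (tcpi_estats_cid) via the standard TCP_INFO getsockopt.
 * Assumes <linux/tcp.h> carries the tcpi_estats_cid field (i.e. headers
 * from a CONFIG_TCP_ESTATS kernel with the #ifdef satisfied).
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* kernel uapi struct tcp_info */

static int get_estats_cid(int sockfd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(sockfd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
		return -1;
	/* 0 means no estats entry is associated with this socket. */
	return (int)info.tcpi_estats_cid;
}

A cid of 0 (or a TCP_INFO shorter than expected on older kernels) simply
means no estats entry is associated with the socket.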

---
  include/linux/tcp.h      |   8 +
  include/net/tcp.h        |   1 +
  include/net/tcp_estats.h | 376 +++++++++++++++++++++++++++++++++++++++++++++++
  include/uapi/linux/tcp.h |   6 +-
  net/ipv4/tcp.c           |  21 ++-
  net/ipv4/tcp_cong.c      |   3 +
  net/ipv4/tcp_htcp.c      |   1 +
  net/ipv4/tcp_input.c     | 116 +++++++++++++--
  net/ipv4/tcp_ipv4.c      |  10 ++
  net/ipv4/tcp_output.c    |  61 +++++++-
  net/ipv4/tcp_timer.c     |   3 +
  net/ipv6/tcp_ipv6.c      |   7 +
  12 files changed, 592 insertions(+), 21 deletions(-)
  create mode 100644 include/net/tcp_estats.h

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 67309ec..8758360 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -126,6 +126,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
  	return (struct tcp_request_sock *)req;
  }
  
+#ifdef CONFIG_TCP_ESTATS
+struct tcp_estats;
+#endif
+
  struct tcp_sock {
  	/* inet_connection_sock has to be the first member of tcp_sock */
  	struct inet_connection_sock	inet_conn;
@@ -309,6 +313,10 @@ struct tcp_sock {
  	struct tcp_md5sig_info	__rcu *md5sig_info;
  #endif
  
+#ifdef CONFIG_TCP_ESTATS
+	struct tcp_estats	*tcp_stats;
+#endif
+
  /* TCP fastopen related information */
  	struct tcp_fastopen_request *fastopen_req;
  	/* fastopen_rsk points to request_sock that resulted in this big
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f50f29faf..9f7e31e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -43,6 +43,7 @@
  #include <net/tcp_states.h>
  #include <net/inet_ecn.h>
  #include <net/dst.h>
+#include <net/tcp_estats.h>
  
  #include <linux/seq_file.h>
  #include <linux/memcontrol.h>
diff --git a/include/net/tcp_estats.h b/include/net/tcp_estats.h
new file mode 100644
index 0000000..ff6000e
--- /dev/null
+++ b/include/net/tcp_estats.h
@@ -0,0 +1,376 @@
+/*
+ * include/net/tcp_estats.h
+ *
+ * Implementation of TCP Extended Statistics MIB (RFC 4898)
+ *
+ * Authors:
+ *   John Estabrook <jsestabrook@...il.com>
+ *   Andrew K. Adams <akadams@....edu>
+ *   Kevin Hogan <kwabena@...gle.com>
+ *   Dominin Hamon <dma@...ipysock.com>
+ *   John Heffner <johnwheffner@...il.com>
+ *
+ * The Web10Gig project.  See http://www.web10gig.org
+ *
+ * Copyright © 2011, Pittsburgh Supercomputing Center (PSC).
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _TCP_ESTATS_H
+#define _TCP_ESTATS_H
+
+#include <net/sock.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/jump_label.h>
+#include <linux/spinlock.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+
+/* defines number of seconds that stats persist after connection ends */
+#define TCP_ESTATS_PERSIST_DELAY_SECS 5
+
+enum tcp_estats_sndlim_states {
+	TCP_ESTATS_SNDLIM_NONE = -1,
+	TCP_ESTATS_SNDLIM_SENDER,
+	TCP_ESTATS_SNDLIM_CWND,
+	TCP_ESTATS_SNDLIM_RWIN,
+	TCP_ESTATS_SNDLIM_STARTUP,
+	TCP_ESTATS_SNDLIM_TSODEFER,
+	TCP_ESTATS_SNDLIM_PACE,
+	TCP_ESTATS_SNDLIM_NSTATES	/* Keep at end */
+};
+
+enum tcp_estats_addrtype {
+	TCP_ESTATS_ADDRTYPE_IPV4 = 1,
+	TCP_ESTATS_ADDRTYPE_IPV6 = 2
+};
+
+enum tcp_estats_softerror_reason {
+	TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW = 1,
+	TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW = 2,
+	TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW = 3,
+	TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW = 4,
+	TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW = 5,
+	TCP_ESTATS_SOFTERROR_ABOVE_TS_WINDOW = 6,
+	TCP_ESTATS_SOFTERROR_DATA_CHECKSUM = 7,
+	TCP_ESTATS_SOFTERROR_OTHER = 8,
+};
+
+#define TCP_ESTATS_INACTIVE	2
+#define TCP_ESTATS_ACTIVE	1
+
+#define TCP_ESTATS_TABLEMASK_INACTIVE	0x00
+#define TCP_ESTATS_TABLEMASK_ACTIVE	0x01
+#define TCP_ESTATS_TABLEMASK_PERF	0x02
+#define TCP_ESTATS_TABLEMASK_PATH	0x04
+#define TCP_ESTATS_TABLEMASK_STACK	0x08
+#define TCP_ESTATS_TABLEMASK_APP	0x10
+#define TCP_ESTATS_TABLEMASK_EXTRAS	0x40
+
+#ifdef CONFIG_TCP_ESTATS
+
+extern struct static_key tcp_estats_enabled;
+
+#define TCP_ESTATS_CHECK(tp, table, expr)				\
+	do {								\
+		if (static_key_false(&tcp_estats_enabled)) {		\
+			if (likely((tp)->tcp_stats) &&			\
+			    likely((tp)->tcp_stats->tables.table)) {	\
+				(expr);					\
+			}						\
+		}							\
+	} while (0)
+
+#define TCP_ESTATS_VAR_INC(tp, table, var)				\
+	TCP_ESTATS_CHECK(tp, table, ++((tp)->tcp_stats->tables.table->var))
+#define TCP_ESTATS_VAR_DEC(tp, table, var)				\
+	TCP_ESTATS_CHECK(tp, table, --((tp)->tcp_stats->tables.table->var))
+#define TCP_ESTATS_VAR_ADD(tp, table, var, val)				\
+	TCP_ESTATS_CHECK(tp, table,					\
+			 ((tp)->tcp_stats->tables.table->var) += (val))
+#define TCP_ESTATS_VAR_SET(tp, table, var, val)				\
+	TCP_ESTATS_CHECK(tp, table,					\
+			 ((tp)->tcp_stats->tables.table->var) = (val))
+#define TCP_ESTATS_UPDATE(tp, func)					\
+	do {								\
+		if (static_key_false(&tcp_estats_enabled)) {		\
+			if (likely((tp)->tcp_stats)) {			\
+				(func);					\
+			}						\
+		}							\
+	} while (0)
+
+/*
+ * Variables that can be read and written directly.
+ *
+ * Contains all variables from RFC 4898. Commented fields are
+ * either not implemented (only StartTimeStamp
+ * remains unimplemented in this release) or have
+ * handlers and do not need struct storage.
+ */
+struct tcp_estats_connection_table {
+	u32			AddressType;
+	union { struct in_addr addr; struct in6_addr addr6; }	LocalAddress;
+	union { struct in_addr addr; struct in6_addr addr6; }	RemAddress;
+	u16			LocalPort;
+	u16			RemPort;
+};
+
+struct tcp_estats_perf_table {
+	u32		SegsOut;
+	u32		DataSegsOut;
+	u64		DataOctetsOut;
+	u32		SegsRetrans;
+	u32		OctetsRetrans;
+	u32		SegsIn;
+	u32		DataSegsIn;
+	u64		DataOctetsIn;
+	/*		ElapsedSecs */
+	/*		ElapsedMicroSecs */
+	/*		StartTimeStamp */
+	/*		CurMSS */
+	/*		PipeSize */
+	u32		MaxPipeSize;
+	/*		SmoothedRTT */
+	/*		CurRTO */
+	u32		CongSignals;
+	/*		CurCwnd */
+	/*		CurSsthresh */
+	u32		Timeouts;
+	/*		CurRwinSent */
+	u32		MaxRwinSent;
+	u32		ZeroRwinSent;
+	/*		CurRwinRcvd */
+	u32		MaxRwinRcvd;
+	u32		ZeroRwinRcvd;
+	/*		SndLimTransRwin */
+	/*		SndLimTransCwnd */
+	/*		SndLimTransSnd */
+	/*		SndLimTimeRwin */
+	/*		SndLimTimeCwnd */
+	/*		SndLimTimeSnd */
+	u32		snd_lim_trans[TCP_ESTATS_SNDLIM_NSTATES];
+	u32		snd_lim_time[TCP_ESTATS_SNDLIM_NSTATES];
+};
+
+struct tcp_estats_path_table {
+	/*		RetranThresh */
+	u32		NonRecovDAEpisodes;
+	u32		SumOctetsReordered;
+	u32		NonRecovDA;
+	u32		SampleRTT;
+	/*		RTTVar */
+	u32		MaxRTT;
+	u32		MinRTT;
+	u64		SumRTT;
+	u32		CountRTT;
+	u32		MaxRTO;
+	u32		MinRTO;
+	u8		IpTtl;
+	u8		IpTosIn;
+	/*		IpTosOut */
+	u32		PreCongSumCwnd;
+	u32		PreCongSumRTT;
+	u32		PostCongSumRTT;
+	u32		PostCongCountRTT;
+	u32		ECNsignals;
+	u32		DupAckEpisodes;
+	/*		RcvRTT */
+	u32		DupAcksOut;
+	u32		CERcvd;
+	u32		ECESent;
+};
+
+struct tcp_estats_stack_table {
+	u32		ActiveOpen;
+	/*		MSSSent */
+	/*		MSSRcvd */
+	/*		WinScaleSent */
+	/*		WinScaleRcvd */
+	/*		TimeStamps */
+	/*		ECN */
+	/*		WillSendSACK */
+	/*		WillUseSACK */
+	/*		State */
+	/*		Nagle */
+	u32		MaxSsCwnd;
+	u32		MaxCaCwnd;
+	u32		MaxSsthresh;
+	u32		MinSsthresh;
+	/*		InRecovery */
+	u32		DupAcksIn;
+	u32		SpuriousFrDetected;
+	u32		SpuriousRtoDetected;
+	u32		SoftErrors;
+	u32		SoftErrorReason;
+	u32		SlowStart;
+	u32		CongAvoid;
+	u32		OtherReductions;
+	u32		CongOverCount;
+	u32		FastRetran;
+	u32		SubsequentTimeouts;
+	/*		CurTimeoutCount */
+	u32		AbruptTimeouts;
+	u32		SACKsRcvd;
+	u32		SACKBlocksRcvd;
+	u32		SendStall;
+	u32		DSACKDups;
+	u32		MaxMSS;
+	u32		MinMSS;
+	u32		SndInitial;
+	u32		RecInitial;
+	/*		CurRetxQueue */
+	/*		MaxRetxQueue */
+	/*		CurReasmQueue */
+	u32		MaxReasmQueue;
+	u32		EarlyRetrans;
+	u32		EarlyRetransDelay;
+};
+
+struct tcp_estats_app_table {
+	/*		SndUna */
+	/*		SndNxt */
+	u32		SndMax;
+	u64		ThruOctetsAcked;
+	/*		RcvNxt */
+	u64		ThruOctetsReceived;
+	/*		CurAppWQueue */
+	u32		MaxAppWQueue;
+	/*		CurAppRQueue */
+	u32		MaxAppRQueue;
+};
+
+/*
+    currently, no backing store is needed for tuning elements in
+     web10g - they are all read or written to directly in other
+     data structures (such as the socket)
+*/
+
+struct tcp_estats_extras_table {
+	/*		OtherReductionsCV */
+	u32		OtherReductionsCM;
+	u32		Priority;
+};
+
+struct tcp_estats_tables {
+	struct tcp_estats_connection_table	*connection_table;
+	struct tcp_estats_perf_table		*perf_table;
+	struct tcp_estats_path_table		*path_table;
+	struct tcp_estats_stack_table		*stack_table;
+	struct tcp_estats_app_table		*app_table;
+	struct tcp_estats_extras_table		*extras_table;
+};
+
+struct tcp_estats {
+	int				tcpe_cid; /* idr map id */
+
+	struct sock			*sk;
+	kuid_t				uid;
+	kgid_t				gid;
+	int				ids;
+
+	atomic_t			users;
+
+	enum tcp_estats_sndlim_states	limstate;
+	ktime_t				limstate_ts;
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+	ktime_t				start_ts;
+	ktime_t				current_ts;
+#else
+	unsigned long			start_ts;
+	unsigned long			current_ts;
+#endif
+	struct timeval			start_tv;
+
+        int				queued;
+        struct work_struct		create_notify;
+        struct work_struct		establish_notify;
+        struct delayed_work		destroy_notify;
+
+	struct tcp_estats_tables	tables;
+
+	struct rcu_head			rcu;
+};
+
+extern struct idr tcp_estats_idr;
+
+extern int tcp_estats_wq_enabled;
+extern struct workqueue_struct *tcp_estats_wq;
+extern void (*create_notify_func)(struct work_struct *work);
+extern void (*establish_notify_func)(struct work_struct *work);
+extern void (*destroy_notify_func)(struct work_struct *work);
+
+extern unsigned long persist_delay;
+extern spinlock_t tcp_estats_idr_lock;
+
+/* For the TCP code */
+extern int  tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype t,
+			      int active);
+extern void tcp_estats_destroy(struct sock *sk);
+extern void tcp_estats_establish(struct sock *sk);
+extern void tcp_estats_free(struct rcu_head *rcu);
+
+extern void tcp_estats_update_snd_nxt(struct tcp_sock *tp);
+extern void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack);
+extern void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample);
+extern void tcp_estats_update_timeout(struct sock *sk);
+extern void tcp_estats_update_mss(struct tcp_sock *tp);
+extern void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp);
+extern void tcp_estats_update_sndlim(struct tcp_sock *tp,
+				     enum tcp_estats_sndlim_states why);
+extern void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq);
+extern void tcp_estats_update_rwin_sent(struct tcp_sock *tp);
+extern void tcp_estats_update_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_post_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_segsend(struct sock *sk, int pcount,
+                                      u32 seq, u32 end_seq, int flags);
+extern void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb);
+extern void tcp_estats_update_finish_segrecv(struct tcp_sock *tp);
+extern void tcp_estats_update_writeq(struct sock *sk);
+extern void tcp_estats_update_recvq(struct sock *sk);
+
+extern void tcp_estats_init(void);
+
+static inline void tcp_estats_use(struct tcp_estats *stats)
+{
+	atomic_inc(&stats->users);
+}
+
+static inline int tcp_estats_use_if_valid(struct tcp_estats *stats)
+{
+	return atomic_inc_not_zero(&stats->users);
+}
+
+static inline void tcp_estats_unuse(struct tcp_estats *stats)
+{
+	if (atomic_dec_and_test(&stats->users)) {
+		sock_put(stats->sk);
+		stats->sk = NULL;
+		call_rcu(&stats->rcu, tcp_estats_free);
+	}
+}
+
+#else /* !CONFIG_TCP_ESTATS */
+
+#define tcp_estats_enabled	(0)
+
+#define TCP_ESTATS_VAR_INC(tp, table, var)	do {} while (0)
+#define TCP_ESTATS_VAR_DEC(tp, table, var)	do {} while (0)
+#define TCP_ESTATS_VAR_ADD(tp, table, var, val)	do {} while (0)
+#define TCP_ESTATS_VAR_SET(tp, table, var, val)	do {} while (0)
+#define TCP_ESTATS_UPDATE(tp, func)		do {} while (0)
+
+static inline void tcp_estats_init(void) { }
+static inline void tcp_estats_establish(struct sock *sk) { }
+static inline void tcp_estats_create(struct sock *sk,
+				     enum tcp_estats_addrtype t,
+				     int active) { }
+static inline void tcp_estats_destroy(struct sock *sk) { }
+
+#endif /* CONFIG_TCP_ESTATS */
+
+#endif /* _TCP_ESTATS_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 3b97183..5dae043 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -186,9 +186,13 @@ struct tcp_info {
  	__u32	tcpi_rcv_space;
  
  	__u32	tcpi_total_retrans;
-
  	__u64	tcpi_pacing_rate;
  	__u64	tcpi_max_pacing_rate;
+
+#ifdef CONFIG_TCP_ESTATS
+	/* RFC 4898 extended stats Info */
+	__u32	tcpi_estats_cid;
+#endif
  };
  
  /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3075723..698dbb7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -418,6 +418,10 @@ void tcp_init_sock(struct sock *sk)
  	sk->sk_sndbuf = sysctl_tcp_wmem[1];
  	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
  
+#ifdef CONFIG_TCP_ESTATS
+	tp->tcp_stats = NULL;
+#endif
+
  	local_bh_disable();
  	sock_update_memcg(sk);
  	sk_sockets_allocated_inc(sk);
@@ -972,6 +976,9 @@ wait_for_memory:
  		tcp_push(sk, flags & ~MSG_MORE, mss_now,
  			 TCP_NAGLE_PUSH, size_goal);
  
+		if (copied)
+                        TCP_ESTATS_UPDATE(tp, tcp_estats_update_writeq(sk));
+
  		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
  			goto do_error;
  
@@ -1264,9 +1271,11 @@ new_segment:
  wait_for_sndbuf:
  			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
  wait_for_memory:
-			if (copied)
+			if (copied) {
  				tcp_push(sk, flags & ~MSG_MORE, mss_now,
  					 TCP_NAGLE_PUSH, size_goal);
+				TCP_ESTATS_UPDATE(tp, tcp_estats_update_writeq(sk));
+			}
  
  			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
  				goto do_error;
@@ -1658,6 +1667,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
  			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
  		}
  
+		TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
+
  		/* Well, if we have backlog, try to process it now yet. */
  
  		if (copied >= target && !sk->sk_backlog.tail)
@@ -2684,6 +2695,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
  					sk->sk_pacing_rate : ~0ULL;
  	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
  					sk->sk_max_pacing_rate : ~0ULL;
+
+#ifdef CONFIG_TCP_ESTATS
+	info->tcpi_estats_cid = (tp->tcp_stats && tp->tcp_stats->tcpe_cid > 0)
+					? tp->tcp_stats->tcpe_cid : 0;
+#endif
  }
  EXPORT_SYMBOL_GPL(tcp_get_info);
  
@@ -3101,6 +3117,9 @@ void __init tcp_init(void)
  		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
  
  	tcp_metrics_init();
+
  	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
+	tcp_estats_init();
+
  	tcp_tasklet_init();
  }
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 27ead0d..e93929d 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -295,6 +295,8 @@ void tcp_slow_start(struct tcp_sock *tp, u32 acked)
  {
  	u32 cwnd = tp->snd_cwnd + acked;
  
+	TCP_ESTATS_VAR_INC(tp, stack_table, SlowStart);
+
  	if (cwnd > tp->snd_ssthresh)
  		cwnd = tp->snd_ssthresh + 1;
  	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
@@ -304,6 +306,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
  /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
  void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
  {
+	TCP_ESTATS_VAR_INC(tp, stack_table, CongAvoid);
  	if (tp->snd_cwnd_cnt >= w) {
  		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
  			tp->snd_cwnd++;
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 58469ff..5facb4c 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -251,6 +251,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
  			tp->snd_cwnd_cnt += ca->pkts_acked;
  
  		ca->pkts_acked = 1;
+		TCP_ESTATS_VAR_INC(tp, stack_table, CongAvoid);
  	}
  }
  
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 075ab4d..8f0601b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -77,8 +77,10 @@
  #include <linux/errqueue.h>
  
  int sysctl_tcp_timestamps __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
  int sysctl_tcp_window_scaling __read_mostly = 1;
  int sysctl_tcp_sack __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_sack);
  int sysctl_tcp_fack __read_mostly = 1;
  int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
  int sysctl_tcp_max_reordering __read_mostly = 300;
@@ -231,13 +233,15 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
  			tcp_enter_quickack_mode((struct sock *)tp);
  		break;
  	case INET_ECN_CE:
+		TCP_ESTATS_VAR_INC(tp, path_table, CERcvd);
  		if (tcp_ca_needs_ecn((struct sock *)tp))
  			tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
-
  		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
  			/* Better not delay acks, sender can have a very low cwnd */
  			tcp_enter_quickack_mode((struct sock *)tp);
  			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		} else {
+			TCP_ESTATS_VAR_INC(tp, path_table, ECESent);
  		}
  		tp->ecn_flags |= TCP_ECN_SEEN;
  		break;
@@ -1104,6 +1108,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
  		dup_sack = true;
  		tcp_dsack_seen(tp);
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+		TCP_ESTATS_VAR_INC(tp, stack_table, DSACKDups);
  	} else if (num_sacks > 1) {
  		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
  		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
@@ -1114,6 +1119,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
  			tcp_dsack_seen(tp);
  			NET_INC_STATS_BH(sock_net(sk),
  					LINUX_MIB_TCPDSACKOFORECV);
+			TCP_ESTATS_VAR_INC(tp, stack_table, DSACKDups);
  		}
  	}
  
@@ -1653,6 +1659,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
  	state.reord = tp->packets_out;
  	state.rtt_us = -1L;
  
+	TCP_ESTATS_VAR_INC(tp, stack_table, SACKsRcvd);
+	TCP_ESTATS_VAR_ADD(tp, stack_table, SACKBlocksRcvd, num_sacks);
+
  	if (!tp->sacked_out) {
  		if (WARN_ON(tp->fackets_out))
  			tp->fackets_out = 0;
@@ -1928,6 +1937,8 @@ void tcp_enter_loss(struct sock *sk)
  	bool new_recovery = false;
  	bool is_reneg;			/* is receiver reneging on SACKs? */
  
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp));
+
  	/* Reduce ssthresh if it has not yet been made inside this window. */
  	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
  	    !after(tp->high_seq, tp->snd_una) ||
@@ -2200,8 +2211,12 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
  	 */
  	if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out &&
  	    (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) &&
-	    !tcp_may_send_now(sk))
-		return !tcp_pause_early_retransmit(sk, flag);
+	    !tcp_may_send_now(sk)) {
+		int early_retrans = !tcp_pause_early_retransmit(sk, flag);
+		if (early_retrans)
+			TCP_ESTATS_VAR_INC(tp, stack_table, EarlyRetrans);
+		return early_retrans;
+	}
  
  	return false;
  }
@@ -2299,9 +2314,15 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
   */
  static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
  {
-	tp->snd_cwnd = min(tp->snd_cwnd,
-			   tcp_packets_in_flight(tp) + tcp_max_burst(tp));
-	tp->snd_cwnd_stamp = tcp_time_stamp;
+	u32 pkts = tcp_packets_in_flight(tp) + tcp_max_burst(tp);
+
+	if (pkts < tp->snd_cwnd) {
+		tp->snd_cwnd = pkts;
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+
+		TCP_ESTATS_VAR_INC(tp, stack_table, OtherReductions);
+		TCP_ESTATS_VAR_INC(tp, extras_table, OtherReductionsCM);
+	}
  }
  
  /* Nothing was retransmitted or returned timestamp is less
@@ -2402,6 +2423,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
  		if (tp->prior_ssthresh > tp->snd_ssthresh) {
  			tp->snd_ssthresh = tp->prior_ssthresh;
  			tcp_ecn_withdraw_cwr(tp);
+			TCP_ESTATS_VAR_INC(tp, stack_table, CongOverCount);
  		}
  	} else {
  		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
@@ -2428,10 +2450,15 @@ static bool tcp_try_undo_recovery(struct sock *sk)
  		 */
  		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
  		tcp_undo_cwnd_reduction(sk, false);
-		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
  			mib_idx = LINUX_MIB_TCPLOSSUNDO;
-		else
+			TCP_ESTATS_VAR_INC(tp, stack_table,
+					   SpuriousRtoDetected);
+		} else {
  			mib_idx = LINUX_MIB_TCPFULLUNDO;
+			TCP_ESTATS_VAR_INC(tp, stack_table,
+					   SpuriousFrDetected);
+		}
  
  		NET_INC_STATS_BH(sock_net(sk), mib_idx);
  	}
@@ -2472,9 +2499,12 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
  
  		DBGUNDO(sk, "partial loss");
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
-		if (frto_undo)
+		if (frto_undo) {
  			NET_INC_STATS_BH(sock_net(sk),
  					 LINUX_MIB_TCPSPURIOUSRTOS);
+			TCP_ESTATS_VAR_INC(tp, stack_table,
+					   SpuriousRtoDetected);
+		}
  		inet_csk(sk)->icsk_retransmits = 0;
  		if (frto_undo || tcp_is_sack(tp))
  			tcp_set_ca_state(sk, TCP_CA_Open);
@@ -2555,6 +2585,7 @@ void tcp_enter_cwr(struct sock *sk)
  		tcp_init_cwnd_reduction(sk);
  		tcp_set_ca_state(sk, TCP_CA_CWR);
  	}
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp));
  }
  
  static void tcp_try_keep_open(struct sock *sk)
@@ -2580,8 +2611,10 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
  	if (!tcp_any_retrans_done(sk))
  		tp->retrans_stamp = 0;
  
-	if (flag & FLAG_ECE)
+	if (flag & FLAG_ECE) {
  		tcp_enter_cwr(sk);
+		TCP_ESTATS_VAR_INC(tp, path_table, ECNsignals);
+	}
  
  	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
  		tcp_try_keep_open(sk);
@@ -2826,6 +2859,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
  			}
  			break;
  
+		case TCP_CA_Disorder:
+			TCP_ESTATS_VAR_INC(tp, path_table, NonRecovDAEpisodes);
+			break;
+
  		case TCP_CA_Recovery:
  			if (tcp_is_reno(tp))
  				tcp_reset_reno_sack(tp);
@@ -2870,6 +2907,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
  		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
  			tcp_try_undo_dsack(sk);
  
+
+		if (icsk->icsk_ca_state == TCP_CA_Disorder)
+			TCP_ESTATS_VAR_INC(tp, path_table, NonRecovDA);
+
  		if (!tcp_time_to_recover(sk, flag)) {
  			tcp_try_to_open(sk, flag, prior_unsacked);
  			return;
@@ -2889,6 +2930,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
  		/* Otherwise enter Recovery state */
  		tcp_enter_recovery(sk, (flag & FLAG_ECE));
  		fast_rexmit = 1;
+		TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp));
+		TCP_ESTATS_VAR_INC(tp, stack_table, FastRetran);
  	}
  
  	if (do_lost)
@@ -2928,6 +2971,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
  
  	tcp_rtt_estimator(sk, seq_rtt_us);
  	tcp_set_rto(sk);
+	TCP_ESTATS_UPDATE(tcp_sk(sk), tcp_estats_update_rtt(sk, seq_rtt_us));
  
  	/* RFC6298: only reset backoff on valid RTT measurement. */
  	inet_csk(sk)->icsk_backoff = 0;
@@ -3007,6 +3051,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
  	if (!tp->do_early_retrans)
  		return;
  
+	TCP_ESTATS_VAR_INC(tp, stack_table, EarlyRetransDelay);
  	tcp_enter_recovery(sk, false);
  	tcp_update_scoreboard(sk, 1);
  	tcp_xmit_retransmit_queue(sk);
@@ -3310,9 +3355,11 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
  				tp->max_window = nwin;
  				tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
  			}
+			TCP_ESTATS_UPDATE(tp, tcp_estats_update_rwin_rcvd(tp));
  		}
  	}
  
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_acked(tp, ack));
  	tp->snd_una = ack;
  
  	return flag;
@@ -3410,6 +3457,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  	int prior_packets = tp->packets_out;
  	const int prior_unsacked = tp->packets_out - tp->sacked_out;
  	int acked = 0; /* Number of packets newly acked */
+	int prior_state = icsk->icsk_ca_state;
  	long sack_rtt_us = -1L;
  
  	/* We very likely will need to access write queue head. */
@@ -3419,6 +3467,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  	 * then we can probably ignore it.
  	 */
  	if (before(ack, prior_snd_una)) {
+		TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+		TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+				   TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW);
  		/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
  		if (before(ack, prior_snd_una - tp->max_window)) {
  			tcp_send_challenge_ack(sk);
@@ -3430,8 +3481,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  	/* If the ack includes data we haven't sent yet, discard
  	 * this segment (RFC793 Section 3.9).
  	 */
-	if (after(ack, tp->snd_nxt))
+	if (after(ack, tp->snd_nxt)) {
+		TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+		TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+				   TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW);
  		goto invalid_ack;
+	}
  
  	if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
  	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
@@ -3439,6 +3494,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  
  	if (after(ack, prior_snd_una)) {
  		flag |= FLAG_SND_UNA_ADVANCED;
+		if (icsk->icsk_ca_state == TCP_CA_Disorder)
+			TCP_ESTATS_VAR_ADD(tp, path_table, SumOctetsReordered,
+					   ack - prior_snd_una);
  		icsk->icsk_retransmits = 0;
  	}
  
@@ -3456,6 +3514,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  		 * Note, we use the fact that SND.UNA>=SND.WL2.
  		 */
  		tcp_update_wl(tp, ack_seq);
+		TCP_ESTATS_UPDATE(tp, tcp_estats_update_acked(tp, ack));
  		tp->snd_una = ack;
  		flag |= FLAG_WIN_UPDATE;
  
@@ -3510,6 +3569,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
  		tcp_fastretrans_alert(sk, acked, prior_unsacked,
  				      is_dupack, flag);
+		if (icsk->icsk_ca_state == TCP_CA_Open &&
+		    prior_state >= TCP_CA_CWR)
+			TCP_ESTATS_UPDATE(tp,
+				tcp_estats_update_post_congestion(tp));
  	}
  	if (tp->tlp_high_seq)
  		tcp_process_tlp_ack(sk, ack, flag);
@@ -4177,7 +4240,9 @@ static void tcp_ofo_queue(struct sock *sk)
  
  		tail = skb_peek_tail(&sk->sk_receive_queue);
  		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
+		TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, tp->rcv_nxt));
  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+
  		if (!eaten)
  			__skb_queue_tail(&sk->sk_receive_queue, skb);
  		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4232,6 +4297,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
  	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
  		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
  
+        TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
+        TCP_ESTATS_VAR_INC(tp, path_table, DupAcksOut);
+
  	skb1 = skb_peek_tail(&tp->out_of_order_queue);
  	if (!skb1) {
  		/* Initial out of order segment, build 1 SACK. */
@@ -4242,6 +4310,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
  						TCP_SKB_CB(skb)->end_seq;
  		}
  		__skb_queue_head(&tp->out_of_order_queue, skb);
+                TCP_ESTATS_VAR_INC(tp, path_table, DupAckEpisodes);
  		goto end;
  	}
  
@@ -4438,6 +4507,9 @@ queue_and_out:
  
  			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
  		}
+		TCP_ESTATS_UPDATE(
+			tp,
+			tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq));
  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
  		if (skb->len)
  			tcp_event_data_recv(sk, skb);
@@ -4459,6 +4531,8 @@ queue_and_out:
  
  		tcp_fast_path_check(sk);
  
+		TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
+
  		if (eaten > 0)
  			kfree_skb_partial(skb, fragstolen);
  		if (!sock_flag(sk, SOCK_DEAD))
@@ -4990,6 +5064,9 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
  	    tcp_paws_discard(sk, skb)) {
  		if (!th->rst) {
  			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+			TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+			TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+					   TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW);
  			tcp_send_dupack(sk, skb);
  			goto discard;
  		}
@@ -5004,6 +5081,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
  		 * an acknowledgment should be sent in reply (unless the RST
  		 * bit is set, if so drop the segment and return)".
  		 */
+		TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+		TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+			before(TCP_SKB_CB(skb)->end_seq, tp->rcv_wup) ?
+				TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW :
+				TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW);
  		if (!th->rst) {
  			if (th->syn)
  				goto syn_challenge;
@@ -5152,6 +5234,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
  				return;
  			} else { /* Header too small */
  				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+				TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+				TCP_ESTATS_VAR_SET(tp, stack_table,
+						   SoftErrorReason,
+						   TCP_ESTATS_SOFTERROR_OTHER);
  				goto discard;
  			}
  		} else {
@@ -5178,6 +5264,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
  					tcp_rcv_rtt_measure_ts(sk, skb);
  
  					__skb_pull(skb, tcp_header_len);
+					TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq));
  					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
  					NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
  					eaten = 1;
@@ -5204,10 +5291,12 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
  				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
  
  				/* Bulk data transfer: receiver */
+				TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq));
  				eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
  						      &fragstolen);
  			}
  
+			TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk));
  			tcp_event_data_recv(sk, skb);
  
  			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
@@ -5260,6 +5349,9 @@ step5:
  csum_error:
  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+	TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors);
+	TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason,
+			   TCP_ESTATS_SOFTERROR_DATA_CHECKSUM);
  
  discard:
  	__kfree_skb(skb);
@@ -5459,6 +5551,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
  		smp_mb();
  
  		tcp_finish_connect(sk, skb);
+		tcp_estats_establish(sk);
  
  		if ((tp->syn_fastopen || tp->syn_data) &&
  		    tcp_rcv_fastopen_synack(sk, skb, &foc))
@@ -5685,6 +5778,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
  		smp_mb();
  		tcp_set_state(sk, TCP_ESTABLISHED);
  		sk->sk_state_change(sk);
+		tcp_estats_establish(sk);
  
  		/* Note, that this wakeup is only for marginal crossed SYN case.
  		 * Passively open sockets are not waked up, because
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a3f72d7..9c85a54 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1310,6 +1310,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
  	if (!newsk)
  		goto exit_nonewsk;
  
+	tcp_estats_create(newsk, TCP_ESTATS_ADDRTYPE_IPV4, TCP_ESTATS_INACTIVE);
+
  	newsk->sk_gso_type = SKB_GSO_TCPV4;
  	inet_sk_rx_dst_set(newsk, skb);
  
@@ -1670,6 +1672,8 @@ process:
  	skb->dev = NULL;
  
  	bh_lock_sock_nested(sk);
+	TCP_ESTATS_UPDATE(
+		tcp_sk(sk), tcp_estats_update_segrecv(tcp_sk(sk), skb));
  	ret = 0;
  	if (!sock_owned_by_user(sk)) {
  		if (!tcp_prequeue(sk, skb))
@@ -1680,6 +1684,8 @@ process:
  		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
  		goto discard_and_relse;
  	}
+	TCP_ESTATS_UPDATE(
+		tcp_sk(sk), tcp_estats_update_finish_segrecv(tcp_sk(sk)));
  	bh_unlock_sock(sk);
  
  	sock_put(sk);
@@ -1809,6 +1815,8 @@ static int tcp_v4_init_sock(struct sock *sk)
  	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
  #endif
  
+	tcp_estats_create(sk, TCP_ESTATS_ADDRTYPE_IPV4, TCP_ESTATS_ACTIVE);
+
  	return 0;
  }
  
@@ -1842,6 +1850,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
  	if (inet_csk(sk)->icsk_bind_hash)
  		inet_put_port(sk);
  
+	tcp_estats_destroy(sk);
+
  	BUG_ON(tp->fastopen_rsk != NULL);
  
  	/* If socket is aborted during connect operation */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262..145b4f2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -80,6 +80,7 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
  
  	tcp_advance_send_head(sk, skb);
  	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_snd_nxt(tp));
  
  	tp->packets_out += tcp_skb_pcount(skb);
  	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
@@ -292,6 +293,7 @@ static u16 tcp_select_window(struct sock *sk)
  	}
  	tp->rcv_wnd = new_win;
  	tp->rcv_wup = tp->rcv_nxt;
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_rwin_sent(tp));
  
  	/* Make sure we do not exceed the maximum possible
  	 * scaled window.
@@ -905,6 +907,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  	struct tcp_md5sig_key *md5;
  	struct tcphdr *th;
  	int err;
+#ifdef CONFIG_TCP_ESTATS
+	__u32 seq;
+	__u32 end_seq;
+	int tcp_flags;
+	int pcount;
+#endif
  
  	BUG_ON(!skb || !tcp_skb_pcount(skb));
  
@@ -1008,6 +1016,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
  			      tcp_skb_pcount(skb));
  
+#ifdef CONFIG_TCP_ESTATS
+	/* If the skb isn't cloned, we can't reference it after
+	 * calling queue_xmit, so copy everything we need here. */
+	pcount = tcp_skb_pcount(skb);
+	seq = TCP_SKB_CB(skb)->seq;
+	end_seq = TCP_SKB_CB(skb)->end_seq;
+	tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
+#endif
+
  	/* OK, its time to fill skb_shinfo(skb)->gso_segs */
  	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
  
@@ -1020,10 +1037,17 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  
  	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
  
+	if (likely(!err)) {
+		TCP_ESTATS_UPDATE(tp, tcp_estats_update_segsend(sk, pcount,
+								seq, end_seq,
+								tcp_flags));
+	}
+
  	if (likely(err <= 0))
  		return err;
  
  	tcp_enter_cwr(sk);
+	TCP_ESTATS_VAR_INC(tp, stack_table, SendStall);
  
  	return net_xmit_eval(err);
  }
@@ -1398,6 +1422,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
  	if (icsk->icsk_mtup.enabled)
  		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
  	tp->mss_cache = mss_now;
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_mss(tp));
  
  	return mss_now;
  }
@@ -1670,11 +1695,13 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb,
  	tcp_init_tso_segs(sk, skb, cur_mss);
  
  	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
-		return 0;
+		return -TCP_ESTATS_SNDLIM_SENDER;
  
  	cwnd_quota = tcp_cwnd_test(tp, skb);
-	if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss))
-		cwnd_quota = 0;
+	if (!cwnd_quota)
+		return -TCP_ESTATS_SNDLIM_CWND;
+	if (!tcp_snd_wnd_test(tp, skb, cur_mss))
+		return -TCP_ESTATS_SNDLIM_RWIN;
  
  	return cwnd_quota;
  }
@@ -1688,7 +1715,7 @@ bool tcp_may_send_now(struct sock *sk)
  	return skb &&
  		tcp_snd_test(sk, skb, tcp_current_mss(sk),
  			     (tcp_skb_is_last(sk, skb) ?
-			      tp->nonagle : TCP_NAGLE_PUSH));
+			      tp->nonagle : TCP_NAGLE_PUSH)) > 0;
  }
  
  /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1978,6 +2005,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  	unsigned int tso_segs, sent_pkts;
  	int cwnd_quota;
  	int result;
+	int why = TCP_ESTATS_SNDLIM_SENDER;
  	bool is_cwnd_limited = false;
  	u32 max_segs;
  
@@ -2008,6 +2036,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  
  		cwnd_quota = tcp_cwnd_test(tp, skb);
  		if (!cwnd_quota) {
+			why = TCP_ESTATS_SNDLIM_CWND;
  			is_cwnd_limited = true;
  			if (push_one == 2)
  				/* Force out a loss probe pkt. */
@@ -2016,19 +2045,24 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  				break;
  		}
  
-		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+			why = TCP_ESTATS_SNDLIM_RWIN;
  			break;
-
+		}
+		
  		if (tso_segs == 1) {
  			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
  						     (tcp_skb_is_last(sk, skb) ?
  						      nonagle : TCP_NAGLE_PUSH))))
+				/* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
  				break;
  		} else {
  			if (!push_one &&
  			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
-						 max_segs))
+						 max_segs)) {
+				why = TCP_ESTATS_SNDLIM_TSODEFER;
  				break;
+			}
  		}
  
  		limit = mss_now;
@@ -2041,6 +2075,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  
  		if (skb->len > limit &&
  		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			/* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
  			break;
  
  		/* TCP Small Queues :
@@ -2064,10 +2099,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  			 */
  			smp_mb__after_atomic();
  			if (atomic_read(&sk->sk_wmem_alloc) > limit)
+				/* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
  				break;
  		}
  
  		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+			/* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
  			break;
  
  repair:
@@ -2080,9 +2117,12 @@ repair:
  		sent_pkts += tcp_skb_pcount(skb);
  
  		if (push_one)
+			/* set above: why = TCP_ESTATS_SNDLIM_SENDER; */
  			break;
  	}
  
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_sndlim(tp, why));
+
  	if (likely(sent_pkts)) {
  		if (tcp_in_cwnd_reduction(sk))
  			tp->prr_out += sent_pkts;
@@ -3148,11 +3188,16 @@ int tcp_connect(struct sock *sk)
  	 */
  	tp->snd_nxt = tp->write_seq;
  	tp->pushed_seq = tp->write_seq;
-	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
  
  	/* Timer for repeating the SYN until an answer. */
  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
  				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+
+	TCP_ESTATS_VAR_SET(tp, stack_table, SndInitial, tp->write_seq);
+	TCP_ESTATS_VAR_SET(tp, app_table, SndMax, tp->write_seq);
+	TCP_ESTATS_UPDATE(tp, tcp_estats_update_snd_nxt(tp));
+	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
+
  	return 0;
  }
  EXPORT_SYMBOL(tcp_connect);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 1829c7f..0f6f1f4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -477,6 +477,9 @@ out_reset_timer:
  		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
  	}
  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+
+        TCP_ESTATS_UPDATE(tp, tcp_estats_update_timeout(sk));
+
  	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
  		__sk_dst_reset(sk);
  
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5ff8780..db1f88f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1131,6 +1131,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
  	if (newsk == NULL)
  		goto out_nonewsk;
  
+	tcp_estats_create(newsk, TCP_ESTATS_ADDRTYPE_IPV6, TCP_ESTATS_INACTIVE);
+
  	/*
  	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
  	 * count here, tcp_create_openreq_child now does this for us, see the
@@ -1463,6 +1465,8 @@ process:
  	skb->dev = NULL;
  
  	bh_lock_sock_nested(sk);
+	TCP_ESTATS_UPDATE(
+		tcp_sk(sk), tcp_estats_update_segrecv(tcp_sk(sk), skb));
  	ret = 0;
  	if (!sock_owned_by_user(sk)) {
  		if (!tcp_prequeue(sk, skb))
@@ -1473,6 +1477,8 @@ process:
  		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
  		goto discard_and_relse;
  	}
+	TCP_ESTATS_UPDATE(
+		tcp_sk(sk), tcp_estats_update_finish_segrecv(tcp_sk(sk)));
  	bh_unlock_sock(sk);
  
  	sock_put(sk);
@@ -1661,6 +1667,7 @@ static int tcp_v6_init_sock(struct sock *sk)
  #ifdef CONFIG_TCP_MD5SIG
  	tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
  #endif
+	tcp_estats_create(sk, TCP_ESTATS_ADDRTYPE_IPV6, TCP_ESTATS_ACTIVE);
  
  	return 0;
  }
-- 
1.9.3