Message-ID: <549070D3.5050808@psc.edu>
Date: Tue, 16 Dec 2014 12:50:11 -0500
From: rapier <rapier@....edu>
To: netdev <netdev@...r.kernel.org>
Subject: [PATCH net-next 2/3] Implementation of RFC 4898 Extended TCP Statistics
(Web10G)
This set of patches provides the control and management routines for the
kernel instrument set (KIS). It can be applied against net-next
independently of the KIS; the kernel can be patched, compiled, and run
with this set alone, but without the KIS implementation it provides no
real functionality.
We have split the series this way because the development team is
primarily focused on getting the KIS taken up by the community.
Alternative control and management methods can be developed and
implemented as long as the KIS is in the kernel.
So that this patch set compiles on its own, we have included two files
that were previously introduced in the KIS implementation:
include/net/tcp_estats.h and include/linux/tcp.h. If patching against a
source tree that already includes the KIS implementation, only
net/ipv4/[tcp_estats.c, sysctl_net_ipv4.c, Kconfig, Makefile] are
required.
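For illustration only (this sketch is not part of the patch, and
example_estats_hooks() is a hypothetical call site - the real hooks are
added by the KIS patches), instrumented TCP code uses the macros and
update helpers along these lines:

	#include <net/tcp.h>
	#include <net/tcp_estats.h>

	/* Hypothetical hook; the KIS patches add the real call sites. */
	static void example_estats_hooks(struct sock *sk)
	{
		struct tcp_sock *tp = tcp_sk(sk);

		/* Each of these compiles down to a patched-out branch
		 * unless the tcp_estats_enabled static key is on and
		 * the table was allocated for this connection. */
		TCP_ESTATS_VAR_INC(tp, perf_table, SegsOut);
		TCP_ESTATS_UPDATE(tp, tcp_estats_update_rwin_rcvd(tp));
	}

Collection defaults to off; writing a bitmask of TCP_ESTATS_TABLEMASK_*
values to /proc/sys/net/ipv4/tcp_estats selects which tables are
allocated for new connections (e.g. 0x5f enables all of them).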
---
include/linux/tcp.h | 8 +
include/net/tcp_estats.h | 376 +++++++++++++++++++++++
net/ipv4/Kconfig | 25 ++
net/ipv4/Makefile | 1 +
net/ipv4/sysctl_net_ipv4.c | 14 +
net/ipv4/tcp_estats.c | 736 +++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 1160 insertions(+)
create mode 100644 include/net/tcp_estats.h
create mode 100644 net/ipv4/tcp_estats.c
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 67309ec..8758360 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -126,6 +126,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
return (struct tcp_request_sock *)req;
}
+#ifdef CONFIG_TCP_ESTATS
+struct tcp_estats;
+#endif
+
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
@@ -309,6 +313,10 @@ struct tcp_sock {
struct tcp_md5sig_info __rcu *md5sig_info;
#endif
+#ifdef CONFIG_TCP_ESTATS
+ struct tcp_estats *tcp_stats;
+#endif
+
/* TCP fastopen related information */
struct tcp_fastopen_request *fastopen_req;
/* fastopen_rsk points to request_sock that resulted in this big
diff --git a/include/net/tcp_estats.h b/include/net/tcp_estats.h
new file mode 100644
index 0000000..ff6000e
--- /dev/null
+++ b/include/net/tcp_estats.h
@@ -0,0 +1,376 @@
+/*
+ * include/net/tcp_estats.h
+ *
+ * Implementation of TCP Extended Statistics MIB (RFC 4898)
+ *
+ * Authors:
+ * John Estabrook <jsestabrook@...il.com>
+ * Andrew K. Adams <akadams@....edu>
+ * Kevin Hogan <kwabena@...gle.com>
+ * Dominic Hamon <dma@...ipysock.com>
+ * John Heffner <johnwheffner@...il.com>
+ *
+ * The Web10G project. See http://www.web10g.org
+ *
+ * Copyright © 2011, Pittsburgh Supercomputing Center (PSC).
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _TCP_ESTATS_H
+#define _TCP_ESTATS_H
+
+#include <net/sock.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/jump_label.h>
+#include <linux/spinlock.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+
+/* defines number of seconds that stats persist after connection ends */
+#define TCP_ESTATS_PERSIST_DELAY_SECS 5
+
+enum tcp_estats_sndlim_states {
+ TCP_ESTATS_SNDLIM_NONE = -1,
+ TCP_ESTATS_SNDLIM_SENDER,
+ TCP_ESTATS_SNDLIM_CWND,
+ TCP_ESTATS_SNDLIM_RWIN,
+ TCP_ESTATS_SNDLIM_STARTUP,
+ TCP_ESTATS_SNDLIM_TSODEFER,
+ TCP_ESTATS_SNDLIM_PACE,
+ TCP_ESTATS_SNDLIM_NSTATES /* Keep at end */
+};
+
+enum tcp_estats_addrtype {
+ TCP_ESTATS_ADDRTYPE_IPV4 = 1,
+ TCP_ESTATS_ADDRTYPE_IPV6 = 2
+};
+
+enum tcp_estats_softerror_reason {
+ TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW = 1,
+ TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW = 2,
+ TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW = 3,
+ TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW = 4,
+ TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW = 5,
+ TCP_ESTATS_SOFTERROR_ABOVE_TS_WINDOW = 6,
+ TCP_ESTATS_SOFTERROR_DATA_CHECKSUM = 7,
+ TCP_ESTATS_SOFTERROR_OTHER = 8,
+};
+
+#define TCP_ESTATS_INACTIVE 2
+#define TCP_ESTATS_ACTIVE 1
+
+#define TCP_ESTATS_TABLEMASK_INACTIVE 0x00
+#define TCP_ESTATS_TABLEMASK_ACTIVE 0x01
+#define TCP_ESTATS_TABLEMASK_PERF 0x02
+#define TCP_ESTATS_TABLEMASK_PATH 0x04
+#define TCP_ESTATS_TABLEMASK_STACK 0x08
+#define TCP_ESTATS_TABLEMASK_APP 0x10
+#define TCP_ESTATS_TABLEMASK_EXTRAS 0x40
+
+#ifdef CONFIG_TCP_ESTATS
+
+extern struct static_key tcp_estats_enabled;
+
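+/* Every instrumentation site funnels through this guard: the static
+ * key reduces the disabled case to a single patched branch, while the
+ * per-socket stats pointer and per-table pointer checks cover
+ * configurations where only some of the tables are enabled.
+ */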
+#define TCP_ESTATS_CHECK(tp, table, expr) \
+ do { \
+ if (static_key_false(&tcp_estats_enabled)) { \
+ if (likely((tp)->tcp_stats) && \
+ likely((tp)->tcp_stats->tables.table)) { \
+ (expr); \
+ } \
+ } \
+ } while (0)
+
+#define TCP_ESTATS_VAR_INC(tp, table, var) \
+ TCP_ESTATS_CHECK(tp, table, ++((tp)->tcp_stats->tables.table->var))
+#define TCP_ESTATS_VAR_DEC(tp, table, var) \
+ TCP_ESTATS_CHECK(tp, table, --((tp)->tcp_stats->tables.table->var))
+#define TCP_ESTATS_VAR_ADD(tp, table, var, val) \
+ TCP_ESTATS_CHECK(tp, table, \
+ ((tp)->tcp_stats->tables.table->var) += (val))
+#define TCP_ESTATS_VAR_SET(tp, table, var, val) \
+ TCP_ESTATS_CHECK(tp, table, \
+ ((tp)->tcp_stats->tables.table->var) = (val))
+#define TCP_ESTATS_UPDATE(tp, func) \
+ do { \
+ if (static_key_false(&tcp_estats_enabled)) { \
+ if (likely((tp)->tcp_stats)) { \
+ (func); \
+ } \
+ } \
+ } while (0)
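+
+/* For example, TCP_ESTATS_VAR_ADD(tp, perf_table, DataOctetsOut, len)
+ * expands to a guarded
+ * "((tp)->tcp_stats->tables.perf_table->DataOctetsOut) += (len)".
+ */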
+
+/*
+ * Variables that can be read and written directly.
+ *
+ * Contains all variables from RFC 4898. Commented fields are
+ * either not implemented (only StartTimeStamp
+ * remains unimplemented in this release) or have
+ * handlers and do not need struct storage.
+ */
+struct tcp_estats_connection_table {
+ u32 AddressType;
+ union { struct in_addr addr; struct in6_addr addr6; } LocalAddress;
+ union { struct in_addr addr; struct in6_addr addr6; } RemAddress;
+ u16 LocalPort;
+ u16 RemPort;
+};
+
+struct tcp_estats_perf_table {
+ u32 SegsOut;
+ u32 DataSegsOut;
+ u64 DataOctetsOut;
+ u32 SegsRetrans;
+ u32 OctetsRetrans;
+ u32 SegsIn;
+ u32 DataSegsIn;
+ u64 DataOctetsIn;
+ /* ElapsedSecs */
+ /* ElapsedMicroSecs */
+ /* StartTimeStamp */
+ /* CurMSS */
+ /* PipeSize */
+ u32 MaxPipeSize;
+ /* SmoothedRTT */
+ /* CurRTO */
+ u32 CongSignals;
+ /* CurCwnd */
+ /* CurSsthresh */
+ u32 Timeouts;
+ /* CurRwinSent */
+ u32 MaxRwinSent;
+ u32 ZeroRwinSent;
+ /* CurRwinRcvd */
+ u32 MaxRwinRcvd;
+ u32 ZeroRwinRcvd;
+ /* SndLimTransRwin */
+ /* SndLimTransCwnd */
+ /* SndLimTransSnd */
+ /* SndLimTimeRwin */
+ /* SndLimTimeCwnd */
+ /* SndLimTimeSnd */
+ u32 snd_lim_trans[TCP_ESTATS_SNDLIM_NSTATES];
+ u32 snd_lim_time[TCP_ESTATS_SNDLIM_NSTATES];
+};
+
+struct tcp_estats_path_table {
+ /* RetranThresh */
+ u32 NonRecovDAEpisodes;
+ u32 SumOctetsReordered;
+ u32 NonRecovDA;
+ u32 SampleRTT;
+ /* RTTVar */
+ u32 MaxRTT;
+ u32 MinRTT;
+ u64 SumRTT;
+ u32 CountRTT;
+ u32 MaxRTO;
+ u32 MinRTO;
+ u8 IpTtl;
+ u8 IpTosIn;
+ /* IpTosOut */
+ u32 PreCongSumCwnd;
+ u32 PreCongSumRTT;
+ u32 PostCongSumRTT;
+ u32 PostCongCountRTT;
+ u32 ECNsignals;
+ u32 DupAckEpisodes;
+ /* RcvRTT */
+ u32 DupAcksOut;
+ u32 CERcvd;
+ u32 ECESent;
+};
+
+struct tcp_estats_stack_table {
+ u32 ActiveOpen;
+ /* MSSSent */
+ /* MSSRcvd */
+ /* WinScaleSent */
+ /* WinScaleRcvd */
+ /* TimeStamps */
+ /* ECN */
+ /* WillSendSACK */
+ /* WillUseSACK */
+ /* State */
+ /* Nagle */
+ u32 MaxSsCwnd;
+ u32 MaxCaCwnd;
+ u32 MaxSsthresh;
+ u32 MinSsthresh;
+ /* InRecovery */
+ u32 DupAcksIn;
+ u32 SpuriousFrDetected;
+ u32 SpuriousRtoDetected;
+ u32 SoftErrors;
+ u32 SoftErrorReason;
+ u32 SlowStart;
+ u32 CongAvoid;
+ u32 OtherReductions;
+ u32 CongOverCount;
+ u32 FastRetran;
+ u32 SubsequentTimeouts;
+ /* CurTimeoutCount */
+ u32 AbruptTimeouts;
+ u32 SACKsRcvd;
+ u32 SACKBlocksRcvd;
+ u32 SendStall;
+ u32 DSACKDups;
+ u32 MaxMSS;
+ u32 MinMSS;
+ u32 SndInitial;
+ u32 RecInitial;
+ /* CurRetxQueue */
+ /* MaxRetxQueue */
+ /* CurReasmQueue */
+ u32 MaxReasmQueue;
+ u32 EarlyRetrans;
+ u32 EarlyRetransDelay;
+};
+
+struct tcp_estats_app_table {
+ /* SndUna */
+ /* SndNxt */
+ u32 SndMax;
+ u64 ThruOctetsAcked;
+ /* RcvNxt */
+ u64 ThruOctetsReceived;
+ /* CurAppWQueue */
+ u32 MaxAppWQueue;
+ /* CurAppRQueue */
+ u32 MaxAppRQueue;
+};
+
+/*
+ * Currently no backing store is needed for the tuning elements in
+ * Web10G; they are all read or written directly in other data
+ * structures (such as the socket).
+ */
+
+struct tcp_estats_extras_table {
+ /* OtherReductionsCV */
+ u32 OtherReductionsCM;
+ u32 Priority;
+};
+
+struct tcp_estats_tables {
+ struct tcp_estats_connection_table *connection_table;
+ struct tcp_estats_perf_table *perf_table;
+ struct tcp_estats_path_table *path_table;
+ struct tcp_estats_stack_table *stack_table;
+ struct tcp_estats_app_table *app_table;
+ struct tcp_estats_extras_table *extras_table;
+};
+
+struct tcp_estats {
+ int tcpe_cid; /* idr map id */
+
+ struct sock *sk;
+ kuid_t uid;
+ kgid_t gid;
+ int ids;
+
+ atomic_t users;
+
+ enum tcp_estats_sndlim_states limstate;
+ ktime_t limstate_ts;
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+ ktime_t start_ts;
+ ktime_t current_ts;
+#else
+ unsigned long start_ts;
+ unsigned long current_ts;
+#endif
+ struct timeval start_tv;
+
+ int queued;
+ struct work_struct create_notify;
+ struct work_struct establish_notify;
+ struct delayed_work destroy_notify;
+
+ struct tcp_estats_tables tables;
+
+ struct rcu_head rcu;
+};
+
+extern struct idr tcp_estats_idr;
+
+extern int tcp_estats_wq_enabled;
+extern struct workqueue_struct *tcp_estats_wq;
+extern void (*create_notify_func)(struct work_struct *work);
+extern void (*establish_notify_func)(struct work_struct *work);
+extern void (*destroy_notify_func)(struct work_struct *work);
+
+extern unsigned long persist_delay;
+extern spinlock_t tcp_estats_idr_lock;
+
+/* For the TCP code */
+extern int tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype t,
+ int active);
+extern void tcp_estats_destroy(struct sock *sk);
+extern void tcp_estats_establish(struct sock *sk);
+extern void tcp_estats_free(struct rcu_head *rcu);
+
+extern void tcp_estats_update_snd_nxt(struct tcp_sock *tp);
+extern void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack);
+extern void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample);
+extern void tcp_estats_update_timeout(struct sock *sk);
+extern void tcp_estats_update_mss(struct tcp_sock *tp);
+extern void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp);
+extern void tcp_estats_update_sndlim(struct tcp_sock *tp,
+ enum tcp_estats_sndlim_states why);
+extern void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq);
+extern void tcp_estats_update_rwin_sent(struct tcp_sock *tp);
+extern void tcp_estats_update_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_post_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_segsend(struct sock *sk, int pcount,
+ u32 seq, u32 end_seq, int flags);
+extern void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb);
+extern void tcp_estats_update_finish_segrecv(struct tcp_sock *tp);
+extern void tcp_estats_update_writeq(struct sock *sk);
+extern void tcp_estats_update_recvq(struct sock *sk);
+
+extern void tcp_estats_init(void);
+
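+/* Reference counting: tcp_estats_create() takes one reference (and a
+ * second one while notifications are queued on the workqueue); when
+ * the count drops to zero, tcp_estats_unuse() releases the socket
+ * hold and frees the stats via RCU.
+ */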
+static inline void tcp_estats_use(struct tcp_estats *stats)
+{
+ atomic_inc(&stats->users);
+}
+
+static inline int tcp_estats_use_if_valid(struct tcp_estats *stats)
+{
+ return atomic_inc_not_zero(&stats->users);
+}
+
+static inline void tcp_estats_unuse(struct tcp_estats *stats)
+{
+ if (atomic_dec_and_test(&stats->users)) {
+ sock_put(stats->sk);
+ stats->sk = NULL;
+ call_rcu(&stats->rcu, tcp_estats_free);
+ }
+}
+
+#else /* !CONFIG_TCP_ESTATS */
+
+#define tcp_estats_enabled (0)
+
+#define TCP_ESTATS_VAR_INC(tp, table, var) do {} while (0)
+#define TCP_ESTATS_VAR_DEC(tp, table, var) do {} while (0)
+#define TCP_ESTATS_VAR_ADD(tp, table, var, val) do {} while (0)
+#define TCP_ESTATS_VAR_SET(tp, table, var, val) do {} while (0)
+#define TCP_ESTATS_UPDATE(tp, func) do {} while (0)
+
+static inline void tcp_estats_init(void) { }
+static inline void tcp_estats_establish(struct sock *sk) { }
+static inline int tcp_estats_create(struct sock *sk,
+				    enum tcp_estats_addrtype t,
+				    int active) { return 0; }
+static inline void tcp_estats_destroy(struct sock *sk) { }
+
+#endif /* CONFIG_TCP_ESTATS */
+
+#endif /* _TCP_ESTATS_H */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index bd29016..c04ba8f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -680,3 +680,28 @@ config TCP_MD5SIG
on the Internet.
If unsure, say N.
+
+config TCP_ESTATS
+ bool "TCP: Extended TCP statistics (RFC4898) MIB"
+ ---help---
+ RFC 4898 specifies a number of extended statistics for TCP. This
+ data can be accessed using netlink. See http://www.web10g.org for
+ more details.
+
+if TCP_ESTATS
+
+config TCP_ESTATS_STRICT_ELAPSEDTIME
+ bool "TCP: ESTATS strict ElapsedSecs/Msecs counters"
+ depends on TCP_ESTATS
+ default n
+ ---help---
+	  Elapsed time since the beginning of the connection.
+	  RFC 4898 defines ElapsedSecs/Msecs as being updated via ktime_get
+	  at each protocol event (sending or receiving of a segment);
+	  as this can be a performance hit, leaving this config option off
+	  derives the elapsed time from the jiffies counter instead.
+	  Set to Y for strict conformance with the MIB.
+
+ If unsure, say N.
+
+endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 518c04e..7e2c69a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_TCP_ESTATS) += tcp_estats.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0ee384..edc5a66 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -42,6 +42,11 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+/* Extended statistics (RFC 4898): a bitmask of TCP_ESTATS_TABLEMASK_*
+ * values selecting which per-connection tables to collect; zero
+ * disables collection. */
+#ifdef CONFIG_TCP_ESTATS
+int sysctl_tcp_estats __read_mostly;
+#endif /* CONFIG_TCP_ESTATS */
+
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
@@ -767,6 +772,15 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one
},
+#ifdef CONFIG_TCP_ESTATS
+ {
+ .procname = "tcp_estats",
+ .data = &sysctl_tcp_estats,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif /* CONFIG_TCP_ESTATS */
{ }
};
diff --git a/net/ipv4/tcp_estats.c b/net/ipv4/tcp_estats.c
new file mode 100644
index 0000000..e817540
--- /dev/null
+++ b/net/ipv4/tcp_estats.c
@@ -0,0 +1,736 @@
+/*
+ * net/ipv4/tcp_estats.c
+ *
+ * Implementation of TCP ESTATS MIB (RFC 4898)
+ *
+ * Authors:
+ * John Estabrook <jsestabrook@...il.com>
+ * Andrew K. Adams <akadams@....edu>
+ * Kevin Hogan <kwabena@...gle.com>
+ * Dominic Hamon <dma@...ipysock.com>
+ * John Heffner <johnwheffner@...il.com>
+ *
+ * The Web10G project. See http://www.web10g.org
+ *
+ * Copyright © 2011, Pittsburgh Supercomputing Center (PSC).
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/export.h>
+#ifndef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+#include <linux/jiffies.h>
+#endif
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <net/tcp_estats.h>
+#include <net/tcp.h>
+#include <linux/atomic.h>
+#include <asm/byteorder.h>
+
+#define ESTATS_INF32 0xffffffff
+
+#define ESTATS_MAX_CID 5000000
+
+extern int sysctl_tcp_estats;
+
+struct idr tcp_estats_idr;
+EXPORT_SYMBOL(tcp_estats_idr);
+static int next_id = 1;
+DEFINE_SPINLOCK(tcp_estats_idr_lock);
+EXPORT_SYMBOL(tcp_estats_idr_lock);
+
+int tcp_estats_wq_enabled __read_mostly = 0;
+EXPORT_SYMBOL(tcp_estats_wq_enabled);
+struct workqueue_struct *tcp_estats_wq = NULL;
+EXPORT_SYMBOL(tcp_estats_wq);
+void (*create_notify_func)(struct work_struct *work);
+EXPORT_SYMBOL(create_notify_func);
+void (*establish_notify_func)(struct work_struct *work);
+EXPORT_SYMBOL(establish_notify_func);
+void (*destroy_notify_func)(struct work_struct *work);
+EXPORT_SYMBOL(destroy_notify_func);
+unsigned long persist_delay = 0;
+EXPORT_SYMBOL(persist_delay);
+
+struct static_key tcp_estats_enabled __read_mostly = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(tcp_estats_enabled);
+
+/* If HAVE_JUMP_LABEL is defined, static_key_slow_inc()/dec() take a
+ * mutex internally and hence cannot be called while in_interrupt().
+ * If HAVE_JUMP_LABEL is NOT defined, no mutex is used, so there is no
+ * need to defer the enable/disable. */
+#ifdef HAVE_JUMP_LABEL
+static atomic_t tcp_estats_enabled_deferred;
+
+static void tcp_estats_handle_deferred_enable_disable(void)
+{
+ int count = atomic_xchg(&tcp_estats_enabled_deferred, 0);
+
+ while (count > 0) {
+ static_key_slow_inc(&tcp_estats_enabled);
+ --count;
+ }
+
+ while (count < 0) {
+ static_key_slow_dec(&tcp_estats_enabled);
+ ++count;
+ }
+}
+#endif
+
+static inline void tcp_estats_enable(void)
+{
+#ifdef HAVE_JUMP_LABEL
+ if (in_interrupt()) {
+ atomic_inc(&tcp_estats_enabled_deferred);
+ return;
+ }
+ tcp_estats_handle_deferred_enable_disable();
+#endif
+ static_key_slow_inc(&tcp_estats_enabled);
+}
+
+static inline void tcp_estats_disable(void)
+{
+#ifdef HAVE_JUMP_LABEL
+ if (in_interrupt()) {
+ atomic_dec(&tcp_estats_enabled_deferred);
+ return;
+ }
+ tcp_estats_handle_deferred_enable_disable();
+#endif
+ static_key_slow_dec(&tcp_estats_enabled);
+}
+
+/* Calculates the required amount of memory for any enabled tables. */
+int tcp_estats_get_allocation_size(int sysctl)
+{
+ int size = sizeof(struct tcp_estats) +
+ sizeof(struct tcp_estats_connection_table);
+
+ if (sysctl & TCP_ESTATS_TABLEMASK_PERF)
+ size += sizeof(struct tcp_estats_perf_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_PATH)
+ size += sizeof(struct tcp_estats_path_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_STACK)
+ size += sizeof(struct tcp_estats_stack_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_APP)
+ size += sizeof(struct tcp_estats_app_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_EXTRAS)
+ size += sizeof(struct tcp_estats_extras_table);
+ return size;
+}
+
+/* Called whenever a TCP/IPv4 sock is created.
+ * net/ipv4/tcp_ipv4.c: tcp_v4_syn_recv_sock,
+ * tcp_v4_init_sock
+ * Allocates a stats structure and initializes values.
+ */
+int tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype addrtype,
+ int active)
+{
+ struct tcp_estats *stats;
+ struct tcp_estats_tables *tables;
+ struct tcp_sock *tp = tcp_sk(sk);
+ void *estats_mem;
+ int sysctl;
+
+	/* Read the sysctl once, before sizing and carving the tables,
+	 * so that a concurrent sysctl change cannot skew the layout. */
+	sysctl = ACCESS_ONCE(sysctl_tcp_estats);
+	if (likely(sysctl == TCP_ESTATS_TABLEMASK_INACTIVE))
+		return 0;
+
+ estats_mem = kzalloc(tcp_estats_get_allocation_size(sysctl), gfp_any());
+ if (!estats_mem)
+ return -ENOMEM;
+
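+	/* Carve the single allocation into the stats structure followed
+	 * by each enabled table, mirroring the sizing logic in
+	 * tcp_estats_get_allocation_size(). */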
+ stats = estats_mem;
+ estats_mem += sizeof(struct tcp_estats);
+
+ tables = &stats->tables;
+
+ tables->connection_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_connection_table);
+
+ if (sysctl & TCP_ESTATS_TABLEMASK_PERF) {
+ tables->perf_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_perf_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_PATH) {
+ tables->path_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_path_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_STACK) {
+ tables->stack_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_stack_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_APP) {
+ tables->app_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_app_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_EXTRAS) {
+ tables->extras_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_extras_table);
+ }
+
+ stats->tcpe_cid = -1;
+ stats->queued = 0;
+
+ tables->connection_table->AddressType = addrtype;
+
+ sock_hold(sk);
+ stats->sk = sk;
+ atomic_set(&stats->users, 0);
+
+ stats->limstate = TCP_ESTATS_SNDLIM_STARTUP;
+ stats->limstate_ts = ktime_get();
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+ stats->start_ts = stats->current_ts = stats->limstate_ts;
+#else
+ stats->start_ts = stats->current_ts = jiffies;
+#endif
+ do_gettimeofday(&stats->start_tv);
+
+	/* Order matters: tp->tcp_stats must be hooked up and the
+	 * tcp_estats_enabled static key switched on before the
+	 * TCP_ESTATS_VAR_* macros below can take effect. */
+ tp->tcp_stats = stats;
+ tcp_estats_enable();
+
+ TCP_ESTATS_VAR_SET(tp, stack_table, ActiveOpen, active);
+
+ TCP_ESTATS_VAR_SET(tp, app_table, SndMax, tp->snd_nxt);
+ TCP_ESTATS_VAR_SET(tp, stack_table, SndInitial, tp->snd_nxt);
+
+ TCP_ESTATS_VAR_SET(tp, path_table, MinRTT, ESTATS_INF32);
+ TCP_ESTATS_VAR_SET(tp, path_table, MinRTO, ESTATS_INF32);
+ TCP_ESTATS_VAR_SET(tp, stack_table, MinMSS, ESTATS_INF32);
+ TCP_ESTATS_VAR_SET(tp, stack_table, MinSsthresh, ESTATS_INF32);
+
+ tcp_estats_use(stats);
+
+ if (tcp_estats_wq_enabled) {
+ tcp_estats_use(stats);
+ stats->queued = 1;
+ stats->tcpe_cid = 0;
+ INIT_WORK(&stats->create_notify, create_notify_func);
+		queue_work(tcp_estats_wq, &stats->create_notify);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_estats_create);
+
+void tcp_estats_destroy(struct sock *sk)
+{
+ struct tcp_estats *stats = tcp_sk(sk)->tcp_stats;
+
+ if (stats == NULL)
+ return;
+
+ /* Attribute final sndlim time. */
+ tcp_estats_update_sndlim(tcp_sk(stats->sk), stats->limstate);
+
+ if (tcp_estats_wq_enabled && stats->queued) {
+ INIT_DELAYED_WORK(&stats->destroy_notify,
+ destroy_notify_func);
+ queue_delayed_work(tcp_estats_wq, &stats->destroy_notify,
+ persist_delay);
+ }
+ tcp_estats_unuse(stats);
+}
+
+/* Do not call directly. Called from tcp_estats_unuse() through call_rcu. */
+void tcp_estats_free(struct rcu_head *rcu)
+{
+ struct tcp_estats *stats = container_of(rcu, struct tcp_estats, rcu);
+ tcp_estats_disable();
+ kfree(stats);
+}
+EXPORT_SYMBOL(tcp_estats_free);
+
+/* Called when a connection enters the ESTABLISHED state, and has all its
+ * state initialized.
+ * net/ipv4/tcp_input.c: tcp_rcv_state_process,
+ * tcp_rcv_synsent_state_process
+ * Here we link the statistics structure in so it is visible in the /proc
+ * fs, and do some final init.
+ */
+void tcp_estats_establish(struct sock *sk)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_connection_table *conn_table;
+
+ if (stats == NULL)
+ return;
+
+ conn_table = stats->tables.connection_table;
+
+ /* Let's set these here, since they can't change once the
+ * connection is established.
+ */
+ conn_table->LocalPort = inet->inet_num;
+ conn_table->RemPort = ntohs(inet->inet_dport);
+
+ if (conn_table->AddressType == TCP_ESTATS_ADDRTYPE_IPV4) {
+ memcpy(&conn_table->LocalAddress.addr, &inet->inet_rcv_saddr,
+ sizeof(struct in_addr));
+ memcpy(&conn_table->RemAddress.addr, &inet->inet_daddr,
+ sizeof(struct in_addr));
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (conn_table->AddressType == TCP_ESTATS_ADDRTYPE_IPV6) {
+ memcpy(&conn_table->LocalAddress.addr6, &(sk)->sk_v6_rcv_saddr,
+ sizeof(struct in6_addr));
+ /* ipv6 daddr now uses a different struct than saddr */
+ memcpy(&conn_table->RemAddress.addr6, &(sk)->sk_v6_daddr,
+ sizeof(struct in6_addr));
+ }
+#endif
+ else {
+ pr_err("TCP ESTATS: AddressType not valid.\n");
+ }
+
+ tcp_estats_update_finish_segrecv(tp);
+ tcp_estats_update_rwin_rcvd(tp);
+ tcp_estats_update_rwin_sent(tp);
+
+ TCP_ESTATS_VAR_SET(tp, stack_table, RecInitial, tp->rcv_nxt);
+
+ tcp_estats_update_sndlim(tp, TCP_ESTATS_SNDLIM_SENDER);
+
+ if (tcp_estats_wq_enabled && stats->queued) {
+ INIT_WORK(&stats->establish_notify, establish_notify_func);
+ queue_work(tcp_estats_wq, &stats->establish_notify);
+ }
+}
+
+/*
+ * Statistics update functions
+ */
+
+void tcp_estats_update_snd_nxt(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+
+ if (stats->tables.app_table) {
+ if (after(tp->snd_nxt, stats->tables.app_table->SndMax))
+ stats->tables.app_table->SndMax = tp->snd_nxt;
+ }
+}
+
+void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+
+ if (stats->tables.app_table)
+ stats->tables.app_table->ThruOctetsAcked += ack - tp->snd_una;
+}
+
+void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample)
+{
+ struct tcp_estats *stats = tcp_sk(sk)->tcp_stats;
+ struct tcp_estats_path_table *path_table = stats->tables.path_table;
+ unsigned long rtt_sample_msec = rtt_sample/1000;
+ u32 rto;
+
+ if (path_table == NULL)
+ return;
+
+ path_table->SampleRTT = rtt_sample_msec;
+
+ if (rtt_sample_msec > path_table->MaxRTT)
+ path_table->MaxRTT = rtt_sample_msec;
+ if (rtt_sample_msec < path_table->MinRTT)
+ path_table->MinRTT = rtt_sample_msec;
+
+ path_table->CountRTT++;
+ path_table->SumRTT += rtt_sample_msec;
+
+ rto = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
+ if (rto > path_table->MaxRTO)
+ path_table->MaxRTO = rto;
+ if (rto < path_table->MinRTO)
+ path_table->MinRTO = rto;
+}
+
+void tcp_estats_update_timeout(struct sock *sk)
+{
+ if (inet_csk(sk)->icsk_backoff)
+ TCP_ESTATS_VAR_INC(tcp_sk(sk), stack_table, SubsequentTimeouts);
+ else
+ TCP_ESTATS_VAR_INC(tcp_sk(sk), perf_table, Timeouts);
+
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open)
+ TCP_ESTATS_VAR_INC(tcp_sk(sk), stack_table, AbruptTimeouts);
+}
+
+void tcp_estats_update_mss(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_stack_table *stack_table = stats->tables.stack_table;
+ int mss = tp->mss_cache;
+
+ if (stack_table == NULL)
+ return;
+
+ if (mss > stack_table->MaxMSS)
+ stack_table->MaxMSS = mss;
+ if (mss < stack_table->MinMSS)
+ stack_table->MinMSS = mss;
+}
+
+void tcp_estats_update_finish_segrecv(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_tables *tables = &stats->tables;
+ struct tcp_estats_perf_table *perf_table = tables->perf_table;
+ struct tcp_estats_stack_table *stack_table = tables->stack_table;
+ u32 mss = tp->mss_cache;
+ u32 cwnd;
+ u32 ssthresh;
+ u32 pipe_size;
+
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+ stats->current_ts = ktime_get();
+#else
+ stats->current_ts = jiffies;
+#endif
+
+ if (stack_table != NULL) {
+ cwnd = tp->snd_cwnd * mss;
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (cwnd > stack_table->MaxSsCwnd)
+ stack_table->MaxSsCwnd = cwnd;
+ } else if (cwnd > stack_table->MaxCaCwnd) {
+ stack_table->MaxCaCwnd = cwnd;
+ }
+ }
+
+ if (perf_table != NULL) {
+ pipe_size = tcp_packets_in_flight(tp) * mss;
+ if (pipe_size > perf_table->MaxPipeSize)
+ perf_table->MaxPipeSize = pipe_size;
+ }
+
+	/* Discard the initial ssthresh, which is set at infinity. */
+	if (tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH)
+		return;
+
+ if (stack_table != NULL) {
+ ssthresh = tp->snd_ssthresh * tp->mss_cache;
+ if (ssthresh > stack_table->MaxSsthresh)
+ stack_table->MaxSsthresh = ssthresh;
+ if (ssthresh < stack_table->MinSsthresh)
+ stack_table->MinSsthresh = ssthresh;
+ }
+}
+EXPORT_SYMBOL(tcp_estats_update_finish_segrecv);
+
+void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
+ u32 win = tp->snd_wnd;
+
+ if (perf_table == NULL)
+ return;
+
+ if (win > perf_table->MaxRwinRcvd)
+ perf_table->MaxRwinRcvd = win;
+ if (win == 0)
+ perf_table->ZeroRwinRcvd++;
+}
+
+void tcp_estats_update_rwin_sent(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
+ u32 win = tp->rcv_wnd;
+
+ if (perf_table == NULL)
+ return;
+
+ if (win > perf_table->MaxRwinSent)
+ perf_table->MaxRwinSent = win;
+ if (win == 0)
+ perf_table->ZeroRwinSent++;
+}
+
+void tcp_estats_update_sndlim(struct tcp_sock *tp,
+ enum tcp_estats_sndlim_states state)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
+ ktime_t now;
+
+ if (state <= TCP_ESTATS_SNDLIM_NONE ||
+ state >= TCP_ESTATS_SNDLIM_NSTATES) {
+ pr_err("tcp_estats_update_sndlim: BUG: state out of range %d\n",
+ state);
+ return;
+ }
+
+ if (perf_table == NULL)
+ return;
+
+ now = ktime_get();
+ perf_table->snd_lim_time[stats->limstate]
+ += ktime_to_us(ktime_sub(now, stats->limstate_ts));
+ stats->limstate_ts = now;
+ if (stats->limstate != state) {
+ stats->limstate = state;
+ perf_table->snd_lim_trans[state]++;
+ }
+}
+
+void tcp_estats_update_congestion(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_path_table *path_table = stats->tables.path_table;
+
+ TCP_ESTATS_VAR_INC(tp, perf_table, CongSignals);
+
+ if (path_table != NULL) {
+ path_table->PreCongSumCwnd += tp->snd_cwnd * tp->mss_cache;
+ path_table->PreCongSumRTT += path_table->SampleRTT;
+ }
+}
+
+void tcp_estats_update_post_congestion(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_path_table *path_table = stats->tables.path_table;
+
+ if (path_table != NULL) {
+ path_table->PostCongCountRTT++;
+ path_table->PostCongSumRTT += path_table->SampleRTT;
+ }
+}
+
+void tcp_estats_update_segsend(struct sock *sk, int pcount,
+ u32 seq, u32 end_seq, int flags)
+{
+ struct tcp_estats *stats = tcp_sk(sk)->tcp_stats;
+ struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
+ struct tcp_estats_app_table *app_table = stats->tables.app_table;
+
+ int data_len = end_seq - seq;
+
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+ stats->current_ts = ktime_get();
+#else
+ stats->current_ts = jiffies;
+#endif
+
+ if (perf_table == NULL)
+ return;
+
+ /* We know we're sending a segment. */
+ perf_table->SegsOut += pcount;
+
+ /* A pure ACK contains no data; everything else is data. */
+ if (data_len > 0) {
+ perf_table->DataSegsOut += pcount;
+ perf_table->DataOctetsOut += data_len;
+ }
+
+ /* Check for retransmission. */
+ if (flags & TCPHDR_SYN) {
+ if (inet_csk(sk)->icsk_retransmits)
+ perf_table->SegsRetrans++;
+ } else if (app_table != NULL &&
+ before(seq, app_table->SndMax)) {
+ perf_table->SegsRetrans += pcount;
+ perf_table->OctetsRetrans += data_len;
+ }
+}
+
+void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb)
+{
+ struct tcp_estats_tables *tables = &tp->tcp_stats->tables;
+ struct tcp_estats_path_table *path_table = tables->path_table;
+ struct tcp_estats_perf_table *perf_table = tables->perf_table;
+ struct tcp_estats_stack_table *stack_table = tables->stack_table;
+ struct tcphdr *th = tcp_hdr(skb);
+ struct iphdr *iph = ip_hdr(skb);
+
+ if (perf_table != NULL)
+ perf_table->SegsIn++;
+
+ if (skb->len == th->doff * 4) {
+ if (stack_table != NULL &&
+ TCP_SKB_CB(skb)->ack_seq == tp->snd_una)
+ stack_table->DupAcksIn++;
+ } else {
+ if (perf_table != NULL) {
+ perf_table->DataSegsIn++;
+ perf_table->DataOctetsIn += skb->len - th->doff * 4;
+ }
+ }
+
+ if (path_table != NULL) {
+ path_table->IpTtl = iph->ttl;
+ path_table->IpTosIn = iph->tos;
+ }
+}
+EXPORT_SYMBOL(tcp_estats_update_segrecv);
+
+void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq)
+{
+	/* After much debate, it was decided that "seq - rcv_nxt" is
+	 * indeed what we want, as opposed to what Krishnan suggested
+	 * to better match the RFC: "seq - tp->rcv_wup".
+	 */
+ TCP_ESTATS_VAR_ADD(tp, app_table, ThruOctetsReceived,
+ seq - tp->rcv_nxt);
+}
+
+void tcp_estats_update_writeq(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_estats_app_table *app_table =
+ tp->tcp_stats->tables.app_table;
+ int len;
+
+ if (app_table == NULL)
+ return;
+
+ len = tp->write_seq - app_table->SndMax;
+
+ if (len > app_table->MaxAppWQueue)
+ app_table->MaxAppWQueue = len;
+}
+
+static inline u32 ofo_qlen(struct tcp_sock *tp)
+{
+ if (!skb_peek(&tp->out_of_order_queue))
+ return 0;
+ else
+ return TCP_SKB_CB(tp->out_of_order_queue.prev)->end_seq -
+ TCP_SKB_CB(tp->out_of_order_queue.next)->seq;
+}
+
+void tcp_estats_update_recvq(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_estats_tables *tables = &tp->tcp_stats->tables;
+ struct tcp_estats_app_table *app_table = tables->app_table;
+ struct tcp_estats_stack_table *stack_table = tables->stack_table;
+
+ if (app_table != NULL) {
+ u32 len = tp->rcv_nxt - tp->copied_seq;
+ if (app_table->MaxAppRQueue < len)
+ app_table->MaxAppRQueue = len;
+ }
+
+ if (stack_table != NULL) {
+ u32 len = ofo_qlen(tp);
+ if (stack_table->MaxReasmQueue < len)
+ stack_table->MaxReasmQueue = len;
+ }
+}
+
+/*
+ * Manage connection ID table
+ */
+
+static int get_new_cid(struct tcp_estats *stats)
+{
+ int id_cid;
+
+again:
+	idr_preload(GFP_KERNEL);
+	spin_lock_bh(&tcp_estats_idr_lock);
+	/* GFP_NOWAIT: we must not sleep under the spinlock, so the
+	 * allocation is backed by the idr_preload() above. */
+	id_cid = idr_alloc(&tcp_estats_idr, stats, next_id, 0, GFP_NOWAIT);
+	if (id_cid >= 0) {
+		next_id = (id_cid + 1) % ESTATS_MAX_CID;
+		stats->tcpe_cid = id_cid;
+	}
+	spin_unlock_bh(&tcp_estats_idr_lock);
+	idr_preload_end();
+	if (unlikely(id_cid == -ENOSPC))
+		goto again;
+	if (unlikely(id_cid == -ENOMEM))
+		return -ENOMEM;
+	return 0;
+}
+
+static void create_func(struct work_struct *work)
+{
+	/* Stub for netlink notification of new connections. */
+}
+
+static void establish_func(struct work_struct *work)
+{
+ struct tcp_estats *stats = container_of(work, struct tcp_estats,
+ establish_notify);
+ int err = 0;
+
+	if (stats->tcpe_cid > 0) {
+		pr_err("TCP estats container established multiple times.\n");
+		return;
+	}
+
+	if (stats->tcpe_cid == 0) {
+ err = get_new_cid(stats);
+ if (err)
+ pr_devel("get_new_cid error %d\n", err);
+ }
+}
+
+static void destroy_func(struct work_struct *work)
+{
+ struct tcp_estats *stats = container_of(work, struct tcp_estats,
+ destroy_notify.work);
+
+ int id_cid = stats->tcpe_cid;
+
+ if (id_cid == 0)
+ pr_devel("TCP estats destroyed before being established.\n");
+
+ if (id_cid >= 0) {
+ if (id_cid) {
+ spin_lock_bh(&tcp_estats_idr_lock);
+ idr_remove(&tcp_estats_idr, id_cid);
+ spin_unlock_bh(&tcp_estats_idr_lock);
+ }
+ stats->tcpe_cid = -1;
+
+ tcp_estats_unuse(stats);
+ }
+}
+
+void __init tcp_estats_init(void)
+{
+ idr_init(&tcp_estats_idr);
+
+ create_notify_func = &create_func;
+ establish_notify_func = &establish_func;
+ destroy_notify_func = &destroy_func;
+
+ persist_delay = TCP_ESTATS_PERSIST_DELAY_SECS * HZ;
+
+ tcp_estats_wq = alloc_workqueue("tcp_estats", WQ_MEM_RECLAIM, 256);
+ if (tcp_estats_wq == NULL) {
+ pr_err("tcp_estats_init(): alloc_workqueue failed\n");
+ goto cleanup_fail;
+ }
+
+ tcp_estats_wq_enabled = 1;
+ return;
+
+cleanup_fail:
+ pr_err("TCP ESTATS: initialization failed.\n");
+}
--
1.9.3