lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 11 Jul 2012 10:18:04 +0800
From:	Li Yu <raise.sail@...il.com>
To:	Linux Netdev List <netdev@...r.kernel.org>
Subject: [RFC][PATCH 4/4] skbtrace: four TCP/IP tracepoints tcp/icsk_connection,tcp_sendlim,tcp_congestion

From: Li Yu <bingtian.ly@...bao.com>

This implements four skbtrace traces for TCP.

(1) tcp_connection/icsk_connection trace basic state
    transitions of the TCP protocol, e.g. SYN_RECV ->
    ESTABLISHED.

(2) tcp_sendlimit traces what limits TCP sending,
    e.g. the congestion window prevents more segments
    from being sent.

(3) tcp_congestion traces TCP congestion events,
    e.g. Loss, FRTO, etc.

Thanks.

Signed-off-by: Li Yu <bingtian.ly@...bao.com>
---
 include/linux/skbtrace.h             |    3
 include/linux/skbtrace_api.h         |    1
 include/net/skbtrace_api_ipv4.h      |  124 ++++++++++++
 include/trace/events/skbtrace.h      |    1
 include/trace/events/skbtrace_ipv4.h |   49 ++++
 net/core/net-traces.c                |    4
 net/ipv4/Kconfig                     |    8
 net/ipv4/Makefile                    |    1
 net/ipv4/inet_connection_sock.c      |    2
 net/ipv4/inet_timewait_sock.c        |    3
 net/ipv4/skbtrace-ipv4.c             |  345
+++++++++++++++++++++++++++++++++++
 net/ipv4/tcp.c                       |    5
 net/ipv4/tcp_input.c                 |   12 +
 net/ipv4/tcp_ipv4.c                  |    4
 net/ipv4/tcp_minisocks.c             |    4
 net/ipv4/tcp_output.c                |   61 ++++--
 16 files changed, 610 insertions(+), 17 deletions(-)

diff --git a/include/linux/skbtrace.h b/include/linux/skbtrace.h
index 34b9144..b35d7b3 100644
--- a/include/linux/skbtrace.h
+++ b/include/linux/skbtrace.h
@@ -67,6 +67,9 @@ extern atomic64_t skbtrace_event_seq;
 struct skbtrace_context {
 	union {
 		struct skbtrace_block blk;
+		struct skbtrace_tcp_cong_blk tcp_cong;
+		struct skbtrace_tcp_conn_blk tcp_conn;
+		struct skbtrace_tcp_sendlim_blk tcp_sendlim;
 	};
 };

diff --git a/include/linux/skbtrace_api.h b/include/linux/skbtrace_api.h
index 7489856..281a868 100644
--- a/include/linux/skbtrace_api.h
+++ b/include/linux/skbtrace_api.h
@@ -68,5 +68,6 @@ struct skbtrace_block {
 } __packed;

 #include <net/skbtrace_api_common.h>
+#include <net/skbtrace_api_ipv4.h>

 #endif
diff --git a/include/net/skbtrace_api_ipv4.h
b/include/net/skbtrace_api_ipv4.h
new file mode 100644
index 0000000..a3e6462
--- /dev/null
+++ b/include/net/skbtrace_api_ipv4.h
@@ -0,0 +1,124 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ *	User/Kernel Interface
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+#ifndef _NET_SKBTRACE_API_IPV4_H
+#define _NET_SKBTRACE_API_IPV4_H
+
+#include <linux/types.h>
+
+#ifdef __KERNEL__
+#include <linux/in.h>
+#include <linux/in6.h>
+#endif
+
+/********************* TCP section *********************/
+
+/* skbtrace_block->action */
+enum {
+	skbtrace_action_tcp_min		= 101,
+	skbtrace_action_tcp_congestion	= 101,
+	skbtrace_action_tcp_connection	= 102,
+	skbtrace_action_tcp_sendlimit	= 103,
+	skbtrace_action_tcp_max		= 199,
+};
+
+/* TCP congestion event (101) */
+
+/* flags */
+enum {
+	skbtrace_tcp_cong_cwr		= 4,
+	skbtrace_tcp_cong_loss		= 5,
+	skbtrace_tcp_cong_fastrtx	= 6,
+	skbtrace_tcp_cong_frto		= 7,
+	skbtrace_tcp_cong_frto_loss	= 8,
+	skbtrace_tcp_cong_leave		= 9,
+};
+
+struct skbtrace_tcp_cong_blk {
+	struct skbtrace_block blk;
+	__u32	rcv_rtt;
+	__u32	rto;
+	__u32	cwnd;
+	__u32	sndnxt;
+	__u32	snduna;
+} __packed;
+
+/* TCP basic connection events (102) */
+struct skbtrace_tcp_conn_blk {
+	struct skbtrace_block blk;
+	union {
+		struct {
+			struct sockaddr local;
+			struct sockaddr peer;
+		};
+		struct {
+			struct sockaddr_in local;
+			struct sockaddr_in peer;
+		} inet;
+		struct {
+			struct sockaddr_in6 local;
+			struct sockaddr_in6 peer;
+		} inet6;
+	} addr;
+} __packed;
+
+/* TCP send limit event (103) */
+enum {
+	skbtrace_tcp_sndlim_cwnd	= 4,
+	skbtrace_tcp_sndlim_swnd	= 5,
+	skbtrace_tcp_sndlim_nagle	= 6,
+	skbtrace_tcp_sndlim_tso		= 7,
+	skbtrace_tcp_sndlim_frag	= 8,	/* most likely ENOMEM errors */
+	skbtrace_tcp_sndlim_pushone	= 9,
+	skbtrace_tcp_sndlim_other	= 10,
+	skbtrace_tcp_sndlim_ok		= 11,
+};
+
+
+/* val member:
+ *    skbtrace_tcp_sndlim_other: the return value of tcp_transmit_skb()
+ *    skbtrace_tcp_sndlim_ok: total sent pkts
+ *    other cases: 1 if the send limit occurred during an MTU probe, otherwise 0
+ */
+struct skbtrace_tcp_sendlim_blk {
+	struct skbtrace_block blk;
+	__u32 val;
+	__u32 count;
+	struct timespec begin;
+	__u32	snd_ssthresh;
+	__u32	snd_cwnd;
+	__u32	snd_cwnd_cnt;
+	__u32	snd_wnd;
+} __packed;
+
+/********************* icsk section *********************/
+
+/* skbtrace_block->action */
+enum {
+	skbtrace_action_icsk_min	= 201,
+	skbtrace_action_icsk_connection	= 201,
+	skbtrace_action_icsk_max	= 299,
+};
+
+/* Use skbtrace_tcp_conn_blk */
+
+#endif
diff --git a/include/trace/events/skbtrace.h
b/include/trace/events/skbtrace.h
index bf8c2cb..91567bf 100644
--- a/include/trace/events/skbtrace.h
+++ b/include/trace/events/skbtrace.h
@@ -27,5 +27,6 @@
 #include <linux/tracepoint.h>

 #include <trace/events/skbtrace_common.h>
+#include <trace/events/skbtrace_ipv4.h>

 #endif
diff --git a/include/trace/events/skbtrace_ipv4.h
b/include/trace/events/skbtrace_ipv4.h
new file mode 100644
index 0000000..73a9fb0
--- /dev/null
+++ b/include/trace/events/skbtrace_ipv4.h
@@ -0,0 +1,49 @@
+ /*
+ *  skbtrace - sk_buff trace utility
+ *
+ *	The IPv4 related skbtrace events
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Thanks to the Web10G project, which some of this source code refers to.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+
+#if !defined(_TRACE_EVENTS_SKBTRACE_IPV4_H)
+#define _TRACE_EVENTS_SKBTRACE_IPV4_H
+
+#include <linux/tracepoint.h>
+
+struct sock;
+
+DECLARE_TRACE(icsk_connection,
+	TP_PROTO(struct sock *sk, __u32 state),
+	TP_ARGS(sk, state));
+
+DECLARE_TRACE(tcp_congestion,
+	TP_PROTO(struct sock *sk, int reason, int prior_state),
+	TP_ARGS(sk, reason, prior_state));
+
+DECLARE_TRACE(tcp_connection,
+	TP_PROTO(void *sk, __u32 state),
+	TP_ARGS(sk, state));
+
+DECLARE_TRACE(tcp_sendlimit,
+	TP_PROTO(struct sock *sk, int reason, int val),
+	TP_ARGS(sk, reason, val));
+
+#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index d86a58b..95ad083 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -45,5 +45,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
 	EXPORT_TRACEPOINT_SYMBOL_GPL(name);

 NEW_SKBTRACE_TP(skb_rps_info);
+NEW_SKBTRACE_TP(tcp_congestion);
+NEW_SKBTRACE_TP(tcp_connection);
+NEW_SKBTRACE_TP(icsk_connection);
+NEW_SKBTRACE_TP(tcp_sendlimit);

 #endif
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 20f1cb5..feb5e28 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -415,6 +415,14 @@ config INET_UDP_DIAG
 	  Support for UDP socket monitoring interface used by the ss tool.
 	  If unsure, say Y.

+config SKBTRACE_IPV4
+	tristate "TCP/IPv4 protocol suite support for skbtrace"
+	depends on SKBTRACE
+	default m
+	---help---
+	  Support for the IPv4 part of skbtrace, which only contains
+	  TCP/IPv4 specific events.
+
 menuconfig TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
 	---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ff75d3b..4b03aef 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-${CONFIG_SKBTRACE_IPV4} += skbtrace-ipv4.o

 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/inet_connection_sock.c
b/net/ipv4/inet_connection_sock.c
index 034ddbe..a69becb 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -15,6 +15,7 @@

 #include <linux/module.h>
 #include <linux/jhash.h>
+#include <trace/events/skbtrace_ipv4.h>

 #include <net/inet_connection_sock.h>
 #include <net/inet_hashtables.h>
@@ -702,6 +703,7 @@ int inet_csk_listen_start(struct sock *sk, const int
nr_table_entries)
 		sk_dst_reset(sk);
 		sk->sk_prot->hash(sk);

+		trace_icsk_connection(sk, TCP_LISTEN);
 		return 0;
 	}

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 2784db3..9363a6b 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -12,6 +12,8 @@
 #include <linux/kmemcheck.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/skbtrace.h>
+#include <trace/events/skbtrace_ipv4.h>
 #include <net/inet_hashtables.h>
 #include <net/inet_timewait_sock.h>
 #include <net/ip.h>
@@ -205,6 +207,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const
struct sock *sk, const int stat
 		atomic_set(&tw->tw_refcnt, 0);
 		inet_twsk_dead_node_init(tw);
 		__module_get(tw->tw_prot->owner);
+		trace_tcp_connection(tw, state + TCP_MAX_STATES);
 	}

 	return tw;
diff --git a/net/ipv4/skbtrace-ipv4.c b/net/ipv4/skbtrace-ipv4.c
new file mode 100644
index 0000000..ed486be
--- /dev/null
+++ b/net/ipv4/skbtrace-ipv4.c
@@ -0,0 +1,345 @@
+/*
+ *  skbtrace - sk_buff trace for TCP/IPv4 protocol suite support
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/relay.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/jhash.h>
+#include <linux/inet.h>
+
+#include <linux/skbtrace.h>
+#include <linux/tcp.h>
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/tcp.h>
+
+static void skbtrace_tcp_congestion(struct skbtrace_tracepoint *t,
+			struct sock *sk, int reason, int prior_state)
+SKBTRACE_SOCK_EVENT_BEGIN
+	struct skbtrace_context *ctx;
+	struct skbtrace_tcp_cong_blk blk, *b;
+	struct tcp_sock *tp;
+
+	if (skbtrace_tcp_cong_leave == reason &&
+			inet_csk(sk)->icsk_ca_state == TCP_CA_Open)
+		return;
+
+	local_bh_disable();
+	ctx = skbtrace_context_get(sk);
+	if (ctx) {
+		if (skbtrace_action_tcp_congestion != ctx->blk.action)
+			skbtrace_probe(&ctx->blk);
+		b = &ctx->tcp_cong;
+	} else
+		b = &blk;
+
+	tp = tcp_sk(sk);
+	INIT_SKBTRACE_BLOCK(&b->blk, tp,
+			skbtrace_action_tcp_congestion,
+			1 << reason,
+			sizeof(*b));
+	b->cwnd = tp->snd_cwnd * tp->mss_cache;
+	b->rcv_rtt = tp->rcv_rtt_est.rtt;
+	b->rto = inet_csk(sk)->icsk_rto;
+	b->snduna = tp->snd_una;
+	b->sndnxt = tp->snd_nxt;
+	skbtrace_probe(&b->blk);
+	local_bh_enable();
+SKBTRACE_SOCK_EVENT_END
+
+static void skbtrace_tcp_connection(struct skbtrace_tracepoint *t,
+							void *ptr, u32 state)
+{
+	struct sock *sk = ptr;
+	struct inet_timewait_sock *tw = inet_twsk(ptr);
+
+	switch (state) {
+	case TCP_TIME_WAIT + TCP_MAX_STATES:
+	case TCP_FIN_WAIT2 + TCP_MAX_STATES:
+		{
+			struct skbtrace_tcp_conn_blk blk;
+
+			state -= TCP_MAX_STATES;
+			INIT_SKBTRACE_BLOCK(&blk.blk, tw,
+				skbtrace_action_tcp_connection,
+				1 << (state + skbtrace_flags_reserved_max),
+				sizeof(blk));
+			blk.addr.inet.local.sin_family = AF_INET;
+			blk.addr.inet.local.sin_port = tw->tw_sport;
+			blk.addr.inet.local.sin_addr.s_addr = tw->tw_rcv_saddr;
+			blk.addr.inet.peer.sin_family = AF_INET;
+			blk.addr.inet.peer.sin_port = tw->tw_dport;
+			blk.addr.inet.peer.sin_addr.s_addr = tw->tw_daddr;
+			skbtrace_probe(&blk.blk);
+			break;
+		}
+	case TCP_ESTABLISHED:
+	case TCP_FIN_WAIT1:
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+	case TCP_LAST_ACK:
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:
+	case TCP_CLOSING:
+		{
+			struct skbtrace_context *ctx;
+			struct skbtrace_tcp_conn_blk blk, *b;
+
+			local_bh_disable();
+			b = &blk;
+			ctx = skbtrace_context_get(sk);
+			if (ctx) {
+				if (skbtrace_action_tcp_connection
+							!= ctx->blk.action)
+					skbtrace_probe(&ctx->blk);
+				b = &ctx->tcp_conn;
+			}
+			INIT_SKBTRACE_BLOCK(&b->blk, ptr,
+				skbtrace_action_tcp_connection,
+				1 << (state + skbtrace_flags_reserved_max),
+				sizeof(blk));
+			__inet_sock_getname(sk, &b->addr.local, NULL, 0);
+			if (TCP_LISTEN != state)
+				__inet_sock_getname(sk, &b->addr.peer, NULL, 1);
+			skbtrace_probe(&b->blk);
+			local_bh_enable();
+			break;
+		}
+	}
+}
+
+static void skbtrace_icsk_connection(struct skbtrace_tracepoint *t,
+						struct sock *sk, u32 state)
+SKBTRACE_SOCK_EVENT_BEGIN
+	struct skbtrace_context *ctx;
+	struct skbtrace_tcp_conn_blk blk, *b;
+
+	if (TCP_LISTEN != state)
+		return;
+
+	local_bh_disable();
+	ctx = skbtrace_context_get(sk);
+	if (ctx) {
+		if (skbtrace_action_icsk_connection != ctx->blk.action)
+			skbtrace_probe(&ctx->blk);
+		b = &ctx->tcp_conn;
+	} else
+		b = &blk;
+	INIT_SKBTRACE_BLOCK(&b->blk, sk,
+				skbtrace_action_icsk_connection,
+				1 << (state + skbtrace_flags_reserved_max),
+				sizeof(blk));
+	__inet_sock_getname(sk, &b->addr.local, NULL, 0);
+	skbtrace_probe(&b->blk);
+	local_bh_enable();
+SKBTRACE_SOCK_EVENT_END
+
+static const char * const skbtrace_tcp_sendlimit_options[] = {
+	"cwnd",
+	"swnd",
+	"nagle",
+	"tso",
+	"frag",
+	"pushone",
+	"other",
+	"ok",
+};
+
+static const int skbtrace_tcp_sendlimit_masks[] = {
+	skbtrace_tcp_sndlim_cwnd,
+	skbtrace_tcp_sndlim_swnd,
+	skbtrace_tcp_sndlim_nagle,
+	skbtrace_tcp_sndlim_tso,
+	skbtrace_tcp_sndlim_frag,
+	skbtrace_tcp_sndlim_pushone,
+	skbtrace_tcp_sndlim_other,
+	skbtrace_tcp_sndlim_ok,
+};
+
+static int skbtrace_tcp_sendlimit_setopt(struct skbtrace_tracepoint *t,
+						char *name, char *options)
+{
+	unsigned long mask = 0UL;
+	char *cur;
+	int ret = 0;
+
+	if (options) {
+		if (strncmp(options, "skip=", sizeof("skip=") - 1)) {
+			options = NULL;
+			ret = -EINVAL;
+		} else
+			options += sizeof("skip=") - 1;
+	}
+
+	if (!options || '\x0' == *options)
+		goto quit;
+
+	mask = 0UL;
+	cur = strsep(&options, ":");
+	while (cur) {
+		int i, nr_options;
+
+		nr_options = sizeof(skbtrace_tcp_sendlimit_masks)/sizeof(int);
+		for (i = 0; i < nr_options; i++) {
+			if (!strcmp(cur, skbtrace_tcp_sendlimit_options[i])) {
+				mask |= (1 << skbtrace_tcp_sendlimit_masks[i]);
+				break;
+			}
+		}
+		if (i >= nr_options) {
+			mask = 0UL;
+			ret = -EINVAL;
+		}
+		cur = strsep(&options, ":");
+	}
+
+quit:
+	t->private = (void *)(mask);
+	return ret;
+}
+
+static char *skbtrace_tcp_sendlimit_desc(struct skbtrace_tracepoint *t)
+{
+	char *desc;
+	unsigned long mask = (unsigned long)t->private;
+	int i, nr_options, copied;
+
+	desc = kmalloc(strlen(t->name) + 128, GFP_KERNEL);
+	if (!desc)
+		return NULL;
+
+	copied = sprintf(desc, "%s enabled:%d skip=", t->name, t->enabled);
+	nr_options = sizeof(skbtrace_tcp_sendlimit_masks)/sizeof(int);
+	for (i = 0; i < nr_options; i++) {
+		int this_n;
+		const char *this_p;
+
+		this_n = skbtrace_tcp_sendlimit_masks[i];
+		this_p = skbtrace_tcp_sendlimit_options[i];
+		if (t->enabled && (mask & (1 << this_n)))
+			copied += sprintf(desc + copied, "%s,", this_p);
+		else if (!t->enabled)
+			copied += sprintf(desc + copied, "%s,", this_p);
+	}
+
+	sprintf(desc + copied, "\n");
+	return desc;
+}
+
+static inline void tcp_sendlimit_block_setup(struct
skbtrace_tcp_sendlim_blk *b,
+					struct sock *sk, int reason, int val)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	INIT_SKBTRACE_BLOCK(&b->blk, tp,
+			skbtrace_action_tcp_sendlimit,
+			1 << reason,
+			sizeof(*b));
+
+	b->val = val;
+	b->count = 1;
+	b->begin = current_kernel_time();
+
+	b->snd_ssthresh = tp->snd_ssthresh;
+	b->snd_cwnd = tp->snd_cwnd;
+	b->snd_cwnd_cnt = tp->snd_cwnd_cnt;
+	b->snd_wnd = tp->snd_wnd;
+}
+
+static void skbtrace_tcp_sendlimit(struct skbtrace_tracepoint *t,
+		struct sock *sk, int reason, int val)
+SKBTRACE_SOCK_EVENT_BEGIN
+	struct skbtrace_context *ctx;
+	unsigned long mask = (unsigned long)t->private;
+
+	if (mask & (1<<reason))
+		return;
+
+	if (skbtrace_tcp_sndlim_ok == reason && !val)
+		return;
+
+	local_bh_disable();
+	ctx = skbtrace_context_get(sk);
+	if (unlikely(!ctx)) { /* no saved context, just fire up */
+		struct skbtrace_tcp_sendlim_blk blk;
+
+		tcp_sendlimit_block_setup(&blk, sk, reason, val);
+		skbtrace_probe(&blk.blk);
+		local_bh_enable();
+		return;
+	}
+
+	if (ctx->blk.action == skbtrace_action_tcp_sendlimit &&
+			(ctx->blk.flags & (1 << reason)) &&
+			ctx->tcp_sendlim.val == val &&
+			current_kernel_time().tv_sec == ctx->blk.ts.tv_sec) {
+		/* same event happens continuously */
+		++ctx->tcp_sendlim.count;
+		local_bh_enable();
+		return;
+	}
+
+	/* flush the saved event: the new one differs, or is the same but too late */
+	skbtrace_probe(&ctx->blk);
+
+	/* initialize new context */
+	tcp_sendlimit_block_setup(&ctx->tcp_sendlim, sk, reason, val);
+	local_bh_enable();
+SKBTRACE_SOCK_EVENT_END
+
+static struct skbtrace_tracepoint af_inet4[] = {
+	{
+		.name = "tcp_congestion",
+		.probe = skbtrace_tcp_congestion,
+	},
+	{
+		.name = "tcp_connection",
+		.probe = skbtrace_tcp_connection,
+	},
+	{
+		.name = "icsk_connection",
+		.probe = skbtrace_icsk_connection,
+	},
+	{
+		.name = "tcp_sendlimit",
+		.probe = skbtrace_tcp_sendlimit,
+		.setup_options = skbtrace_tcp_sendlimit_setopt,
+		.desc = skbtrace_tcp_sendlimit_desc,
+	},
+	EMPTY_SKBTRACE_TP
+};
+
+static int skbtrace_ipv4_init(void)
+{
+	return skbtrace_register_tracepoints(AF_INET, af_inet4);
+}
+
+static void skbtrace_ipv4_cleanup(void)
+{
+	skbtrace_unregister_tracepoints(AF_INET);
+}
+
+module_init(skbtrace_ipv4_init);
+module_exit(skbtrace_ipv4_cleanup);
+MODULE_ALIAS("skbtrace-af-" __stringify(AF_INET));
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3ba605f..d85c8d7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,9 @@
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>

+#include <linux/skbtrace.h>
+#include <trace/events/skbtrace_ipv4.h>
+
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;

 struct percpu_counter tcp_orphan_count;
@@ -1925,6 +1928,8 @@ void tcp_set_state(struct sock *sk, int state)
 			TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
 	}

+	trace_tcp_connection(sk, state);
+
 	/* Change state AFTER socket is unhashed to avoid closed
 	 * socket sitting in hash tables.
 	 */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ca0d0e7..8f8b5f5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -74,6 +74,8 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <net/netdma.h>
+#include <linux/skbtrace.h>
+#include <trace/events/skbtrace_ipv4.h>

 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -861,6 +863,7 @@ void tcp_enter_cwr(struct sock *sk, const int
set_ssthresh)

 		tcp_set_ca_state(sk, TCP_CA_CWR);
 	}
+	trace_tcp_congestion(sk, skbtrace_tcp_cong_cwr, 0);
 }

 /*
@@ -2151,6 +2154,8 @@ void tcp_enter_frto(struct sock *sk)
 	tcp_set_ca_state(sk, TCP_CA_Disorder);
 	tp->high_seq = tp->snd_nxt;
 	tp->frto_counter = 1;
+
+	trace_tcp_congestion(sk, skbtrace_tcp_cong_frto, 0);
 }

 /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
@@ -2218,6 +2223,8 @@ static void tcp_enter_frto_loss(struct sock *sk,
int allowed_segments, int flag)
 	TCP_ECN_queue_cwr(tp);

 	tcp_clear_all_retrans_hints(tp);
+
+	trace_tcp_congestion(sk, skbtrace_tcp_cong_frto_loss, 0);
 }

 static void tcp_clear_retrans_partial(struct tcp_sock *tp)
@@ -2247,6 +2254,8 @@ void tcp_enter_loss(struct sock *sk, int how)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;

+	trace_tcp_congestion(sk, skbtrace_tcp_cong_loss, 0);
+
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una ==
tp->high_seq ||
 	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
@@ -3217,6 +3226,7 @@ static void tcp_fastretrans_alert(struct sock *sk,
int pkts_acked,
 		/* Otherwise enter Recovery state */
 		tcp_enter_recovery(sk, (flag & FLAG_ECE));
 		fast_rexmit = 1;
+		trace_tcp_congestion(sk, skbtrace_tcp_cong_fastrtx, 0);
 	}

 	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
@@ -3770,6 +3780,7 @@ static int tcp_ack(struct sock *sk, const struct
sk_buff *skb, int flag)
 	u32 prior_fackets;
 	int prior_packets;
 	int prior_sacked = tp->sacked_out;
+	int prior_state = icsk->icsk_ca_state;
 	int pkts_acked = 0;
 	int newly_acked_sacked = 0;
 	bool frto_cwnd = false;
@@ -3864,6 +3875,7 @@ static int tcp_ack(struct sock *sk, const struct
sk_buff *skb, int flag)
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
 				      is_dupack, flag);
+		trace_tcp_congestion(sk, skbtrace_tcp_cong_leave, prior_state);
 	} else {
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
 			tcp_cong_avoid(sk, ack, prior_in_flight);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 64568fa..505e4fd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -85,6 +85,9 @@
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>

+#include <linux/skbtrace.h>
+#include <trace/events/skbtrace_ipv4.h>
+
 int sysctl_tcp_tw_reuse __read_mostly;
 int sysctl_tcp_low_latency __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_low_latency);
@@ -1528,6 +1531,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk,
struct sk_buff *skb,
 	if (__inet_inherit_port(sk, newsk) < 0)
 		goto put_and_exit;
 	__inet_hash_nolisten(newsk, NULL);
+	trace_tcp_connection(newsk, TCP_SYN_RECV);

 	return newsk;

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 72b7c63..0a8b4be 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -23,10 +23,13 @@
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/workqueue.h>
+#include <linux/skbtrace.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>

+#include <trace/events/skbtrace_ipv4.h>
+
 int sysctl_tcp_syncookies __read_mostly = 1;
 EXPORT_SYMBOL(sysctl_tcp_syncookies);

@@ -189,6 +192,7 @@ kill_with_rst:

 		/* FIN arrived, enter true time-wait state. */
 		tw->tw_substate	  = TCP_TIME_WAIT;
+		trace_tcp_connection(tw, TCP_TIME_WAIT + TCP_MAX_STATES);
 		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if (tmp_opt.saw_tstamp) {
 			tcptw->tw_ts_recent_stamp = get_seconds();
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e..a7c0488 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -42,6 +42,9 @@
 #include <linux/gfp.h>
 #include <linux/module.h>

+#include <linux/skbtrace.h>
+#include <trace/events/skbtrace_ipv4.h>
+
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse __read_mostly = 1;

@@ -1660,15 +1663,18 @@ static int tcp_mtu_probe(struct sock *sk)

 	if (tp->snd_wnd < size_needed)
 		return -1;
-	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
+	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) {
+		trace_tcp_sendlimit(sk, skbtrace_tcp_sndlim_swnd, 1);
 		return 0;
-
+	}
 	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
 	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
 		if (!tcp_packets_in_flight(tp))
 			return -1;
-		else
+		else {
+			trace_tcp_sendlimit(sk, skbtrace_tcp_sndlim_cwnd, 1);
 			return 0;
+		}
 	}

 	/* We're allowed to probe.  Build it now. */
@@ -1763,7 +1769,7 @@ static bool tcp_write_xmit(struct sock *sk,
unsigned int mss_now, int nonagle,
 	struct sk_buff *skb;
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
-	int result;
+	int retval, result, sndlim;

 	sent_pkts = 0;

@@ -1777,6 +1783,8 @@ static bool tcp_write_xmit(struct sock *sk,
unsigned int mss_now, int nonagle,
 		}
 	}

+	sndlim = skbtrace_tcp_sndlim_ok;
+	result = 0;
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;

@@ -1784,20 +1792,27 @@ static bool tcp_write_xmit(struct sock *sk,
unsigned int mss_now, int nonagle,
 		BUG_ON(!tso_segs);

 		cwnd_quota = tcp_cwnd_test(tp, skb);
-		if (!cwnd_quota)
+		if (!cwnd_quota) {
+			sndlim = skbtrace_tcp_sndlim_cwnd;
 			break;
+		}

-		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+			sndlim = skbtrace_tcp_sndlim_swnd;
 			break;
-
+		}
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
-						     (tcp_skb_is_last(sk, skb) ?
-						      nonagle : TCP_NAGLE_PUSH))))
+					     (tcp_skb_is_last(sk, skb) ?
+					      nonagle : TCP_NAGLE_PUSH)))) {
+				sndlim = skbtrace_tcp_sndlim_nagle;
 				break;
+			}
 		} else {
-			if (!push_one && tcp_tso_should_defer(sk, skb))
+			if (!push_one && tcp_tso_should_defer(sk, skb)) {
+				sndlim = skbtrace_tcp_sndlim_tso;
 				break;
+			}
 		}

 		limit = mss_now;
@@ -1806,14 +1821,18 @@ static bool tcp_write_xmit(struct sock *sk,
unsigned int mss_now, int nonagle,
 						    cwnd_quota);

 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) {
+			sndlim = skbtrace_tcp_sndlim_frag;
 			break;
+		}

 		TCP_SKB_CB(skb)->when = tcp_time_stamp;

-		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
+		result = tcp_transmit_skb(sk, skb, 1, gfp);
+		if (unlikely(result)) {
+			sndlim = skbtrace_tcp_sndlim_other;
 			break;
-
+		}
 		/* Advance the send_head.  This one is sent out.
 		 * This call will increment packets_out.
 		 */
@@ -1822,17 +1841,25 @@ static bool tcp_write_xmit(struct sock *sk,
unsigned int mss_now, int nonagle,
 		tcp_minshall_update(tp, mss_now, skb);
 		sent_pkts += tcp_skb_pcount(skb);

-		if (push_one)
+		if (push_one) {
+			sndlim = skbtrace_tcp_sndlim_pushone;
 			break;
+		}
 	}
 	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery)
 		tp->prr_out += sent_pkts;

 	if (likely(sent_pkts)) {
+		trace_tcp_sendlimit(sk, skbtrace_tcp_sndlim_ok, sent_pkts);
 		tcp_cwnd_validate(sk);
-		return false;
-	}
-	return !tp->packets_out && tcp_send_head(sk);
+		retval = false;
+	} else
+		retval = !tp->packets_out && tcp_send_head(sk);
+
+	if (skbtrace_tcp_sndlim_ok != sndlim)
+		trace_tcp_sendlimit(sk, sndlim, result);
+
+	return retval;
 }

 /* Push out any pending frames which were held back due to
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ