lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <5080F031.5040804@gmail.com>
Date:	Fri, 19 Oct 2012 14:16:17 +0800
From:	Li Yu <raise.sail@...il.com>
To:	Linux Netdev List <netdev@...r.kernel.org>
Subject: [PATCH 1/3] skbtrace v2: core feature and common events

From: Li Yu <bingtian.ly@...bao.com>

This patch contains:

1. The glue code of tracepoints subsystem and relay file system.
2. API for particular networking trace points.
3. The skb_rps_info trace point.

Thanks

Signed-off-by: Li Yu <bingtian.ly@...bao.com>

 include/linux/skbtrace.h               |  478 ++++++++++++
 include/linux/skbtrace_api.h           |   73 +
 include/linux/skbuff.h                 |    7
 include/net/skbtrace_api_common.h      |   84 ++
 include/net/sock.h                     |   14
 include/trace/events/skbtrace.h        |   32
 include/trace/events/skbtrace_common.h |   41 +
 kernel/trace/Kconfig                   |    8
 net/core/Makefile                      |    2
 net/core/dev.c                         |    3
 net/core/net-traces.c                  |   24
 net/core/skbtrace-core.c               | 1226 +++++++++++++++++++++++++++++++++
 net/core/skbtrace-events-common.c      |   68 +
 net/core/skbuff.c                      |    5
 net/core/sock.c                        |    9
 15 files changed, 2073 insertions(+), 1 deletion(-)

============================

diff --git a/include/linux/skbtrace.h b/include/linux/skbtrace.h
new file mode 100644
index 0000000..71fbff0
--- /dev/null
+++ b/include/linux/skbtrace.h
@@ -0,0 +1,478 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ *	API for kernel
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+
+#ifndef _LINUX_SKBTRACE_H
+#define _LINUX_SKBTRACE_H
+
+#include <linux/jump_label.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/skbtrace_api.h>
+#include <asm/atomic.h>
+
+#include <net/sock.h>
+#include <net/inet_timewait_sock.h>
+
+#if defined(CONFIG_SKBTRACE) || defined(CONFIG_SKBTRACE_MODULE)
+#define HAVE_SKBTRACE 1
+#else
+#define HAVE_SKBTRACE 0
+#endif
+
+#if HAVE_SKBTRACE
+
+/* The size parameters of secondary_buffer->slots */
+#define SECONDARY_BUFFER_ORDER	0	/* page order of the slots area */
+#define SECONDARY_BUFFER_SIZE	(PAGE_SIZE<<SECONDARY_BUFFER_ORDER)
+#define SECONDARY_BUFFER_UNIT	(128)	/* bytes per slot */
+#define SECONDARY_BUFFER_COUNTS	(SECONDARY_BUFFER_SIZE/SECONDARY_BUFFER_UNIT)
+
+struct secondary_buffer {	/* wrap-around cache of secondary events */
+	atomic_t refcnt;	/* slots pages are freed when this drops to 0 */
+	struct hlist_node node;	/* chain link in secondary_table->table[] */
+	int action;	/* the action of primary event */
+	spinlock_t lock;	/* taken around offset/count updates */
+	unsigned long session;	/* skbtrace_session value at init time */
+	int offset;	/* next writeable slot, wraps to 0 at the end */
+	int count;	/* count of current cached events in 'slots' */
+	char *slots;	/* the cache of secondary events */
+};
+
+
+#define SECONDARY_TABLE_SHIFT	6
+#define SECONDARY_TABLE_SIZE	(1<<SECONDARY_TABLE_SHIFT)
+#define SECONDARY_TABLE_MASK	(SECONDARY_TABLE_SIZE - 1)
+
+struct secondary_table {
+	spinlock_t lock;
+	struct hlist_head table[SECONDARY_TABLE_SIZE];
+};
+
+struct skbtrace_tracepoint {
+	const char *trace_name;
+	int action;
+	int nr_secondary;
+	size_t block_size;
+	void *probe;
+	int (*setup_options)(struct skbtrace_tracepoint *tp,
+						char *options);
+	void (*enable)(struct skbtrace_tracepoint *tp);
+	void (*disable)(struct skbtrace_tracepoint *tp);
+	char *(*desc)(struct skbtrace_tracepoint *tp);
+	void *private;
+
+	/* Below is for internals, which is not a part of kernel API */
+	unsigned int enabled : 1;
+	struct skbtrace_tracepoint *primary;
+	/* The secondary events of sk_buff based event are */
+	/* cached here. The secondary events of socket based */
+	/* event are cached in hash table skbtrace_context->sec_table */
+	struct secondary_buffer sec_buffer;
+};
+
+extern atomic64_t skbtrace_event_seq;
+extern int sysctl_skbtrace_filter_default;
+
+#define INIT_SKBTRACE_BLOCK(blk, p, act, fl, blk_size) \
+	do {\
+		(blk)->magic = 0xDEADBEEF;\
+		(blk)->len = (blk_size);\
+		(blk)->action = (act);\
+		(blk)->flags = (fl);\
+		(blk)->seq = atomic64_add_return(1, &skbtrace_event_seq);\
+		(blk)->ts = current_kernel_time();\
+		(blk)->ptr = (p);\
+	} while (0)
+
+#define EMPTY_SKBTRACE_TP	{.trace_name = NULL, }
+
+struct inet_timewait_sock;
+struct skbtrace_ops {
+	int (*tw_getname)(struct inet_timewait_sock *tw,
+			struct sockaddr *uaddr, int peer);
+	int (*tw_filter_skb)(struct inet_timewait_sock *tw,
+			struct sk_buff *skb);
+	int (*getname)(struct sock *sk, struct sockaddr *uaddr,
+		 int *uaddr_len, int peer);
+	int (*filter_skb)(struct sock *sk, struct sk_buff *skb);
+};
+
+struct skbtrace_context {
+	unsigned long session;
+	struct skbtrace_ops *ops;
+	unsigned int active_conn_hit : 1;
+	struct secondary_table sec_table;
+};
+
+extern unsigned long skbtrace_session;
+
+extern int skbtrace_register_proto(int af,
+				struct skbtrace_tracepoint *tp_list,
+				struct skbtrace_ops *ops);
+extern void skbtrace_unregister_proto(int af);
+extern struct skbtrace_ops* skbtrace_ops_get(int af);
+
+extern void __skbtrace_probe(struct skbtrace_tracepoint *tp,
+				struct skbtrace_context *ctx,
+				struct skbtrace_block *blk);
+extern int skbtrace_events_common_init(void);
+
+extern struct static_key skbtrace_filters_enabled;
+extern struct sk_filter *skbtrace_skb_filter;
+extern struct sk_filter *skbtrace_sock_filter;
+
+extern struct sk_buff* skbtrace_get_sock_filter_skb(struct sock *sk);
+static inline void skbtrace_put_sock_filter_skb(struct sk_buff *skb)
+{
+	skb->data = skb->head;
+	skb->len = 0;
+	skb_reset_tail_pointer(skb);
+	skb_reset_transport_header(skb);
+	skb_reset_network_header(skb);
+	local_bh_enable();
+}
+extern struct sk_buff* skbtrace_get_twsk_filter_skb(
+					struct inet_timewait_sock *tw);
+#define skbtrace_put_twsk_filter_skb skbtrace_put_sock_filter_skb
+
+static inline void skbtrace_probe(struct skbtrace_tracepoint *t,
+				struct skbtrace_context *ctx,
+				struct skbtrace_block *blk)
+{
+	if (skbtrace_action_invalid == blk->action)
+		return;
+	__skbtrace_probe(t, ctx, blk);
+}
+
+static inline int skbtrace_bypass_skb(struct sk_buff *skb)
+{
+	if (static_key_false(&skbtrace_filters_enabled)) {
+		if (skb->skbtrace_filtered)	/* reuse the cached verdict */
+			return skb->hit_skbtrace;
+		else if (skbtrace_skb_filter) {
+			unsigned int pkt_len;
+
+			pkt_len = SK_RUN_FILTER(skbtrace_skb_filter, skb);
+			skb->hit_skbtrace = !pkt_len;	/* filter returned 0: bypass */
+			skb->skbtrace_filtered = 1;
+			return skb->hit_skbtrace;
+		}
+	}
+	return 0;	/* filters disabled or no skb filter set: never bypass */
+}
+
+static inline void secondary_buffer_get(struct secondary_buffer *buf)
+{
+	atomic_inc(&buf->refcnt);
+}
+
+static inline void secondary_buffer_put(struct secondary_buffer *buf)
+{
+	if (buf && atomic_dec_and_test(&buf->refcnt)) {
+		free_pages((unsigned long)buf->slots, SECONDARY_BUFFER_ORDER);
+		buf->slots = NULL;
+	}
+}
+
+static inline void secondary_buffer_reset(struct secondary_buffer *buf)
+{
+	buf->offset = 0;
+	buf->count = 0;
+}
+
+static inline int secondary_buffer_init(struct secondary_buffer *buf,
+					struct skbtrace_tracepoint *tp)
+{
+	buf->slots = (char *)__get_free_pages(GFP_ATOMIC,
+						SECONDARY_BUFFER_ORDER);
+	if (!buf->slots)
+		return -ENOMEM;
+
+	INIT_HLIST_NODE(&buf->node);
+	spin_lock_init(&buf->lock);
+	buf->action = tp->action;
+	buf->session = skbtrace_session;
+	atomic_set(&buf->refcnt, 0);
+	secondary_buffer_reset(buf);
+	secondary_buffer_get(buf);
+	return 0;
+}
+
+static inline struct secondary_buffer* secondary_buffer_new(
+					struct skbtrace_tracepoint *tp)
+{
+	struct secondary_buffer *buf;
+
+	buf = kmalloc(sizeof(*buf), GFP_ATOMIC);
+	if (buf && secondary_buffer_init(buf, tp)) {
+		kfree(buf);
+		buf = NULL;
+	}
+	return buf;
+}
+
+static inline void secondary_buffer_destroy(struct secondary_buffer *buf)
+{
+	if (buf) {
+		secondary_buffer_put(buf);
+		kfree(buf);
+	}
+}
+
+static inline struct secondary_buffer* secondary_table_lookup(
+				struct secondary_table *table,
+				struct skbtrace_tracepoint *tp)
+{
+	unsigned int key;
+	struct secondary_buffer *buffer;
+	struct hlist_node *pos;
+
+	key = (47 * tp->action) & SECONDARY_TABLE_MASK;
+	spin_lock_bh(&table->lock);
+	hlist_for_each_entry(buffer, pos, &table->table[key], node) {
+		if (buffer->session != skbtrace_session)
+			continue;
+		if (buffer->action == tp->action)
+			goto unlock;
+	}
+	buffer = NULL;
+unlock:
+	spin_unlock_bh(&table->lock);
+
+	return buffer;
+}
+
+static inline struct secondary_buffer* secondary_table_lookup_or_create(
+				struct secondary_table *table,
+				struct skbtrace_tracepoint *tp)
+{
+	unsigned int key;
+	struct secondary_buffer *buffer;
+	struct hlist_node *pos;
+
+	key = (47 * tp->action) & SECONDARY_TABLE_MASK;
+	spin_lock_bh(&table->lock);
+	hlist_for_each_entry(buffer, pos, &table->table[key], node) {
+		if (buffer->session != skbtrace_session)
+			continue;
+		if (buffer->action == tp->action)
+			goto unlock;
+	}
+	buffer = secondary_buffer_new(tp);
+	if (buffer)
+		hlist_add_head(&buffer->node, &table->table[key]);
+unlock:
+	spin_unlock_bh(&table->lock);
+
+	return buffer;
+}
+
+static inline void secondary_table_clean(struct secondary_table *table)
+{
+	unsigned int key;
+
+	spin_lock_bh(&table->lock);
+	for (key = 0; key < SECONDARY_TABLE_SIZE; key++) {
+		while (!hlist_empty(&table->table[key])) {
+			struct secondary_buffer *buffer;
+
+			buffer = container_of(table->table[key].first,
+						struct secondary_buffer, node);
+			hlist_del(table->table[key].first);
+			secondary_buffer_destroy(buffer);
+		}
+	}
+	spin_unlock_bh(&table->lock);
+}
+
+static inline void secondary_table_init(struct secondary_table *table)
+{
+	unsigned int key;
+
+	spin_lock_init(&table->lock);
+	for (key = 0; key < SECONDARY_TABLE_SIZE; key++)
+		INIT_HLIST_HEAD(&table->table[key]);
+}
+
+extern struct skbtrace_context *skbtrace_context_get(struct sock *sk);
+extern void skbtrace_context_setup(struct skbtrace_context *ctx,
+					struct skbtrace_ops *ops);
+
+static inline void skbtrace_context_destroy(struct skbtrace_context **ctx)
+{
+	if (!*ctx)
+		return;
+	secondary_table_clean(&(*ctx)->sec_table);
+	kfree(*ctx);
+	*ctx = NULL;
+}
+
+static inline void sock_skbtrace_reset(struct sock *sk)
+{
+	sk->sk_skbtrace = NULL;
+}
+
+static inline void* secondary_buffer_get_block(struct secondary_buffer *buf,
+					struct skbtrace_tracepoint *primary)
+{
+	void *ret;	/* address of the slot handed to the caller */
+
+	if (!buf->slots && secondary_buffer_init(buf, primary))	/* lazy setup */
+		return NULL;
+
+	spin_lock_bh(&buf->lock);
+	ret = &buf->slots[buf->offset * SECONDARY_BUFFER_UNIT];
+	if (buf->count < SECONDARY_BUFFER_COUNTS)	/* saturates when full */
+		buf->count++;
+	if (++buf->offset >= SECONDARY_BUFFER_COUNTS)	/* wrap to first slot */
+		buf->offset = 0;
+	spin_unlock_bh(&buf->lock);
+	return ret;
+}
+
+static inline void* skbtrace_block_get(struct skbtrace_tracepoint *tp,
+					struct skbtrace_context *ctx,
+					void *fast)
+{
+	struct skbtrace_tracepoint *pri;
+
+	if (!tp || !tp->primary)
+		return fast;
+
+	pri = tp->primary;
+	if (ctx) {
+		struct secondary_buffer *buf;
+		struct secondary_table *table;
+
+		table = &ctx->sec_table;
+		buf = secondary_table_lookup_or_create(table, pri);
+		if (!buf)
+			return fast;
+		return secondary_buffer_get_block(buf, pri) ? : fast;
+	}
+	return secondary_buffer_get_block(&pri->sec_buffer, pri) ? : fast;
+}
+
+static inline void* skbtrace_block_sk_get(struct skbtrace_tracepoint *tp,
+					struct sock *sk,
+					void *fast)
+{
+	return skbtrace_block_get(tp, skbtrace_context_get(sk), fast);
+}
+
+#define SKBTRACE_SKB_EVENT_BEGIN \
+{\
+	if (skbtrace_bypass_skb(skb)) {\
+		return;	\
+	} else {
+
+#define SKBTRACE_SKB_EVENT_END \
+	} \
+}
+
+extern u32 skbtrace_sock_filter_id;
+static inline int skbtrace_bypass_sock(struct sock *sk)
+{
+	if (static_key_false(&skbtrace_filters_enabled)) {
+		if (likely(sk->sk_skbtrace_filtered &&
+				(skbtrace_sock_filter_id == sk->sk_skbtrace_fid))) {
+			return sk->sk_hit_skbtrace;
+		}
+		if (skbtrace_sock_filter) {
+			unsigned int pkt_len;
+			struct sk_buff *skb;
+
+			skb = skbtrace_get_sock_filter_skb(sk);
+			if (skb) {
+				pkt_len = SK_RUN_FILTER(skbtrace_sock_filter, skb);
+				sk->sk_hit_skbtrace = !pkt_len;
+				sk->sk_skbtrace_filtered = 1;
+				skbtrace_put_sock_filter_skb(skb);
+				sk->sk_skbtrace_fid = skbtrace_sock_filter_id;
+				return sk->sk_hit_skbtrace;
+			}
+			return sysctl_skbtrace_filter_default;
+		}
+	}
+	return 0;
+}
+
+static inline int skbtrace_bypass_twsk(struct inet_timewait_sock *tw)
+{
+	if (static_key_false(&skbtrace_filters_enabled)) {
+		if (likely(tw->tw_skbtrace_filtered &&
+				(skbtrace_sock_filter_id == tw->tw_skbtrace_fid))) {
+			return tw->tw_hit_skbtrace;
+		}
+		if (skbtrace_sock_filter) {
+			unsigned int pkt_len;
+			struct sk_buff *skb;
+
+			skb = skbtrace_get_twsk_filter_skb(tw);
+			if (skb) {
+				pkt_len = SK_RUN_FILTER(skbtrace_sock_filter, skb);
+				tw->tw_hit_skbtrace = !pkt_len;
+				tw->tw_skbtrace_filtered = 1;
+				skbtrace_put_twsk_filter_skb(skb);
+				tw->tw_skbtrace_fid = skbtrace_sock_filter_id;
+				return tw->tw_hit_skbtrace;
+			}
+			return sysctl_skbtrace_filter_default;
+		}
+	}
+	return 0;
+}
+
+#define SKBTRACE_SOCK_EVENT_BEGIN \
+{\
+	if (skbtrace_bypass_sock(sk)) {\
+		return;	\
+	} else {
+
+#define SKBTRACE_SOCK_EVENT_END \
+	} \
+}
+
+extern int inet_filter_skb(struct sock *sk, struct sk_buff *skb);
+extern int inet_tw_getname(struct inet_timewait_sock *tw,
+				struct sockaddr *uaddr, int peer);
+extern int inet_tw_filter_skb(struct inet_timewait_sock *tw,
+				struct sk_buff *skb);
+extern int tcp_tw_filter_skb(struct inet_timewait_sock *tw,
+				struct sk_buff *skb);
+extern int tcp_filter_skb(struct sock *sk, struct sk_buff *skb);
+
+#else /* HAVE_SKBTRACE */
+
+static inline void sock_skbtrace_reset(struct sock *sk)
+{
+}
+
+static inline void skbtrace_context_destroy(struct skbtrace_context **ctx)
+{
+}
+
+#endif /* HAVE_SKBTRACE */
+
+#endif /* _LINUX_SKBTRACE_H */
diff --git a/include/linux/skbtrace_api.h b/include/linux/skbtrace_api.h
new file mode 100644
index 0000000..2d14ff6
--- /dev/null
+++ b/include/linux/skbtrace_api.h
@@ -0,0 +1,73 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ *	User/Kernel Interface
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+#ifndef _LINUX_SKBTRACE_API_H
+#define _LINUX_SKBTRACE_API_H
+
+#include <linux/types.h>
+
+#ifdef __KERNEL__
+#include <linux/time.h>
+#else
+#include <time.h>
+#define __packed	__attribute__ ((__packed__))
+#endif
+
+#define TRACE_SPEC_MAX_LEN	256
+
+#define SKBTRACE_DEF_SUBBUF_SIZE	(1<<12)
+#define SKBTRACE_DEF_SUBBUF_NR		(1<<11)
+
+#define SKBTRACE_MIN_SUBBUF_SIZE	SKBTRACE_DEF_SUBBUF_SIZE
+#define SKBTRACE_MIN_SUBBUF_NR		SKBTRACE_DEF_SUBBUF_NR
+
+#define SKBTRACE_MAX_SUBBUF_SIZE	(1<<16)
+#define SKBTRACE_MAX_SUBBUF_NR		(1<<20)
+
+#define SC	0	/* for tracepoints in process context */
+#define SI	1	/* for tracepoints in softirq context */
+#define HW	2	/* for tracepoints in hardirq context */
+#define NR_CHANNELS	3
+
+/* struct skbtrace_block - be used in kernel/user interaction	*/
+/* @magic:	set to 0xDEADBEEF by INIT_SKBTRACE_BLOCK()	*/
+/* @len:	whole data structure size in bytes		*/
+/* @action:	action of this skbtrace_block			*/
+/* @flags:	the flags depend on above action field		*/
+/* @ts:		the timestamp of this event.			*/
+/* @seq:	global sequence number of this event		*/
+/* @ptr:	source kernel object, in general a sk_buff/sock	*/
+/* PLEASE: Keep 64 bits alignment				*/
+struct skbtrace_block {
+	__u64 magic;
+	__u16 len;
+	__u16 action;
+	__u32 flags;
+	struct timespec ts;
+	__u64 seq;
+	void *ptr;
+} __packed;
+
+#include <net/skbtrace_api_common.h>
+#include <net/skbtrace_api_ipv4.h>
+
+#endif
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7632c87..27a0fe0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -351,6 +351,8 @@ typedef unsigned char *sk_buff_data_t;
  *	@peeked: this packet has been seen already, so stats have been
  *		done for it, don't do them again
  *	@nf_trace: netfilter packet trace flag
+ *	@hit_skbtrace: should this skb be skipped by the skbtrace filter?
+ *	@skbtrace_filtered: was this skb already run through the skbtrace filter?
  *	@protocol: Packet protocol from driver
  *	@destructor: Destruct function
  *	@nfct: Associated connection, if any
@@ -469,7 +471,10 @@ struct sk_buff {
 	__u8			wifi_acked:1;
 	__u8			no_fcs:1;
 	__u8			head_frag:1;
-	/* 8/10 bit hole (depending on ndisc_nodetype presence) */
+#if defined(CONFIG_SKBTRACE) || defined(CONFIG_SKBTRACE_MODULE)
+	__u8			hit_skbtrace:1;
+	__u8			skbtrace_filtered:1;
+#endif
 	kmemcheck_bitfield_end(flags2);

 #ifdef CONFIG_NET_DMA
diff --git a/include/net/skbtrace_api_common.h b/include/net/skbtrace_api_common.h
new file mode 100644
index 0000000..87892d6
--- /dev/null
+++ b/include/net/skbtrace_api_common.h
@@ -0,0 +1,84 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ *	User/Kernel Interface
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+#ifndef _NET_SKBTRACE_API_COMMON_H
+#define _NET_SKBTRACE_API_COMMON_H
+
+#include <linux/types.h>
+
+/********************* Common section *********************/
+
+/* skbtrace_block->action */
+enum {
+	skbtrace_action_invalid		= 0,
+	skbtrace_action_common_min	= 1,
+	skbtrace_action_skb_rps_info	= 1,
+	skbtrace_action_sk_timer	= 2,
+	skbtrace_action_common_max	= 99,
+};
+
+/* common skbtrace_block->flags */
+/* miss_secondary - no secondary events, or not enough memory to cache them */
+enum {
+	skbtrace_flags_reserved_min = 28,
+	skbtrace_flags_miss_secondary = 28,
+	skbtrace_flags_reserved_max = 31,
+};
+
+/* it is copied from <net/flow_keys.h>, except pad fields and packed */
+struct skbtrace_flow_keys {
+	__u32 src;
+	__u32 dst;
+	union {
+		__u32 ports;
+		__u16 port16[2];
+	};
+	__u32 ip_proto;
+} __packed;
+
+struct skbtrace_skb_rps_info_blk {
+	struct skbtrace_block blk;
+	__u16 rx_queue;
+	__u16 pad;
+	__u32 rx_hash;
+	__u32 cpu;
+	__u32 ifindex;
+	struct skbtrace_flow_keys keys;
+} __packed;
+
+
+/* socket timers */
+/* flags */
+enum {
+	skbtrace_sk_timer_setup	= 0,
+	skbtrace_sk_timer_reset	= 1,
+	skbtrace_sk_timer_stop	= 2,
+	skbtrace_sk_timer_last	= 3,
+};
+
+struct skbtrace_sk_timer_blk {
+	struct skbtrace_block blk;
+	__s32	proto;
+	__s32	timeout;
+} __packed;
+
+#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index adb7da2..7a1d861 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -190,6 +190,8 @@ struct sock_common {
 };

 struct cg_proto;
+struct skbtrace_context;
+
 /**
   *	struct sock - network layer representation of sockets
   *	@__sk_common: shared layout with inet_timewait_sock
@@ -332,7 +334,11 @@ struct sock {
 				sk_userlocks : 4,
 				sk_protocol  : 8,
 				sk_type      : 16;
+#if defined(CONFIG_SKBTRACE) || defined(CONFIG_SKBTRACE_MODULE)
+	unsigned int		sk_hit_skbtrace : 1,
+				sk_skbtrace_filtered : 1;
+#endif
 	kmemcheck_bitfield_end(flags);
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
 	netdev_features_t	sk_route_caps;
@@ -373,6 +379,10 @@ struct sock {
 	__u32			sk_mark;
 	u32			sk_classid;
 	struct cg_proto		*sk_cgrp;
+#if defined(CONFIG_SKBTRACE) || defined(CONFIG_SKBTRACE_MODULE)
+	struct skbtrace_context *sk_skbtrace;
+	unsigned int		sk_skbtrace_fid; /* filter id of cached verdict */
+#endif
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
 	void			(*sk_write_space)(struct sock *sk);
@@ -842,6 +852,10 @@ struct module;
  * transport -> network interface is defined by struct inet_proto
  */
 struct proto {
+#if defined(CONFIG_SKBTRACE) || defined(CONFIG_SKBTRACE_MODULE)
+	int			(*filter_skb)(struct sock *sk,
+					struct sk_buff *skb);
+#endif
 	void			(*close)(struct sock *sk,
 					long timeout);
 	int			(*connect)(struct sock *sk,
diff --git a/include/trace/events/skbtrace.h b/include/trace/events/skbtrace.h
new file mode 100644
index 0000000..91567bf
--- /dev/null
+++ b/include/trace/events/skbtrace.h
@@ -0,0 +1,32 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ *	Events
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+
+#if !defined(_TRACE_EVENTS_SKBTRACE_H)
+#define _TRACE_EVENTS_SKBTRACE_H
+
+#include <linux/tracepoint.h>
+
+#include <trace/events/skbtrace_common.h>
+#include <trace/events/skbtrace_ipv4.h>
+
+#endif
diff --git a/include/trace/events/skbtrace_common.h b/include/trace/events/skbtrace_common.h
new file mode 100644
index 0000000..4352564
--- /dev/null
+++ b/include/trace/events/skbtrace_common.h
@@ -0,0 +1,41 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ *	Common events
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+
+#if !defined(_TRACE_EVENTS_SKBTRACE_COMMON_H)
+#define _TRACE_EVENTS_SKBTRACE_COMMON_H
+
+#include <linux/tracepoint.h>
+
+struct sk_buff;
+struct net_device;
+struct timer_list;
+
+DECLARE_TRACE(skb_rps_info,
+	TP_PROTO(struct sk_buff *skb, struct net_device *dev, int cpu),
+	TP_ARGS(skb, dev, cpu));
+
+DECLARE_TRACE(sk_timer,
+	TP_PROTO(void *sk, struct timer_list *timer, int action),
+	TP_ARGS(sk, timer, action));
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8c4c070..cc49b26 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -367,6 +367,14 @@ config BLK_DEV_IO_TRACE

 	  If unsure, say N.

+config SKBTRACE
+	tristate "skbtrace : flexible networking tracing"
+	help
+	  A blktrace-like utility for the networking subsystem; this feature
+	  can also be built as a kernel module.
+
+	  If unsure, say N.
+
 config KPROBE_EVENT
 	depends on KPROBES
 	depends on HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/net/core/Makefile b/net/core/Makefile
index 674641b..6a80a85 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -18,6 +18,8 @@ obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_NET_DMA) += user_dma.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
 obj-$(CONFIG_TRACEPOINTS) += net-traces.o
+obj-$(CONFIG_SKBTRACE) += skbtrace.o
+skbtrace-objs := skbtrace-core.o skbtrace-events-common.o
 obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
 obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
diff --git a/net/core/dev.c b/net/core/dev.c
index 89e33a5..b363716 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -129,6 +129,8 @@
 #include <trace/events/napi.h>
 #include <trace/events/net.h>
 #include <trace/events/skb.h>
+#include <trace/events/skbtrace_common.h>
+#include <linux/skbtrace.h>
 #include <linux/pci.h>
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
@@ -2813,6 +2815,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	}

 done:
+	trace_skb_rps_info(skb, dev, cpu);
 	return cpu;
 }

diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index ba3c012..41e1766 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -21,6 +21,7 @@
 #include <linux/netlink.h>
 #include <linux/net_dropmon.h>
 #include <linux/slab.h>
+#include <linux/skbtrace.h>

 #include <asm/unaligned.h>
 #include <asm/bitops.h>
@@ -31,7 +32,30 @@
 #include <trace/events/napi.h>
 #include <trace/events/sock.h>
 #include <trace/events/udp.h>
+#include <trace/events/skbtrace.h>

 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);

 EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
+
+#if HAVE_SKBTRACE
+
+#define NEW_SKBTRACE_TP(name) \
+	DEFINE_TRACE(name); \
+	EXPORT_TRACEPOINT_SYMBOL_GPL(name);
+
+NEW_SKBTRACE_TP(skb_rps_info);
+NEW_SKBTRACE_TP(sk_timer);
+
+NEW_SKBTRACE_TP(tcp_congestion);
+NEW_SKBTRACE_TP(tcp_connection);
+NEW_SKBTRACE_TP(icsk_connection);
+NEW_SKBTRACE_TP(tcp_sendlimit);
+NEW_SKBTRACE_TP(tcp_active_conn);
+NEW_SKBTRACE_TP(tcp_rttm);
+NEW_SKBTRACE_TP(tcp_ca_state);
+
+unsigned long skbtrace_session;
+EXPORT_SYMBOL(skbtrace_session);
+
+#endif
diff --git a/net/core/skbtrace-core.c b/net/core/skbtrace-core.c
new file mode 100644
index 0000000..2c2ac3e
--- /dev/null
+++ b/net/core/skbtrace-core.c
@@ -0,0 +1,1226 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/relay.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/jhash.h>
+
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/skbtrace.h>
+#include <net/sock.h>
+
+#define SKBTRACE_VERSION	"1"
+#define SKBTRACE_DIR		"skbtrace"
+
+static unsigned long skbtrace_dropped[NR_CHANNELS][NR_CPUS];
+/* +1 for quick indexing trick in __skbtrace_probe() */
+static struct rchan *skbtrace_channels[NR_CHANNELS + 1];
+
+int sysctl_skbtrace_filter_default = 0;
+EXPORT_SYMBOL_GPL(sysctl_skbtrace_filter_default);
+static struct sk_buff **sock_filter_skb;
+static struct sock_fprog skb_filter_fprog;
+static struct sock_fprog sock_filter_fprog;
+struct sk_filter *skbtrace_skb_filter;
+EXPORT_SYMBOL_GPL(skbtrace_skb_filter);
+
+u32 skbtrace_sock_filter_id;
+EXPORT_SYMBOL_GPL(skbtrace_sock_filter_id);
+struct sk_filter *skbtrace_sock_filter;
+EXPORT_SYMBOL_GPL(skbtrace_sock_filter);
+
+static struct dentry	*skbtrace_dentry;
+static struct dentry	*enabled_control;
+static struct dentry	*dropped_control;
+static struct dentry	*version_control;
+static struct dentry	*subbuf_nr_control;
+static struct dentry	*subbuf_size_control;
+static struct dentry	*filters_control;
+static struct dentry	*sock_filters_control;
+
+static const struct file_operations	enabled_fops;
+static const struct file_operations	dropped_fops;
+static const struct file_operations	version_fops;
+static const struct file_operations	subbuf_nr_fops;
+static const struct file_operations	subbuf_size_fops;
+static const struct file_operations	filters_fops;
+static const struct file_operations	sock_filters_fops;
+
+static int nr_skbtrace_enabled_tp;
+static int subbuf_nr = SKBTRACE_DEF_SUBBUF_NR;
+static int subbuf_size = SKBTRACE_DEF_SUBBUF_SIZE;
+
+static bool should_load_proto;
+
+struct static_key skbtrace_filters_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(skbtrace_filters_enabled);
+
+atomic64_t skbtrace_event_seq = ATOMIC64_INIT(0);
+EXPORT_SYMBOL_GPL(skbtrace_event_seq);
+
+/* protects af_tp_list and skbtrace_channels */
+static struct mutex skbtrace_lock;
+static struct skbtrace_tracepoint *af_tp_list[AF_MAX];
+struct skbtrace_ops* skbtrace_ops[AF_MAX];
+
+static int create_controls(void);
+static void remove_controls(void);
+static int  create_channels(void);
+static void flush_channels(void);
+static void destroy_channels(void);
+static ssize_t sk_filter_read(struct sock_fprog *fprog, char __user
*buffer,
+							    size_t count);
+static ssize_t sk_filter_write(struct sock_fprog *sk_fprog,
+				struct sk_filter **sk_filter,
+				const char __user *buffer, size_t count);
+static void reset_filter(struct sock_fprog *fprog, struct sk_filter
**filter);
+static void skbtrace_filters_clean(void);
+
+/*
+ * Return the skbtrace_ops registered for address family @af, or NULL when
+ * nothing is registered for it.  @af indexes skbtrace_ops[] directly with
+ * no range check, so callers must pass a value in [0, AF_MAX).
+ */
+struct skbtrace_ops* skbtrace_ops_get(int af)
+{
+	return skbtrace_ops[af];
+}
+EXPORT_SYMBOL_GPL(skbtrace_ops_get);
+
+/*
+ * Request the "skbtrace-af-<N>" module for every address family that has
+ * no tracepoint table yet.  Does nothing unless should_load_proto is set;
+ * the flag is cleared before the first module is requested.
+ */
+static void skbtrace_proto_load(void)
+{
+	int family = AF_UNSPEC;
+
+	if (!should_load_proto)
+		return;
+	should_load_proto = false;
+
+	while (family < AF_MAX) {
+		/* only families without registered events need a module */
+		if (af_tp_list[family] == NULL)
+			request_module("skbtrace-af-%d", family);
+		family++;
+	}
+}
+
+/*
+ * Write one trace block into the relay channel selected by the current
+ * execution context, then mark the block as invalid.
+ *
+ * NOTE(review): rchan is used without a NULL check - confirm that the
+ * channels always exist while a probe can still fire.
+ */
+void __skbtrace_block_probe(struct skbtrace_block *blk)
+{
+	unsigned int chan_id;
+	struct rchan *rchan;
+
+	/* bit 1 = in_irq(), bit 0 = in_softirq(): index into skbtrace_channels[] */
+	chan_id = (!!in_irq()) << 1;
+	chan_id |= !!in_softirq();	/* make sparse happy */
+	rchan = skbtrace_channels[chan_id];
+
+	if (unlikely(chan_id >= HW))
+		relay_write(rchan, blk, blk->len);
+	else {
+		/* bottom halves are kept off around the __relay_write() variant */
+		local_bh_disable();
+		__relay_write(rchan, blk, blk->len);
+		local_bh_enable();
+	}
+	/* presumably prevents the same block from being emitted again */
+	blk->action = skbtrace_action_invalid;
+}
+
+/*
+ * Emit the secondary blocks buffered for @tp, followed by the primary
+ * block @blk.
+ *
+ * With a context, the secondary buffer is looked up in the context's
+ * secondary table; without one, the tracepoint's own buffer is used.
+ */
+void __skbtrace_do_probe(struct skbtrace_tracepoint *tp,
+				struct skbtrace_context *ctx,
+				struct skbtrace_block *blk)
+{
+	int i;
+	char *sec_blk;
+	struct secondary_buffer *buf;
+
+	if (ctx)
+		buf = secondary_table_lookup(&ctx->sec_table, tp);
+	else
+		buf = &tp->sec_buffer;
+
+	if (!buf) {
+		/* secondaries are attached but no buffer was found: flag it */
+		if (tp->nr_secondary)
+			blk->flags |= 1<<skbtrace_flags_miss_secondary;
+		goto quit;
+	}
+
+	spin_lock_bh(&buf->lock);
+	/* walk buf->count slots backwards from offset, wrapping around */
+	for (i = 0; i < buf->count; i++) {
+		if (--buf->offset < 0)
+			buf->offset = SECONDARY_BUFFER_COUNTS - 1;
+		sec_blk = &buf->slots[buf->offset * SECONDARY_BUFFER_UNIT];
+		__skbtrace_block_probe((struct skbtrace_block*)sec_blk);
+	}
+	secondary_buffer_reset(buf);
+	spin_unlock_bh(&buf->lock);
+
+quit:
+	/* the primary block is always written last */
+	__skbtrace_block_probe(blk);
+}
+
+/*
+ * Entry point used by event probes: emit @blk (and any buffered secondary
+ * blocks) for @tp.  Nothing is written when @tp is NULL or when @tp has a
+ * primary tracepoint set.
+ */
+void __skbtrace_probe(struct skbtrace_tracepoint *tp,
+				struct skbtrace_context *ctx,
+				struct skbtrace_block *blk)
+{
+	if (tp && !tp->primary)
+		__skbtrace_do_probe(tp, ctx, blk);
+}
+EXPORT_SYMBOL_GPL(__skbtrace_probe);
+
+/*
+ * Put every entry of @tp_list (terminated by an entry whose trace_name is
+ * NULL) into its initial state: fresh secondary buffer, no primary,
+ * disabled.
+ */
+static void __skbtrace_setup_tracepoints(struct skbtrace_tracepoint *tp_list)
+{
+	struct skbtrace_tracepoint *cur;
+
+	if (!tp_list)
+		return;
+
+	for (cur = tp_list; cur->trace_name; cur++) {
+		secondary_buffer_init(&cur->sec_buffer, cur);
+		cur->primary = NULL;
+		cur->enabled = 0;
+	}
+}
+
+/*
+ * Install @tp_list as the tracepoint table of address family @af.
+ *
+ * Returns 0 on success (a NULL @tp_list is accepted and installs nothing),
+ * -EEXIST if a table is already registered for @af, or -EINVAL if
+ * @tp_list contains no named tracepoint.
+ *
+ * An existing table is never touched: replacing it behind the back of its
+ * owner would leave that owner's enabled probes unreachable.
+ */
+static int __skbtrace_register_tracepoints(int af,
+				struct skbtrace_tracepoint *tp_list)
+{
+	if (af_tp_list[af])
+		return -EEXIST;
+
+	/* nothing to install; the slot is already NULL */
+	if (!tp_list)
+		return 0;
+
+	/* a table must hold at least one named tracepoint */
+	if (!tp_list[0].trace_name)
+		return -EINVAL;
+
+	__skbtrace_setup_tracepoints(tp_list);
+	af_tp_list[af] = tp_list;
+	return 0;
+}
+
+/*
+ * Drop the tracepoint table of address family @af.  Every entry that is
+ * still enabled is disabled, its probe unregistered and its secondary
+ * buffer released before the table pointer is cleared.
+ */
+static void __skbtrace_unregister_tracepoints(int af)
+{
+	struct skbtrace_tracepoint *cur;
+
+	for (cur = af_tp_list[af]; cur && cur->trace_name; cur++) {
+		if (!cur->enabled)
+			continue;
+
+		cur->enabled = 0;
+		--nr_skbtrace_enabled_tp;
+		tracepoint_probe_unregister(cur->trace_name, cur->probe, cur);
+		secondary_buffer_put(&cur->sec_buffer);
+	}
+	af_tp_list[af] = NULL;
+}
+
+/*
+ * Record @ops as the operations of address family @af.
+ * Returns -EEXIST when the slot is already occupied, 0 otherwise.
+ */
+static inline int __skbtrace_register_ops(int af, struct skbtrace_ops *ops)
+{
+	if (!skbtrace_ops[af]) {
+		skbtrace_ops[af] = ops;
+		return 0;
+	}
+	return -EEXIST;
+}
+
+static inline void __skbtrace_unregister_ops(int af)
+{
+	skbtrace_ops[af] = NULL;
+}
+
+/*
+ * Register the tracepoint table @tp_list and the operations @ops for
+ * address family @af under skbtrace_lock.
+ *
+ * Returns -EINVAL for an out-of-range @af, otherwise the result of the
+ * two registration steps.  If the ops cannot be registered, the tracepoint
+ * table registered a moment earlier is removed again.
+ */
+int skbtrace_register_proto(int af,
+			struct skbtrace_tracepoint *tp_list,
+			struct skbtrace_ops *ops)
+{
+	int err;
+
+	if (af < 0 || af >= AF_MAX)
+		return -EINVAL;
+
+	mutex_lock(&skbtrace_lock);
+
+	err = __skbtrace_register_tracepoints(af, tp_list);
+	if (err)
+		goto unlock;
+
+	err = __skbtrace_register_ops(af, ops);
+	if (err)
+		__skbtrace_unregister_tracepoints(af);	/* roll back */
+
+unlock:
+	mutex_unlock(&skbtrace_lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(skbtrace_register_proto);
+
+/*
+ * Remove the tracepoint table and the operations of address family @af.
+ * An out-of-range @af is silently ignored.
+ */
+void skbtrace_unregister_proto(int af)
+{
+	if (af < 0 || af >= AF_MAX)
+		return;
+
+	mutex_lock(&skbtrace_lock);
+	__skbtrace_unregister_tracepoints(af);
+	__skbtrace_unregister_ops(af);
+	mutex_unlock(&skbtrace_lock);
+
+	flush_channels();
+	/* re-arm skbtrace_proto_load() so it requests modules again */
+	should_load_proto = true;
+}
+EXPORT_SYMBOL_GPL(skbtrace_unregister_proto);
+
+void skbtrace_context_setup(struct skbtrace_context *ctx,
+				struct skbtrace_ops *ops)
+{
+	ctx->ops = ops;
+	ctx->session = skbtrace_session;
+	secondary_table_init(&ctx->sec_table);
+}
+EXPORT_SYMBOL(skbtrace_context_setup);
+
+/*
+ * Return the skbtrace context attached to @sk, creating it on first use.
+ *
+ * Returns NULL when no ops are registered for the socket's family, or
+ * when a context had to be allocated and the allocation failed.
+ */
+struct skbtrace_context *skbtrace_context_get(struct sock *sk)
+{
+	struct skbtrace_ops *ops;
+	struct skbtrace_context *ctx;
+
+	ops = skbtrace_ops_get(sk->sk_family);
+	if (!ops)
+		return NULL;
+	local_bh_disable();
+
+	/* a context created under an older session is discarded */
+	if (sk->sk_skbtrace &&
+			(skbtrace_session != sk->sk_skbtrace->session))
+		skbtrace_context_destroy(&sk->sk_skbtrace);
+
+	if (!sk->sk_skbtrace) {
+		/* bottom halves are disabled here, hence GFP_ATOMIC */
+		ctx = kzalloc(sizeof(struct skbtrace_context), GFP_ATOMIC);
+		if (likely(ctx)) {
+			skbtrace_context_setup(ctx, ops);
+			sk->sk_skbtrace = ctx;
+		}
+	}
+
+	local_bh_enable();
+	return sk->sk_skbtrace;
+}
+EXPORT_SYMBOL(skbtrace_context_get);
+
+/*
+ * relay subbuf_start callback.  Returns 1 while the buffer still has
+ * room; when it is full, counts a drop for this channel and CPU and
+ * returns 0.
+ */
+static int subbuf_start_handler(struct rchan_buf *buf,
+				void *subbuf,
+				void *prev_subbuf,
+				size_t prev_padding)
+{
+	long chan, cpu;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	/* private_data carries the channel index given to relay_open() */
+	chan = (long)buf->chan->private_data;
+	cpu = buf->cpu;
+	skbtrace_dropped[chan][cpu]++;
+	return 0;
+}
+
+static struct dentry *create_buf_file_handler(const char *filename,
+					      struct dentry *parent,
+					      umode_t mode,
+					      struct rchan_buf *buf,
+					      int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+				       &relay_file_operations);
+}
+
+static int remove_buf_file_handler(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct rchan_callbacks relayfs_callbacks = {
+	.subbuf_start = subbuf_start_handler,
+	.create_buf_file = create_buf_file_handler,
+	.remove_buf_file = remove_buf_file_handler,
+};
+
+/*
+ * Open every relay channel that does not exist yet and take one module
+ * reference if at least one channel was opened.
+ *
+ * On failure only the channels opened by this call are closed again.  No
+ * module reference has been taken for them yet, so none is dropped, and
+ * channels that existed before the call (which own a reference of their
+ * own) are left untouched.
+ *
+ * Returns 0 on success or -ENOMEM if a channel could not be opened.
+ *
+ * caller must hold skbtrace_lock
+ */
+static int create_channels(void)
+{
+	unsigned long i;
+	bool opened[NR_CHANNELS] = { false };
+	bool created = false;
+	const char *skbtrace_names[NR_CHANNELS] = {    "trace.syscall.cpu",
+							"trace.softirq.cpu",
+							"trace.hardirq.cpu" };
+
+	for (i = 0; i < NR_CHANNELS; i++) {
+		if (skbtrace_channels[i])
+			continue;
+		skbtrace_channels[i] = relay_open(skbtrace_names[i],
+			skbtrace_dentry, subbuf_size, subbuf_nr,
+				&relayfs_callbacks, (void *)i);
+		if (!skbtrace_channels[i])
+			goto rollback;
+		opened[i] = true;
+		created = true;
+	}
+	/* extra slot aliases the hardirq channel, see __skbtrace_block_probe() */
+	skbtrace_channels[HW + 1] = skbtrace_channels[HW];
+
+	if (created)
+		__module_get(THIS_MODULE);
+	return 0;
+
+rollback:
+	while (i--) {
+		if (!opened[i])
+			continue;
+		relay_close(skbtrace_channels[i]);
+		skbtrace_channels[i] = NULL;
+	}
+	skbtrace_channels[HW + 1] = skbtrace_channels[HW];
+	return -ENOMEM;
+}
+
+/* Flush every relay channel that is currently open. */
+static void flush_channels(void)
+{
+	struct rchan **chan;
+
+	for (chan = skbtrace_channels;
+	     chan < skbtrace_channels + NR_CHANNELS; chan++) {
+		if (*chan)
+			relay_flush(*chan);
+	}
+}
+
+/*
+ * Flush and close every open relay channel, clear the alias slot and, if
+ * anything was closed, drop the module reference.
+ *
+ * caller must hold skbtrace_lock
+ */
+static void destroy_channels(void)
+{
+	struct rchan **chan;
+	bool closed_any = false;
+
+	for (chan = skbtrace_channels;
+	     chan < skbtrace_channels + NR_CHANNELS; chan++) {
+		if (!*chan)
+			continue;
+		relay_flush(*chan);
+		relay_close(*chan);
+		*chan = NULL;
+		closed_any = true;
+	}
+	skbtrace_channels[HW + 1] = NULL;
+
+	if (closed_any)
+		module_put(THIS_MODULE);
+}
+
+/*
+ * Remove all control files from debugfs.  Safe to call with some or all
+ * of them missing: debugfs_remove() ignores a NULL dentry, and every
+ * pointer is cleared afterwards so a repeated call cannot remove a stale
+ * dentry.
+ */
+static void remove_controls(void)
+{
+#define REMOVE_DEBUGFS_FILE(name) \
+	do {\
+		debugfs_remove(name##_control); \
+		name##_control = NULL; \
+	} while (0)
+
+	REMOVE_DEBUGFS_FILE(enabled);
+	REMOVE_DEBUGFS_FILE(dropped);
+	REMOVE_DEBUGFS_FILE(version);
+	REMOVE_DEBUGFS_FILE(subbuf_nr);
+	REMOVE_DEBUGFS_FILE(subbuf_size);
+	REMOVE_DEBUGFS_FILE(filters);
+	REMOVE_DEBUGFS_FILE(sock_filters);
+
+#undef REMOVE_DEBUGFS_FILE
+}
+
+/*
+ * Create all control files below skbtrace_dentry.
+ *
+ * Returns 0 on success.  If any file cannot be created, an error is
+ * logged, every control file is removed again and -1 is returned.
+ */
+static int create_controls(void)
+{
+/* creates <name> with mode 0 and <name>_fops; jumps to fail on error */
+#define CREATE_DEBUGFS_FILE(name)\
+	do {\
+		name##_control = debugfs_create_file(#name, 0,\
+				skbtrace_dentry, NULL, &name##_fops);\
+		if (name##_control)\
+			break;\
+		pr_err("skbtrace: couldn't create relayfs file '" #name "'\n");\
+		goto fail;\
+	} while (0);
+
+	CREATE_DEBUGFS_FILE(enabled)
+	CREATE_DEBUGFS_FILE(dropped)
+	CREATE_DEBUGFS_FILE(version)
+	CREATE_DEBUGFS_FILE(subbuf_nr)
+	CREATE_DEBUGFS_FILE(subbuf_size)
+	CREATE_DEBUGFS_FILE(filters)
+	CREATE_DEBUGFS_FILE(sock_filters)
+
+#undef CREATE_DEBUGFS_FILE
+	return 0;
+fail:
+	/* also covers the files that were never created (NULL pointers) */
+	remove_controls();
+	return -1;
+}
+
+/*
+ * Build the default one-line description "<name> enabled:<0|1>\n" of @t.
+ * Returns a kmalloc()ed string the caller must kfree(), or NULL if the
+ * allocation failed.
+ */
+static char *skbtrace_tracepoint_default_desc(struct skbtrace_tracepoint *t)
+{
+	const int room = strlen(t->trace_name) + 64;
+	char *text = kmalloc(room, GFP_KERNEL);
+
+	if (text)
+		snprintf(text, room, "%s enabled:%d\n",
+				t->trace_name, !!t->enabled);
+	return text;
+}
+
+/*
+ * Describe @tp using its own desc() callback if it has one, otherwise
+ * using the default description.
+ */
+static char *skbtrace_tracepoint_desc(struct skbtrace_tracepoint *tp)
+{
+	return tp->desc ? tp->desc(tp) : skbtrace_tracepoint_default_desc(tp);
+}
+
+/*
+ * Read handler of the "enabled" control file.
+ *
+ * Each call returns the description of exactly one tracepoint: the first
+ * one whose cumulative end offset lies beyond *ppos.  Returns the number
+ * of bytes copied, 0 once all tracepoints have been reported, -EINVAL if
+ * @count is too small for the next description, -EFAULT on a bad user
+ * buffer, or -ENOMEM if a description could not be allocated.
+ *
+ * Every exit after mutex_lock() goes through the unlock label so that
+ * skbtrace_lock is never left held.
+ */
+static ssize_t enabled_read(struct file *filp, char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	ssize_t ret = 0;
+	size_t offset = 0, len;
+	struct skbtrace_tracepoint *tp;
+	int af;
+	char *desc = NULL;
+
+	skbtrace_proto_load();
+
+	mutex_lock(&skbtrace_lock);
+	for (af = AF_UNSPEC; af < AF_MAX; af++) {
+		tp = af_tp_list[af];
+		while (tp && tp->trace_name) {
+			kfree(desc);
+			desc = skbtrace_tracepoint_desc(tp);
+			if (!desc) {
+				ret = -ENOMEM;
+				goto unlock;
+			}
+			len = strlen(desc);
+			offset += len;
+			/* already reported by an earlier read */
+			if (offset <= *ppos) {
+				++tp;
+				continue;
+			}
+			if (count < len) {
+				ret = -EINVAL;
+				goto unlock;
+			}
+			if (copy_to_user(buffer, desc, len)) {
+				ret = -EFAULT;
+				goto unlock;
+			}
+			*ppos += len;
+			ret = len;
+			goto unlock;
+		}
+	}
+unlock:
+	kfree(desc);
+	mutex_unlock(&skbtrace_lock);
+
+	return ret;
+}
+
+/*
+ * Find the registered tracepoint called @name across all address
+ * families.  Returns NULL if there is none.
+ */
+static struct skbtrace_tracepoint *skbtrace_lookup_tp(char *name)
+{
+	struct skbtrace_tracepoint *cur;
+	int family;
+
+	for (family = AF_UNSPEC; family < AF_MAX; family++) {
+		for (cur = af_tp_list[family]; cur && cur->trace_name; cur++) {
+			if (strcmp(name, cur->trace_name) == 0)
+				return cur;
+		}
+	}
+
+	return NULL;
+}
+
+struct skbtrace_options_context {
+	char *name;
+	char *options;
+	struct skbtrace_tracepoint *primary;
+};
+
+struct option_handler {
+	char *key;
+	int (*handler)(struct skbtrace_options_context *ctx, char *val);
+};
+
+/*
+ * Handler of the "primary=<name>" option: resolve <name> and store the
+ * result in @ctx->primary.  Returns -EINVAL for an unknown tracepoint.
+ */
+static int handle_primary_option(struct skbtrace_options_context *ctx,
+							char *val)
+{
+	struct skbtrace_tracepoint *primary = skbtrace_lookup_tp(val);
+
+	ctx->primary = primary;
+	return primary ? 0 : -EINVAL;
+}
+
+static struct option_handler common_handlers[] = {
+	{
+		.key = "primary=",
+		.handler = handle_primary_option,
+	},
+	{
+		.key = NULL,
+	},
+};
+
+/*
+ * Split @event_spec ("NAME,opt1=val1,opt2=val2,...") in place and run the
+ * matching entries of @handlers on its options.
+ *
+ * The first ',' is overwritten with NUL so that @event_spec becomes the
+ * bare name, and @ctx->options points at the text behind it (NULL if the
+ * spec has no options).  An option whose key matches a handler is removed
+ * from that text once handled; options nobody handles stay where they are.
+ *
+ * Returns 0 on success or -EINVAL if a handler rejects its value.
+ */
+static int handle_options(char *event_spec, struct option_handler
*handlers,
+					struct skbtrace_options_context *ctx)
+{
+	char *option;
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->options = strchr(event_spec, ',');
+	if (!ctx->options)
+		return 0;
+	*(ctx->options) = '\x0';
+	option = ++(ctx->options);
+
+	while (option && *option) {
+		char *end;
+		struct option_handler *h;
+
+		/* isolate the current option while it is being matched */
+		end = strchr(option, ',');
+		if (end)
+			*end = '\x0';
+		h = &handlers[0];
+		while (h->key) {
+			/* key must be a prefix of the option */
+			if (strstr(option, h->key) == option) {
+				int ret;
+				char *val;
+
+				val = option + strlen(h->key);
+				ret = h->handler(ctx, val);
+				if (!ret)
+					break;
+				else
+					return -EINVAL;
+			}
+			h++;
+		}
+		if (!h->key) {
+			/* unhandled: restore the separator and move on */
+			if (end) {
+				*end = ',';
+				option = end + 1;
+			} else
+				break;
+		} else {
+			/* handled: cut it out; option now names the next one */
+			if (end) {
+				memmove(option, end + 1, strlen(end + 1) + 1);
+			} else
+				*option = '\x0';
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Enable @tp: run its enable() hook, register its probe and, on success,
+ * attach it to the primary tracepoint selected in @ctx (if any).
+ *
+ * Returns -EBUSY if @tp is already enabled, otherwise the result of
+ * tracepoint_probe_register().  When registration fails the disable()
+ * hook is run to undo enable().
+ */
+static int __enable_tp(struct skbtrace_tracepoint *tp,
+				struct skbtrace_options_context *ctx)
+{
+	int err;
+
+	if (tp->enabled)
+		return -EBUSY;
+
+	if (tp->enable)
+		tp->enable(tp);
+
+	err = tracepoint_probe_register(tp->trace_name, tp->probe, tp);
+	if (err) {
+		if (tp->disable)
+			tp->disable(tp);
+		return err;
+	}
+
+	tp->primary = ctx->primary;
+	if (tp->primary)
+		tp->primary->nr_secondary++;
+	tp->enabled = 1;
+	return 0;
+}
+
+/*
+ * Disable @tp: unregister its probe, run its disable() hook and detach it
+ * from its primary tracepoint.
+ *
+ * Returns -EINVAL if @tp is not enabled, or the error returned by
+ * tracepoint_probe_unregister(), in which case @tp stays enabled.
+ */
+static int __disable_tp(struct skbtrace_tracepoint *tp)
+{
+	int ret;
+
+	if (!tp->enabled)
+		return -EINVAL;
+
+	ret = tracepoint_probe_unregister(tp->trace_name, tp->probe, tp);
+	if (ret)
+		return ret;
+
+	if (tp->disable)
+		tp->disable(tp);
+	if (tp->primary) {
+		/* release the primary's secondary buffer and drop our count */
+		secondary_buffer_put(&tp->primary->sec_buffer);
+		tp->primary->nr_secondary--;
+	}
+	tp->enabled = 0;
+	return 0;
+}
+
+/*
+ * Enable the tracepoint described by @event_spec
+ * ("NAME,opt1=val1,..."); the string is modified in place.
+ *
+ * Returns 0 on success, -EINVAL for an unknown or already enabled
+ * tracepoint or a rejected option, or the error of channel creation,
+ * setup_options() or probe registration.
+ *
+ * NOTE(review): the early "goto unlock" exits leave channels that this
+ * call just created in place - confirm that this is intended.
+ */
+static int skbtrace_enable_tp(char *event_spec)
+{
+	struct skbtrace_options_context ctx;
+	int ret;
+	struct skbtrace_tracepoint *tp;
+
+	/* consumes the common options, leaves the rest in ctx.options */
+	ret = handle_options(event_spec, common_handlers, &ctx);
+	if (ret)
+		return ret;
+	ctx.name = event_spec;
+
+	mutex_lock(&skbtrace_lock);
+	/* the relay channels are created together with the first tracepoint */
+	if (!nr_skbtrace_enabled_tp) {
+		ret = create_channels();
+		if (ret)
+			goto unlock;
+	}
+
+	tp = skbtrace_lookup_tp(ctx.name);
+	if (!tp || tp->enabled) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/* remaining options belong to the tracepoint itself */
+	if (ctx.options && tp->setup_options) {
+		ret = tp->setup_options(tp, ctx.options);
+		if (ret)
+			goto unlock;
+	}
+
+	ret = __enable_tp(tp, &ctx);
+
+	/* nothing enabled at all: the channels are not needed */
+	if (ret && !nr_skbtrace_enabled_tp)
+		destroy_channels();
+	else if (!ret)
+		++nr_skbtrace_enabled_tp;
+
+unlock:
+	mutex_unlock(&skbtrace_lock);
+	return ret;
+}
+
+/*
+ * Handler of the "-*" command, which has two meanings:
+ *
+ *   (0) while tracepoints are enabled, it disables all of them and
+ *       flushes the channels;
+ *   (1) with no tracepoint enabled, it drops the filters, starts a new
+ *       session and removes all channels.
+ *
+ * The enabled count is tested and the channels are destroyed under
+ * skbtrace_lock, as destroy_channels() requires.
+ *
+ * Returns 0 on success, or the first error reported while disabling a
+ * tracepoint that was enabled.  Tracepoints that are not enabled are
+ * skipped and do not influence the result.
+ */
+static int skbtrace_disable_all_tp(void)
+{
+	int ret = 0, af;
+	struct skbtrace_tracepoint *tp;
+
+	mutex_lock(&skbtrace_lock);
+
+	if (!nr_skbtrace_enabled_tp) {
+		skbtrace_filters_clean();
+		++skbtrace_session;
+		destroy_channels();
+		mutex_unlock(&skbtrace_lock);
+		return 0;
+	}
+
+	for (af = AF_UNSPEC; af < AF_MAX; af++) {
+		for (tp = af_tp_list[af]; tp && tp->trace_name; tp++) {
+			int err;
+
+			if (!tp->enabled)
+				continue;
+
+			err = __disable_tp(tp);
+			if (!err)
+				--nr_skbtrace_enabled_tp;
+			else if (!ret)
+				ret = err;	/* report the first failure */
+		}
+	}
+	mutex_unlock(&skbtrace_lock);
+	flush_channels();
+
+	return ret;
+}
+
+/*
+ * Write handler of the "enabled" control file.  The user buffer must hold
+ * one of the following strings:
+ *	(0) "TRACE_NAME,opt1=val1,opt2=val2,..." to enable one skbtrace event
+ *	(1) "-*" to disable all skbtrace events
+ *
+ * Returns @count on success or a negative errno.
+ */
+static ssize_t enabled_write(struct file *filp, const char __user *buffer,
+			     size_t count, loff_t *ppos)
+{
+	char spec[TRACE_SPEC_MAX_LEN+1];
+	int err;
+
+	skbtrace_proto_load();
+
+	if (count >= TRACE_SPEC_MAX_LEN)
+		return -EINVAL;
+	if (copy_from_user(spec, buffer, count))
+		return -EFAULT;
+	spec[count] = '\x0';
+
+	if (!strcmp("-*", spec))
+		err = skbtrace_disable_all_tp();
+	else
+		err = skbtrace_enable_tp(spec);
+
+	if (err)
+		return err;
+	return count;
+}
+
+static int kmod_open(struct inode *inodep, struct file *filp)
+{
+	__module_get(THIS_MODULE);
+	return 0;
+}
+
+static int kmod_release(struct inode *inodep, struct file *filp)
+{
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+static const struct file_operations enabled_fops = {
+	.owner =	THIS_MODULE,
+	.open =		kmod_open,
+	.release =	kmod_release,
+	.read =		enabled_read,
+	.write =	enabled_write,
+};
+
+/*
+ * Read handler of the "dropped" control file: reports the drop counters
+ * summed over all possible CPUs as "<HW> <SI> <SC>\n".
+ */
+static ssize_t dropped_read(struct file *filp, char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	unsigned long total[NR_CHANNELS] = {0, 0, 0};
+	char text[256];
+	int cpu, len;
+
+	for_each_possible_cpu(cpu) {
+		total[HW] += skbtrace_dropped[HW][cpu];
+		total[SI] += skbtrace_dropped[SI][cpu];
+		total[SC] += skbtrace_dropped[SC][cpu];
+	}
+
+	/* three unsigned longs always fit, so len is the string length */
+	len = snprintf(text, sizeof(text), "%lu %lu %lu\n",
+			total[HW], total[SI], total[SC]);
+
+	return simple_read_from_buffer(buffer, count, ppos, text, len);
+}
+
+static ssize_t dropped_write(struct file *filp, const char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	memset(skbtrace_dropped, 0, sizeof(skbtrace_dropped));
+	return count;
+}
+
+static const struct file_operations dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		kmod_open,
+	.release =	kmod_release,
+	.read =		dropped_read,
+	.write =	dropped_write,
+};
+
+/* Read handler of the "version" control file: SKBTRACE_VERSION + newline. */
+static ssize_t version_read(struct file *filp, char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	static const char version[] = SKBTRACE_VERSION "\n";
+
+	/* sizeof() counts the terminating NUL, which is not sent */
+	return simple_read_from_buffer(buffer, count, ppos,
+				       version, sizeof(version) - 1);
+}
+
+static const struct file_operations version_fops = {
+	.owner =	THIS_MODULE,
+	.open =		kmod_open,
+	.release =	kmod_release,
+	.read =		version_read,
+};
+
+/* Common read helper of the subbuf_* files: reports @which as "<n>\n". */
+static ssize_t subbuf_x_read(struct file *filp, char __user *buffer,
+			    size_t count, loff_t *ppos, int which)
+{
+	char text[24];
+	int len;
+
+	len = sprintf(text, "%d\n", which);
+	return simple_read_from_buffer(buffer, count, ppos, text, len);
+}
+
+/*
+ * Common write helper of the subbuf_* files: parse a decimal integer from
+ * the user buffer and store it in *@which if it lies within
+ * [@min_val, @max_val].
+ *
+ * Returns @count on success, -EBUSY while any tracepoint is enabled,
+ * -EINVAL for a missing, oversized, unparsable or out-of-range value, or
+ * -EFAULT on a bad user buffer.
+ */
+static ssize_t subbuf_x_write(struct file *filp, const char __user *buffer,
+			    size_t count, loff_t *ppos,
+			    int *which, int min_val, int max_val)
+{
+	char text[24] = { 0 };
+	int value;
+
+	if (nr_skbtrace_enabled_tp)
+		return -EBUSY;
+
+	/* leave room for the terminating NUL */
+	if (!buffer || count >= sizeof(text))
+		return -EINVAL;
+	if (copy_from_user(text, buffer, count))
+		return -EFAULT;
+
+	if (sscanf(text, "%d", &value) != 1 ||
+	    value < min_val || value > max_val)
+		return -EINVAL;
+
+	*which = value;
+	return count;
+}
+
+static ssize_t subbuf_nr_read(struct file *filp, char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	return subbuf_x_read(filp, buffer, count, ppos, subbuf_nr);
+}
+
+static ssize_t subbuf_nr_write(struct file *filp, const char __user
*buffer,
+			    size_t count, loff_t *ppos)
+{
+	return subbuf_x_write(filp, buffer, count, ppos, &subbuf_nr,
+			SKBTRACE_MIN_SUBBUF_NR, SKBTRACE_MAX_SUBBUF_NR);
+}
+
+static const struct file_operations subbuf_nr_fops = {
+	.owner =	THIS_MODULE,
+	.open =		kmod_open,
+	.release =	kmod_release,
+	.read =		subbuf_nr_read,
+	.write =	subbuf_nr_write,
+};
+
+static ssize_t subbuf_size_read(struct file *filp, char __user *buffer,
+			    size_t count, loff_t *ppos)
+{
+	return subbuf_x_read(filp, buffer, count, ppos, subbuf_size);
+}
+
+static ssize_t subbuf_size_write(struct file *filp, const char __user
*buffer,
+			    size_t count, loff_t *ppos)
+{
+	return subbuf_x_write(filp, buffer, count, ppos, &subbuf_size,
+			SKBTRACE_MIN_SUBBUF_SIZE, SKBTRACE_MAX_SUBBUF_SIZE);
+}
+
+static const struct file_operations subbuf_size_fops = {
+	.owner =	THIS_MODULE,
+	.open =		kmod_open,
+	.release =	kmod_release,
+	.read =		subbuf_size_read,
+	.write =	subbuf_size_write,
+};
+
+/*
+ * Fill this CPU's scratch skb from the timewait socket @tw so that a
+ * socket filter can be run on it.
+ *
+ * Returns the scratch skb with bottom halves still disabled, or NULL with
+ * them enabled again on the paths handled here: no ops or no
+ * tw_filter_skb() callback for the family, or the scratch skb could not
+ * be allocated.  If the callback itself fails, the skb is handed to
+ * skbtrace_put_twsk_filter_skb() and NULL is returned.
+ *
+ * The callback that is tested for NULL is the one that is called
+ * (tw_filter_skb), not the full-socket filter_skb.
+ */
+struct sk_buff* skbtrace_get_twsk_filter_skb(struct inet_timewait_sock *tw)
+{
+	struct skbtrace_ops *ops;
+	struct sk_buff **p_skb;
+
+	local_bh_disable();
+
+	ops = skbtrace_ops_get(tw->tw_family);
+	if (!ops || !ops->tw_filter_skb)
+		goto out_enable;
+
+	p_skb = per_cpu_ptr(sock_filter_skb, smp_processor_id());
+	if (unlikely(!*p_skb)) {
+		/* first use on this CPU: allocate the scratch skb now */
+		*p_skb = alloc_skb(1500, GFP_ATOMIC);
+		if (!*p_skb)
+			goto out_enable;
+	}
+
+	if (ops->tw_filter_skb(tw, *p_skb) < 0) {
+		skbtrace_put_twsk_filter_skb(*p_skb);
+		return NULL;
+	}
+
+	return *p_skb;
+
+out_enable:
+	local_bh_enable();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(skbtrace_get_twsk_filter_skb);
+
+/*
+ * Fill this CPU's scratch skb from the socket @sk so that a socket filter
+ * can be run on it.
+ *
+ * Returns the scratch skb with bottom halves still disabled, or NULL with
+ * them enabled again on the paths handled here: no ops or no filter_skb()
+ * callback for the family, or the scratch skb could not be allocated.  If
+ * the callback itself fails, the skb is handed to
+ * skbtrace_put_sock_filter_skb() and NULL is returned.
+ */
+struct sk_buff* skbtrace_get_sock_filter_skb(struct sock *sk)
+{
+	struct skbtrace_ops *ops;
+	struct sk_buff **slot;
+
+	local_bh_disable();
+
+	ops = skbtrace_ops_get(sk->sk_family);
+	if (!ops || !ops->filter_skb)
+		goto out_enable;
+
+	slot = per_cpu_ptr(sock_filter_skb, smp_processor_id());
+	if (unlikely(!*slot)) {
+		/* first use on this CPU: allocate the scratch skb now */
+		*slot = alloc_skb(1500, GFP_ATOMIC);
+		if (!*slot)
+			goto out_enable;
+	}
+
+	if (ops->filter_skb(sk, *slot) < 0) {
+		skbtrace_put_sock_filter_skb(*slot);
+		return NULL;
+	}
+
+	return *slot;
+
+out_enable:
+	local_bh_enable();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(skbtrace_get_sock_filter_skb);
+
+/*
+ * Copy the filter program @fprog out to user space.
+ *
+ * @buffer must hold a struct sock_fprog whose filter member points at a
+ * user array with room for fprog->len instructions.  The len member is
+ * written back and the instructions are copied to that array.
+ *
+ * The members are addressed with offsetof(): the filter pointer does not
+ * sit directly behind the 16-bit len member because of alignment padding.
+ *
+ * Returns the size of the header plus the instructions, -EINVAL if no
+ * filter is set or @count is too small, or -EFAULT on a bad user pointer.
+ */
+static ssize_t sk_filter_read(struct sock_fprog *fprog, char __user *buffer,
+							size_t count)
+{
+	size_t sz_filter;
+	struct sock_filter __user *user_filter;
+
+	if (!fprog || !fprog->filter)
+		return -EINVAL;
+	sz_filter = fprog->len * sizeof(struct sock_filter);
+	if (count < sizeof(struct sock_fprog) + sz_filter)
+		return -EINVAL;
+
+	if (copy_to_user(buffer + offsetof(struct sock_fprog, len),
+			&fprog->len, sizeof(fprog->len)))
+		return -EFAULT;
+
+	/* the destination array is named by the caller's own header */
+	if (copy_from_user(&user_filter,
+			buffer + offsetof(struct sock_fprog, filter),
+			sizeof(user_filter)))
+		return -EFAULT;
+	if (copy_to_user(user_filter, fprog->filter, sz_filter))
+		return -EFAULT;
+
+	return sizeof(struct sock_fprog) + sz_filter;
+}
+
+/*
+ * Install a new filter program from user space.
+ *
+ * @buffer holds a struct sock_fprog; its instructions are copied into a
+ * kernel buffer kept in @sk_fprog and compiled into *@sk_filter.  Only
+ * when all of that succeeded is skbtrace_filters_enabled incremented.
+ *
+ * Every failure resets @sk_fprog and *@sk_filter, so no half-initialised
+ * program (in particular no user pointer in sk_fprog->filter) is left
+ * behind for a later kfree().
+ *
+ * Returns the size of the header plus the instructions, -EINVAL if @count
+ * is too small or a program is already installed, -EFAULT on a bad user
+ * pointer, -ENOMEM if the buffer cannot be allocated, or the error of
+ * sk_unattached_filter_create().
+ */
+static ssize_t sk_filter_write(struct sock_fprog *sk_fprog,
+				struct sk_filter **sk_filter,
+				const char __user *buffer, size_t count)
+{
+	int sz_filter, ret;
+	struct sock_filter __user *user_filter;
+
+	if (count < sizeof(struct sock_fprog) || sk_fprog->filter)
+		return -EINVAL;
+	if (copy_from_user(sk_fprog, buffer, sizeof(struct sock_fprog))) {
+		/* a partial copy must not leave a bogus filter pointer */
+		memset(sk_fprog, 0, sizeof(struct sock_fprog));
+		return -EFAULT;
+	}
+	sz_filter = sk_fprog->len * sizeof(struct sock_filter);
+	user_filter = sk_fprog->filter;
+
+	/* from here on sk_fprog->filter is a kernel buffer or NULL */
+	sk_fprog->filter = kzalloc(sz_filter, GFP_KERNEL);
+	if (!sk_fprog->filter) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (copy_from_user(sk_fprog->filter, user_filter, sz_filter)) {
+		ret = -EFAULT;
+		goto fail;
+	}
+
+	ret = sk_unattached_filter_create(sk_filter, sk_fprog);
+	if (ret)
+		goto fail;
+
+	static_key_slow_inc(&skbtrace_filters_enabled);
+	return sizeof(struct sock_fprog) + sz_filter;
+
+fail:
+	reset_filter(sk_fprog, sk_filter);
+	return ret;
+}
+
+static ssize_t filters_read(struct file *filp, char __user *buffer,
+			size_t count, loff_t *ppos, struct sock_fprog *fprog)
+{
+	return sk_filter_read(fprog, buffer, count);
+}
+
+static ssize_t skb_filters_read(struct file *filp, char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	return filters_read(filp, buffer, count, ppos, &skb_filter_fprog);
+}
+
+static ssize_t sock_filters_read(struct file *filp, char __user *buffer,
+						size_t count, loff_t *ppos)
+{
+	return filters_read(filp, buffer, count, ppos, &sock_filter_fprog);
+}
+
+static ssize_t filters_write(struct file *filp, const char __user *buffer,
+						size_t count, loff_t *ppos,
+			struct sock_fprog *fprog, struct sk_filter **filter)
+
+{
+	skbtrace_proto_load();
+
+	if (nr_skbtrace_enabled_tp)
+		return -EBUSY;
+	reset_filter(fprog, filter);
+	return sk_filter_write(fprog, filter, buffer, count);
+}
+
+static ssize_t skb_filters_write(struct file *filp, const char __user
*buffer,
+						size_t count, loff_t *ppos)
+{
+	return filters_write(filp, buffer, count, ppos,
+			&skb_filter_fprog, &skbtrace_skb_filter);
+}
+
+static ssize_t sock_filters_write(struct file *filp, const char __user
*buffer,
+						size_t count, loff_t *ppos)
+{
+	if (unlikely(!++skbtrace_sock_filter_id))
+		skbtrace_sock_filter_id = 1;
+	return filters_write(filp, buffer, count, ppos,
+				&sock_filter_fprog, &skbtrace_sock_filter);
+}
+
+static const struct file_operations filters_fops = {
+	.owner =	THIS_MODULE,
+	.open =		kmod_open,
+	.release =	kmod_release,
+	.read =		skb_filters_read,
+	.write =	skb_filters_write,
+};
+
+static const struct file_operations sock_filters_fops = {
+	.owner =	THIS_MODULE,
+	.open =		kmod_open,
+	.release =	kmod_release,
+	.read =		sock_filters_read,
+	.write =	sock_filters_write,
+};
+
+/*
+ * Forget the filter program @fprog and destroy the compiled filter
+ * *@filter, decrementing skbtrace_filters_enabled if a compiled filter
+ * was present.
+ */
+static void reset_filter(struct sock_fprog *fprog, struct sk_filter **filter)
+{
+	struct sk_filter *compiled = *filter;
+
+	kfree(fprog->filter);	/* kfree(NULL) is a no-op */
+	memset(fprog, 0, sizeof(struct sock_fprog));
+
+	if (!compiled)
+		return;
+
+	static_key_slow_dec(&skbtrace_filters_enabled);
+	sk_unattached_filter_destroy(compiled);
+	*filter = NULL;
+}
+
+static void skbtrace_filters_clean(void)
+{
+	reset_filter(&sock_filter_fprog, &skbtrace_sock_filter);
+	reset_filter(&skb_filter_fprog, &skbtrace_skb_filter);
+}
+
+/*
+ * Final teardown of the filter state: free both filter programs, destroy
+ * both compiled filters (decrementing skbtrace_filters_enabled for each)
+ * and release the per-CPU scratch skbs together with their per-CPU
+ * storage.
+ */
+static void clean_skbtrace_filters(void)
+{
+	unsigned int cpu;
+
+	kfree(skb_filter_fprog.filter);		/* kfree(NULL) is a no-op */
+	if (skbtrace_skb_filter) {
+		static_key_slow_dec(&skbtrace_filters_enabled);
+		sk_unattached_filter_destroy(skbtrace_skb_filter);
+	}
+
+	kfree(sock_filter_fprog.filter);
+	if (skbtrace_sock_filter) {
+		static_key_slow_dec(&skbtrace_filters_enabled);
+		sk_unattached_filter_destroy(skbtrace_sock_filter);
+	}
+
+	/* kfree_skb() ignores a NULL skb */
+	for_each_possible_cpu(cpu)
+		kfree_skb(*per_cpu_ptr(sock_filter_skb, cpu));
+
+	free_percpu(sock_filter_skb);
+}
+
+/*
+ * Initialise the filter state: pick a random socket filter id, make sure
+ * no filter is installed, and allocate one scratch skb for every CPU that
+ * is online.  CPUs that are offline get theirs lazily on first use.
+ *
+ * Returns 0 on success or -ENOMEM if the per-CPU storage or one of the
+ * scratch skbs could not be allocated; nothing stays allocated then.
+ */
+static int setup_skbtrace_filters(void)
+{
+	unsigned int cpu;
+	bool failed = false;
+
+	skbtrace_sock_filter_id = random32();
+
+	skbtrace_filters_clean();
+
+	sock_filter_skb = alloc_percpu(struct sk_buff*);
+	/* per_cpu_ptr() must not be used on a failed allocation */
+	if (!sock_filter_skb)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct sk_buff **p_skb;
+
+		p_skb = per_cpu_ptr(sock_filter_skb, cpu);
+		if (cpu_online(cpu)) {
+			*p_skb = alloc_skb(1500, GFP_KERNEL);
+			if (!*p_skb)
+				failed = true;
+		} else
+			*p_skb = NULL;
+	}
+
+	if (failed) {
+		/* frees the skbs allocated so far and the per-CPU storage */
+		clean_skbtrace_filters();
+		sock_filter_skb = NULL;
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * Module initialisation: set up the lock, the session id and the filter
+ * state, register the common events, and create the debugfs directory
+ * with its control files.
+ *
+ * Each step that fails undoes the steps completed before it, so a failed
+ * load leaves neither per-CPU skbs, nor a registered tracepoint table,
+ * nor a debugfs directory behind.
+ *
+ * Returns 0 on success, -ENOMEM if the filters or the debugfs entries
+ * cannot be set up, or -ENODEV if the common events cannot be registered.
+ */
+static int skbtrace_init(void)
+{
+	int ret;
+
+	mutex_init(&skbtrace_lock);
+	if (!skbtrace_session)
+		skbtrace_session = random32();
+
+	if (setup_skbtrace_filters() < 0)
+		return -ENOMEM;
+
+	if (skbtrace_events_common_init()) {
+		ret = -ENODEV;
+		goto err_filters;
+	}
+
+	skbtrace_dentry = debugfs_create_dir(SKBTRACE_DIR, NULL);
+	if (!skbtrace_dentry) {
+		ret = -ENOMEM;
+		goto err_proto;
+	}
+
+	if (create_controls()) {
+		ret = -ENOMEM;
+		goto err_dir;
+	}
+
+	should_load_proto = true;
+	return 0;
+
+err_dir:
+	debugfs_remove(skbtrace_dentry);
+err_proto:
+	/* the common events were registered for AF_UNSPEC */
+	skbtrace_unregister_proto(AF_UNSPEC);
+err_filters:
+	clean_skbtrace_filters();
+	return ret;
+}
+
+/*
+ * Module teardown: disable every tracepoint, remove the relay channels,
+ * the control files and the debugfs directory, then free the filter
+ * state.
+ */
+static void skbtrace_exit(void)
+{
+	skbtrace_disable_all_tp(); /* disable all enabled tracepoints */
+	skbtrace_disable_all_tp(); /* remove channels in debugfs at 2nd time */
+	/* a tracepoint that could not be disabled is only reported */
+	if (unlikely(nr_skbtrace_enabled_tp))
+		pr_err("skbtrace: failed to clean tracepoints.\n");
+	remove_controls();
+	debugfs_remove(skbtrace_dentry);
+	clean_skbtrace_filters();
+}
+
+module_init(skbtrace_init);
+module_exit(skbtrace_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/core/skbtrace-events-common.c
b/net/core/skbtrace-events-common.c
new file mode 100644
index 0000000..30a3730
--- /dev/null
+++ b/net/core/skbtrace-events-common.c
@@ -0,0 +1,68 @@
+/*
+ *  skbtrace - sk_buff trace utility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ *
+ * 2012 Li Yu <bingtian.ly@...bao.com>
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/skbtrace_api.h>
+#include <linux/skbtrace.h>
+#include <net/flow_keys.h>
+
+/*
+ * Probe of the skb_rps_info tracepoint: records the rx hash, the rx
+ * queue, the dissected flow keys, the CPU passed in by the tracepoint and
+ * the interface index of @dev for @skb.
+ *
+ * NOTE(review): the function body is opened and closed by the
+ * SKBTRACE_SKB_EVENT_BEGIN/END macros, which are defined elsewhere -
+ * check them for any filtering done before this code runs.
+ */
+static void skbtrace_skb_rps_info(struct skbtrace_tracepoint *t,
+		struct sk_buff *skb, struct net_device *dev, int cpu)
+SKBTRACE_SKB_EVENT_BEGIN
+	struct skbtrace_skb_rps_info_blk blk, *b;
+	struct flow_keys keys;
+
+	b = skbtrace_block_get(t, NULL, &blk);
+	INIT_SKBTRACE_BLOCK(&b->blk, skb,
+			skbtrace_action_skb_rps_info,
+			0,
+			sizeof(blk));
+	b->rx_hash = skb->rxhash;
+	/* rx_queue is reported as 0 when no queue was recorded */
+	if (skb_rx_queue_recorded(skb))
+		b->rx_queue = skb_get_rx_queue(skb);
+	else
+		b->rx_queue = 0;
+	skb_flow_dissect(skb, &keys);
+	b->keys.src = keys.src;
+	b->keys.dst = keys.dst;
+	b->keys.ports = keys.ports;
+	b->keys.ip_proto = keys.ip_proto;
+	b->cpu = cpu;
+	b->ifindex = dev->ifindex;
+	skbtrace_probe(t, NULL, &b->blk);
+SKBTRACE_SKB_EVENT_END
+
+static struct skbtrace_tracepoint common[] = {
+	{
+		.trace_name = "skb_rps_info",
+		.action = skbtrace_action_skb_rps_info,
+		.block_size = sizeof(struct skbtrace_skb_rps_info_blk),
+		.probe = skbtrace_skb_rps_info,
+	},
+	EMPTY_SKBTRACE_TP
+};
+
+int skbtrace_events_common_init(void)
+{
+	return skbtrace_register_proto(AF_UNSPEC, common, NULL);
+}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e33ebae..15954ae 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -70,6 +70,7 @@
 #include <asm/uaccess.h>
 #include <trace/events/skb.h>
 #include <linux/highmem.h>
+#include <linux/skbtrace.h>

 struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
@@ -700,6 +701,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->ooo_okay		= old->ooo_okay;
 	new->l4_rxhash		= old->l4_rxhash;
 	new->no_fcs		= old->no_fcs;
+#if HAVE_SKBTRACE
+	new->hit_skbtrace	= old->hit_skbtrace;
+	new->skbtrace_filtered	= old->skbtrace_filtered;
+#endif
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index a6000fb..b818961 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -132,8 +132,10 @@
 #include <net/netprio_cgroup.h>

 #include <linux/filter.h>
+#include <linux/skbtrace.h>

 #include <trace/events/sock.h>
+#include <trace/events/skbtrace_common.h>

 #ifdef CONFIG_INET
 #include <net/tcp.h>
@@ -1272,6 +1274,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,

 		sock_update_classid(sk);
 		sock_update_netprioidx(sk, current);
+		sock_skbtrace_reset(sk);
 	}

 	return sk;
@@ -1292,6 +1295,8 @@ static void __sk_free(struct sock *sk)
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
 	}

+	skbtrace_context_destroy(&sk->sk_skbtrace);
+
 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

 	if (atomic_read(&sk->sk_omem_alloc))
@@ -1440,6 +1445,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)

 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
 			net_enable_timestamp();
+
+		sock_skbtrace_reset(newsk);
 	}
 out:
 	return newsk;
@@ -2124,6 +2131,7 @@ void sk_reset_timer(struct sock *sk, struct timer_list* timer,
 {
 	if (!mod_timer(timer, expires))
 		sock_hold(sk);
+	trace_sk_timer(sk, timer, skbtrace_sk_timer_reset);
 }
 EXPORT_SYMBOL(sk_reset_timer);

@@ -2131,6 +2139,7 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer)
 {
 	if (timer_pending(timer) && del_timer(timer))
 		__sock_put(sk);
+	trace_sk_timer(sk, timer, skbtrace_sk_timer_stop);
 }
 EXPORT_SYMBOL(sk_stop_timer);

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ