lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <BANLkTimYVUkUWA2XPix2nUL-=rnQKghZQA@mail.gmail.com>
Date:	Wed, 25 May 2011 19:03:08 -0400
From:	chetan loke <loke.chetan@...il.com>
To:	netdev@...r.kernel.org, loke.chetan@...il.com
Subject: [RFC 01/01]af_packet: Enhance network capture visibility

This patch is not complete and is intended to:
a) demonstrate the improvements
b) gather suggestions


Signed-off-by: Chetan Loke <lokec@....neu.edu>

-----------------------
 include/linux/if_packet.h |   27 ++
 net/packet/af_packet.c    |  637 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 632 insertions(+), 32 deletions(-)

-----------------------

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 72bfa5a..1452f47 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -55,6 +55,17 @@ struct tpacket_stats {
 	unsigned int	tp_drops;
 };

+struct tpacket_stats_v3 {
+	unsigned int	tp_packets;
+	unsigned int	tp_drops;
+	unsigned int	tp_plug_q_cnt;
+};
+
+union tpacket_stats_u {
+	struct tpacket_stats stats1;
+	struct tpacket_stats_v3 stats3;
+};
+
 struct tpacket_auxdata {
 	__u32		tp_status;
 	__u32		tp_len;
@@ -102,11 +113,27 @@ struct tpacket2_hdr {
 	__u16		tp_vlan_tci;
 };

+
+struct tpacket3_hdr {
+	__u32		tp_status;
+	__u32		tp_len;
+	__u32		tp_snaplen;
+	__u16		tp_mac;
+	__u16		tp_net;
+	__u32		tp_sec;
+	__u32		tp_nsec;
+	__u16		tp_vlan_tci;
+	long		tp_next_offset;
+};
+
 #define TPACKET2_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket2_hdr))
+ sizeof(struct sockaddr_ll))

+#define TPACKET3_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket3_hdr))
+ sizeof(struct sockaddr_ll))
+
 enum tpacket_versions {
 	TPACKET_V1,
 	TPACKET_V2,
+	TPACKET_V3
 };

 /*
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 91cb1d7..8e0bc51 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -164,6 +164,57 @@ struct packet_mreq_max {
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring);

+
+#define V3_ALIGNMENT	(4)
+#define ALIGN_4(x)		(((x)+V3_ALIGNMENT-1)&~(V3_ALIGNMENT-1))
+
+
+struct bd_ts{
+	unsigned int ts_sec;
+	union {
+		unsigned int u1_i1[1];
+		struct {
+			unsigned int ts_usec;
+		}ts_s1;
+		struct {
+			unsigned int ts_nsec;
+		}ts_s2;
+	} ts_u1;
+}__attribute__ ((__packed__));
+
+struct  block_desc{
+	uint32_t		block_status;
+	uint32_t		num_pkts;
+	struct bd_ts	ts_first_pkt;
+	struct bd_ts	ts_last_pkt;
+	long			offset_to_first_pkt;
+	uint32_t		seq_num;
+} __attribute__ ((__packed__));
+
+struct kbdq_core{
+	struct pgv		*pkbdq;
+	unsigned int	hdrlen;
+	unsigned char	reset_pending_on_curr_blk;
+	unsigned char   delete_blk_timer;
+	unsigned short	kactive_blk_num;
+	unsigned short	hole_bytes_size;
+	char			*pkblk_start;
+	char			*pkblk_end;
+	int				kblk_size;
+	unsigned int	knum_blocks;
+	unsigned int	knxt_seq_num;
+	char			*prev;
+	char			*nxt_offset;
+	/* last_kactive_blk_num:
+	 * trick to see if user-space has caught up
+	 * in order to avoid refreshing timer when every single pkt arrives.
+	 */
+	unsigned short	last_kactive_blk_num;
+#define DEFAULT_PRB_RETIRE_TMO	(4)
+	unsigned short  retire_blk_tmo;
+	struct timer_list retire_blk_timer;
+};
+
 #define PGV_FROM_VMALLOC 1
 struct pgv {
 	char *buffer;
@@ -179,11 +230,16 @@ struct packet_ring_buffer {
 	unsigned int		pg_vec_order;
 	unsigned int		pg_vec_pages;
 	unsigned int		pg_vec_len;
-
+	struct kbdq_core			prb_bdqc;
 	atomic_t		pending;
 };

 struct packet_sock;
+
+static void prb_open_block(struct kbdq_core *pkc1,struct block_desc *pbd1);
+static void prb_retire_rx_blk_timer_expired(unsigned long data);
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *pkc);
+static void prb_init_blk_timer(struct packet_sock *po,struct
kbdq_core *pkc,void (*func) (unsigned long));
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

 static void packet_flush_mclist(struct sock *sk);
@@ -192,6 +248,7 @@ struct packet_sock {
 	/* struct sock has to be the first member of packet_sock */
 	struct sock		sk;
 	struct tpacket_stats	stats;
+	union  tpacket_stats_u	stats_u;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
 	int			copy_thresh;
@@ -223,7 +280,14 @@ struct packet_skb_cb {

 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

-static inline __pure struct page *pgv_to_page(void *addr)
+#define GET_PBDQC_FROM_RB(x)				((struct kbdq_core *)(&(x)->prb_bdqc))
+#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	((struct block_desc
*)((x)->pkbdq[(x)->kactive_blk_num].buffer))
+#define GET_PBLOCK_DESC(x,bid)				((struct block_desc
*)((x)->pkbdq[(bid)].buffer))
+
+#define INCREMENT_PRB_BLK_NUM(x) \
+	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? ((x)->kactive_blk_num+1) : 0)
+
+static inline struct page *pgv_to_page(void *addr)
 {
 	if (is_vmalloc_addr(addr))
 		return vmalloc_to_page(addr);
@@ -248,8 +312,12 @@ static void __packet_set_status(struct
packet_sock *po, void *frame, int status)
 		h.h2->tp_status = status;
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
+	case TPACKET_V3:
+		pr_err("<%s> TPACKET version not supported.Who is calling?.Dumping
stack.\n",__func__);
+		dump_stack();
+		break;
 	default:
-		pr_err("TPACKET version not supported\n");
+		pr_err("<%s> TPACKET version not supported\n",__func__);
 		BUG();
 	}

@@ -274,6 +342,10 @@ static int __packet_get_status(struct packet_sock
*po, void *frame)
 	case TPACKET_V2:
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
+	case TPACKET_V3:
+		pr_err("<%s> TPACKET version:%d not supported.Dumping
stack.\n",__func__,po->tp_version);
+		dump_stack();
+		return 0;
 	default:
 		pr_err("TPACKET version not supported\n");
 		BUG();
@@ -309,9 +381,234 @@ static inline void *packet_current_frame(struct
packet_sock *po,
 		struct packet_ring_buffer *rb,
 		int status)
 {
-	return packet_lookup_frame(po, rb, rb->head, status);
+	switch (po->tp_version) {
+		case TPACKET_V1:
+		case TPACKET_V2:
+			return packet_lookup_frame(po, rb, rb->head, status);
+		case TPACKET_V3:
+			pr_err("<%s> TPACKET version:%d not supported.Dumping
stack.\n",__func__,po->tp_version);
+			dump_stack();
+			return 0;
+		default:
+			pr_err("<%s> TPACKET version not supported\n",__func__);
+			BUG();
+			return 0;
+	}
+}
+
+static void prb_flush_block(struct block_desc *pbd1)
+{
+	flush_dcache_page(pgv_to_page(pbd1));
+}
+
+/* Side effect:
+ * 1)flush the block-header
+ * 2)Increment active_blk_num
+ */
+static void prb_close_block(struct kbdq_core *pkc1,struct block_desc *pbd1)
+{
+	
+	//long size = pkc1->pkblk_end - pkc1->nxt_offset;
+	pbd1->block_status = TP_STATUS_USER;
+
+	/* Get the ts of the last pkt */
+	if (pbd1->num_pkts) {
+		struct tpacket3_hdr *ph = (struct tpacket3_hdr *)pkc1->prev;
+		pbd1->ts_last_pkt.ts_sec		= ph->tp_sec;
+		pbd1->ts_last_pkt.ts_s2.ts_nsec	= ph->tp_nsec;
+	} else {
+		/* Ok, we tmo'd - so get the current time */
+		struct timespec ts;
+		getnstimeofday(&ts);
+		pbd1->ts_last_pkt.ts_sec		= ts.tp_sec;
+		pbd1->ts_last_pkt.ts_s2.ts_nsec	= ts.tp_nsec;
+	}
+
+	prb_flush_block(pbd1);
+	pkc1->kactive_blk_num = INCREMENT_PRB_BLK_NUM(pkc1);
+}
+
+static inline void prb_unplug_queue(struct kbdq_core *pkc) {
+	pkc->reset_pending_on_curr_blk=0;
+}
+
+/* Side effect of opening a block:
+ * 1) prb_queue is unplugged.
+ * 2) retire_blk_timer is refreshed.
+ */
+static void prb_open_block(struct kbdq_core *pkc1,struct block_desc *pbd1)
+{
+	struct timespec ts;
+
+	pbd1->block_status	= TP_STATUS_KERNEL;
+	getnstimeofday(&ts);
+	pbd1->num_pkts		= 0;
+	pbd1->ts_first_pkt.ts_sec				= ts.tv_sec;
+	pbd1->ts_first_pkt.ts_u1.ts_s2.ts_nsec	= ts.tv_nsec;
+	pkc1->pkblk_start	= (char *)pbd1;
+	pbd1->seq_num		= pkc1->knxt_seq_num++;
+	pkc1->nxt_offset	= (char *)(pkc1->pkblk_start + sizeof(struct block_desc));
+	
+	pbd1->offset_to_first_pkt    = (long)sizeof(struct block_desc);
+
+	pkc1->prev			= pkc1->nxt_offset;
+	pkc1->pkblk_end		= pkc1->pkblk_start + pkc1->kblk_size;
+
+	prb_unplug_queue(pkc1);
+	_prb_refresh_rx_retire_blk_timer(pkc1);
+}
+
+static inline void prb_plug_queue(struct kbdq_core *pkc,struct
packet_sock *po) {
+	pkc->reset_pending_on_curr_blk=1;
+	po->stats_u.stats3.tp_plug_q_cnt++;
+}
+
+static void *prb_try_next_block(struct kbdq_core *pkc,struct packet_sock *po)
+{
+	struct block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* close current block */
+	if (likely(TP_STATUS_KERNEL == pbd->block_status)) {
+		prb_close_block(pkc,pbd);
+	} else {
+		printk("<%s> ERROR - pbd[%d]:%p\n",__func__,pkc->kactive_blk_num,pbd);
+		BUG();
+	}
+
+	/* Get the next block num */
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+	
+	smp_mb();
+	
+	/* If the curr_block is currently in_use then plug the queue */
+	if (TP_STATUS_USER == pbd->block_status) {
+		    prb_plug_queue(pkc,po);
+			return NULL;
+	}
+	/* open next block */
+	prb_open_block(pkc,pbd);
+	return (void *)pkc->nxt_offset;
+}
+
+#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN_4((length)))
+
+static void prb_fill_curr_block(char *curr,struct kbdq_core
*pkc,struct block_desc *pbd,unsigned int len)
+{
+	struct tpacket3_hdr *ppd;
+	struct tpacket3_hdr *prev;
+
+	ppd  = (struct tpacket3_hdr *)curr;
+	prev = (struct tpacket3_hdr *)pkc->prev;
+	/* lets do pd_s1 for for V4 header */
+	//ppd->pd_u1.pd_s1.nxt_offset = 0;
+	//((struct tpacket3_hdr *)pkc->prev)->pd_u1.pd_s1.next_offset =
(char *)ppd - pkc->prev;
+	ppd->tp_next_offset = 0;
+	if (pkc->prev > (char *)ppd) {
+		printk("<%s> curr:0x%p len:%d pkc->prev:%p \n",__func__,curr,len,pkc->prev);
+		BUG();
+	}
+	prev->tp_next_offset = (long)ppd - (long)pkc->prev;
+	pkc->prev = curr;
+	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
+	pbd->num_pkts += 1;
+}
+
+static inline int prb_curr_blk_in_use(struct kbdq_core *pkc,struct
block_desc *pbd) {
+
+	return (TP_STATUS_USER == pbd->block_status);
+}
+
+static inline int prb_queue_plugged(struct kbdq_core *pkc) {
+	return pkc->reset_pending_on_curr_blk;
+}
+
+/* Assumes caller has the sk->rx_queue.lock */
+static void *__packet_lookup_frame_in_block(struct packet_ring_buffer *rb,
+		int status,unsigned int len,struct packet_sock *po)
+{
+	struct kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
+	struct block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+	char *curr, *end;
+	
+	if (prb_queue_plugged(pkc)) {
+		if (prb_curr_blk_in_use(pkc,pbd)) {
+			return NULL;
+		} else {
+			/* open-block unplugs the queue. Unplugging is a side effect */
+			prb_open_block(pkc,pbd);
+		}
+	}
+
+	smp_mb();
+
+	curr = pkc->nxt_offset;
+	end  = (char *) ( (char *)pbd + pkc->kblk_size);
+	
+	/* first try the current block */
+	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
+		prb_fill_curr_block(curr,pkc,pbd,len);
+		return (void *)curr;
+	}
+	
+	/* Then try the next block. */
+	if ((curr = (char *)prb_try_next_block(pkc,po))) {
+		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+		prb_fill_curr_block(curr,pkc,pbd,len);
+		return (void *)curr;
+	}
+
+	/* no free blocks are available - user_space hasn't caught up yet */
+	return NULL;
+}
+
+static inline void *packet_current_rx_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status, unsigned int len)
+{
+	char *curr=NULL;
+	switch (po->tp_version) {
+		case TPACKET_V1:
+		case TPACKET_V2:
+			curr = packet_lookup_frame(po, rb, rb->head, status);
+			return curr;
+		case TPACKET_V3:
+			return __packet_lookup_frame_in_block(rb, status,len,po);
+		default:
+			pr_err("<%s> TPACKET version:%d not supported\n",__func__,po->tp_version);
+			BUG();
+			return 0;
+	}
+}
+
+static inline void *prb_lookup_block(struct packet_sock *po,
+		struct packet_ring_buffer *rb,unsigned int previous,
+		int status)
+{
+	struct kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
+	struct block_desc *pbd = GET_PBLOCK_DESC(pkc,previous);
+
+	if (status != pbd->block_status)
+		return NULL;
+	return pbd;
+}
+
+static inline int prb_previous_blk_num(struct packet_ring_buffer *rb)
+{
+	unsigned int prev = rb->prb_bdqc.kactive_blk_num ?
(rb->prb_bdqc.kactive_blk_num-1) : (rb->prb_bdqc.knum_blocks-1);
+	return prev;
+}
+
+/* Assumes caller has held the rx_queue.lock */
+static inline void* __prb_previous_block(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{
+
+	unsigned int previous = prb_previous_blk_num(rb);
+	return prb_lookup_block(po,rb,previous,status);
 }

+
 static inline void *packet_previous_frame(struct packet_sock *po,
 		struct packet_ring_buffer *rb,
 		int status)
@@ -320,11 +617,38 @@ static inline void *packet_previous_frame(struct
packet_sock *po,
 	return packet_lookup_frame(po, rb, previous, status);
 }

+static inline void *packet_previous_rx_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{
+	if (po->tp_version <= TPACKET_V2)
+		return packet_previous_frame(po,rb,status);
+	
+	return __prb_previous_block(po,rb,status);
+}
+
 static inline void packet_increment_head(struct packet_ring_buffer *buff)
 {
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
 }

+static inline void packet_increment_rx_head(struct packet_sock
*po,struct packet_ring_buffer *rb)
+{
+	switch (po->tp_version) {
+		case TPACKET_V1:
+		case TPACKET_V2:
+			return packet_increment_head(rb);
+		case TPACKET_V3:
+			pr_err("<%s> TPACKET version:%d not supported.Dumping
stack.\n",__func__,po->tp_version);
+			dump_stack();
+			return;
+		default:
+			pr_err("<%s> TPACKET version not supported\n",__func__);
+			BUG();
+			return;
+	}
+}
+
 static inline struct packet_sock *pkt_sk(struct sock *sk)
 {
 	return (struct packet_sock *)sk;
@@ -663,6 +987,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct
net_device *dev,
 	union {
 		struct tpacket_hdr *h1;
 		struct tpacket2_hdr *h2;
+		struct tpacket3_hdr *h3;
 		void *raw;
 	} h;
 	u8 *skb_head = skb->data;
@@ -715,29 +1040,31 @@ static int tpacket_rcv(struct sk_buff *skb,
struct net_device *dev,
 		macoff = netoff - maclen;
 	}

-	if (macoff + snaplen > po->rx_ring.frame_size) {
-		if (po->copy_thresh &&
-		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
-		    (unsigned)sk->sk_rcvbuf) {
-			if (skb_shared(skb)) {
-				copy_skb = skb_clone(skb, GFP_ATOMIC);
-			} else {
-				copy_skb = skb_get(skb);
-				skb_head = skb->data;
+	if (po->tp_version <= TPACKET_V2) {
+		if (macoff + snaplen > po->rx_ring.frame_size) {
+			if (po->copy_thresh &&
+				atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
+				(unsigned)sk->sk_rcvbuf) {
+				if (skb_shared(skb)) {
+					copy_skb = skb_clone(skb, GFP_ATOMIC);
+				} else {
+					copy_skb = skb_get(skb);
+					skb_head = skb->data;
+				}
+				if (copy_skb)
+					skb_set_owner_r(copy_skb, sk);
 			}
-			if (copy_skb)
-				skb_set_owner_r(copy_skb, sk);
+			snaplen = po->rx_ring.frame_size - macoff;
+			if ((int)snaplen < 0)
+				snaplen = 0;
 		}
-		snaplen = po->rx_ring.frame_size - macoff;
-		if ((int)snaplen < 0)
-			snaplen = 0;
 	}
-
 	spin_lock(&sk->sk_receive_queue.lock);
-	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
+	h.raw = packet_current_rx_frame(po, &po->rx_ring,
TP_STATUS_KERNEL,(macoff+snaplen));
 	if (!h.raw)
 		goto ring_is_full;
-	packet_increment_head(&po->rx_ring);
+	if (TPACKET_V3 != po->tp_version)
+		packet_increment_rx_head(po,&po->rx_ring);
 	po->stats.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
@@ -789,6 +1116,21 @@ static int tpacket_rcv(struct sk_buff *skb,
struct net_device *dev,
 		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
 		hdrlen = sizeof(*h.h2);
 		break;
+	case TPACKET_V3:
+		/* tp_nxt_offset is already populated above. So DONT clear those
fields here */
+		h.h3->tp_len = skb->len;
+		h.h3->tp_snaplen = snaplen;
+		h.h3->tp_mac = macoff;
+		h.h3->tp_net = netoff;
+		if (skb->tstamp.tv64)
+			ts = ktime_to_timespec(skb->tstamp);
+		else
+			getnstimeofday(&ts);
+		h.h3->tp_sec  = ts.tv_sec;
+		h.h3->tp_nsec = ts.tv_nsec;
+		h.h3->tp_vlan_tci = vlan_tx_tag_get(skb);
+		hdrlen = sizeof(*h.h3);
+		break;	
 	default:
 		BUG();
 	}
@@ -804,7 +1146,8 @@ static int tpacket_rcv(struct sk_buff *skb,
struct net_device *dev,
 	else
 		sll->sll_ifindex = dev->ifindex;

-	__packet_set_status(po, h.raw, status);
+	if (po->tp_version <= TPACKET_V2)
+		__packet_set_status(po, h.raw, status);
 	smp_mb();
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
 	{
@@ -815,7 +1158,6 @@ static int tpacket_rcv(struct sk_buff *skb,
struct net_device *dev,
 			flush_dcache_page(pgv_to_page(start));
 	}
 #endif
-
 	sk->sk_data_ready(sk, 0);

 drop_n_restore:
@@ -1984,6 +2326,7 @@ packet_setsockopt(struct socket *sock, int
level, int optname, char __user *optv
 		switch (val) {
 		case TPACKET_V1:
 		case TPACKET_V2:
+		case TPACKET_V3:
 			po->tp_version = val;
 			return 0;
 		default:
@@ -2082,6 +2425,7 @@ static int packet_getsockopt(struct socket
*sock, int level, int optname,
 	struct packet_sock *po = pkt_sk(sk);
 	void *data;
 	struct tpacket_stats st;
+	union tpacket_stats_u st_u;

 	if (level != SOL_PACKET)
 		return -ENOPROTOOPT;
@@ -2094,15 +2438,25 @@ static int packet_getsockopt(struct socket
*sock, int level, int optname,

 	switch (optname) {
 	case PACKET_STATISTICS:
-		if (len > sizeof(struct tpacket_stats))
-			len = sizeof(struct tpacket_stats);
+		if (po->tp_version == TPACKET_V3) {
+			len = sizeof(struct tpacket_stats_v3);
+		} else {
+			if (len > sizeof(struct tpacket_stats))
+				len = sizeof(struct tpacket_stats);
+		}
 		spin_lock_bh(&sk->sk_receive_queue.lock);
-		st = po->stats;
+		if (po->tp_version == TPACKET_V3) {
+			memcpy(&st_u.stats3,&po->stats,sizeof(struct tpacket_stats));
+			st_u.stats3.tp_plug_q_cnt  = po->stats_u.stats3.tp_plug_q_cnt;
+			st_u.stats3.tp_packets += po->stats.tp_drops;
+			data = &st_u.stats3;
+		} else {
+			st = po->stats;
+			st.tp_packets += st.tp_drops;
+			data = &st;
+		}
 		memset(&po->stats, 0, sizeof(st));
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		st.tp_packets += st.tp_drops;
-
-		data = &st;
 		break;
 	case PACKET_AUXDATA:
 		if (len > sizeof(int))
@@ -2143,6 +2497,9 @@ static int packet_getsockopt(struct socket
*sock, int level, int optname,
 		case TPACKET_V2:
 			val = sizeof(struct tpacket2_hdr);
 			break;
+		case TPACKET_V3:
+			val = sizeof(struct tpacket3_hdr);
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -2293,7 +2650,7 @@ static unsigned int packet_poll(struct file
*file, struct socket *sock,

 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
-		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
+		if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
 			mask |= POLLIN | POLLRDNORM;
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -2396,7 +2753,6 @@ static struct pgv *alloc_pg_vec(struct
tpacket_req *req, int order)
 	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
 	if (unlikely(!pg_vec))
 		goto out;
-
 	for (i = 0; i < block_nr; i++) {
 		pg_vec[i].buffer = alloc_one_pg_vec_page(order);
 		if (unlikely(!pg_vec[i].buffer))
@@ -2412,6 +2768,197 @@ out_free_pgvec:
 	goto out;
 }

+
+static void prb_del_retire_blk_timer(struct kbdq_core *pkc)
+{
+	del_timer_sync(&pkc->retire_blk_timer);
+}
+
+static void prb_shutdown_retire_blk_timer(struct packet_sock *po, int
tx_ring,struct sk_buff_head *rb_queue)
+{
+	struct kbdq_core *pkc;
+
+	pkc	= tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+	
+	spin_lock(&rb_queue->lock);
+	pkc->delete_blk_timer=1;
+	spin_unlock(&rb_queue->lock);
+
+	prb_del_retire_blk_timer(pkc);
+}
+
+/*  Increment the blk_num and then invoke this func to refresh the timer.
+ *  We do it in this order so that if a timer is about
+ *  to fire then it will fail the blk_num check.
+ *  Assumes sk_buff_head lock is held.
+ */
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *pkc)
+{
+	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
+	mod_timer(&pkc->retire_blk_timer,jiffies+msecs_to_jiffies(pkc->retire_blk_tmo));
+}
+
+/* close current block and open next block or plug the queue */
+static inline void prb_retire_curr_block(struct kbdq_core *pkc,struct
packet_sock *po)
+{
+	prb_try_next_block(pkc,po);
+}
+
+/*
+ * Timer logic:
+ * 1) We refresh the timer only when we open a block.
+ *    By doing this we don't waste cycles refreshing the timer
+ *    on packet-by-packet basis.
+ * With a 1MB block-size, on a 1Gbps line, it will take
+ * ~8 ms to fill a block.
+ * So, if the user sets the 'tmo' to 10ms then the timer will never
fire(which is what we want)!
+ * However, the user could choose to close a block early and that's fine.
+ *
+ * But when the timer does fire, we check whether or not to refresh it.
+ * Since the tmo granularity is in msecs, it is not too expensive
+ * to refresh the timer every '8' msecs.
+ * Either the user can set the 'tmo' or we can derive it based on
+ * a) line-speed and b) block-size
+ */
+static void prb_retire_rx_blk_timer_expired(unsigned long data)
+{
+	struct packet_sock *po = (struct packet_sock *)data;
+	struct kbdq_core *pkc = &po->rx_ring.prb_bdqc;
+	unsigned short tmo;
+	unsigned int plugged;
+	struct block_desc *pbd;
+
+	spin_lock(&po->sk.sk_receive_queue.lock);
+
+	plugged = prb_queue_plugged(pkc);
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* We read the tmo so that user-space can change it anytime they want.
+	 * But, the changes will get into affect only when:
+	 * i) Either when the timer expires(this code path) or
+	 * ii)When a new block is opened.
+	 */
+	tmo = pkc->retire_blk_tmo;
+	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num &&
+		!plugged) {
+		if (TP_STATUS_KERNEL == pbd->block_status) {
+			prb_retire_curr_block(pkc,po);
+		}
+	}
+	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
+	
+	if (pkc->delete_blk_timer)
+		goto out;
+
+	if (plugged) {
+		/* Case 1. queue was plugged because user-space was lagging behind */
+		if (prb_curr_blk_in_use(pkc,pbd)) {
+			/* Ok, user-space is still behind. But we still want to refresh the timer */
+			/* if-check added for code readability */
+		} else {
+			/* Case 2. queue was plugged, user-space caught up and now the
link went idle && the timer fired.
+			 * We don't have a block to close and we cannot close the current
block because
+			 * the timer wasn't really meant for this block. So we just open
this block and restart the timer.
+			 * open-block unplugs the queue, restarts timer.
Unplugging/refreshing-timer is a side effect.
+			 */
+			prb_open_block(pkc,pbd);
+			goto out;
+		}
+	}
+
+	mod_timer(&pkc->retire_blk_timer,jiffies+msecs_to_jiffies(tmo));
+
+out:
+	spin_unlock(&po->sk.sk_receive_queue.lock);
+}
+
+static void prb_init_blk_timer(struct packet_sock *po,struct
kbdq_core *pkc,void (*func) (unsigned long))
+{
+
+	init_timer(&pkc->retire_blk_timer);
+	pkc->retire_blk_timer.data		= (long)po;
+	pkc->retire_blk_timer.function	= func;
+	pkc->retire_blk_timer.expires	= jiffies;
+}
+
+static void prb_setup_retire_blk_timer(struct packet_sock *po,int tx_ring)
+{
+	struct kbdq_core *pkc;
+
+	if (tx_ring)
+		BUG();
+
+	pkc	 = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+	prb_init_blk_timer(po,pkc,prb_retire_rx_blk_timer_expired);
+}
+
+static int prb_calc_retire_blk_tmo(struct packet_sock *po, int
blk_size_in_bytes)
+{
+	struct net_device *dev;
+	unsigned int mbits=0,msec=0,div=0,tmo=0;
+
+	dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
+	if (unlikely(dev == NULL)) {
+		return DEFAULT_PRB_RETIRE_TMO;
+	}
+
+    if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
+		struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
+
+        if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
+			switch(ecmd.speed) {
+				case SPEED_10000:
+					msec = 1;
+					div=10000/1000;
+					break;
+                case SPEED_1000:
+                    msec = 1;
+					div = 1000/1000;
+					break;
+                /* If the link speed is so low you don't really need
to care about perf anyways */
+				case SPEED_100:
+				case SPEED_10:
+				default:
+					return DEFAULT_PRB_RETIRE_TMO;
+            }
+        }
+    }
+
+	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
+
+	if (div)
+		mbits /= div;
+
+	tmo = mbits * msec;
+
+	if (div)
+		return (tmo+1);
+	return tmo;
+}
+
+static void init_prb_bdqc(struct packet_sock *po,struct
packet_ring_buffer *rb,struct pgv *pg_vec,struct tpacket_req *req,int
tx_ring)
+{
+
+	struct kbdq_core *p1 = &rb->prb_bdqc;
+	struct block_desc *pbd;
+
+	memset(p1,0x0,sizeof(*p1));
+	p1->pkbdq			= pg_vec;
+	pbd					= (struct block_desc *)pg_vec[0].buffer;
+	p1->pkblk_start		= (char *)pg_vec[0].buffer;
+	
+	p1->kblk_size		= req->tp_block_size;
+	p1->knum_blocks		= req->tp_block_nr;
+	p1->hdrlen			= po->tp_hdrlen;
+	
+	p1->last_kactive_blk_num = 0;
+	po->stats_u.stats3.tp_plug_q_cnt = 0;
+	p1->retire_blk_tmo = prb_calc_retire_blk_tmo(po,req->tp_block_size);
+
+	prb_setup_retire_blk_timer(po,tx_ring);
+	prb_open_block(p1,pbd);
+}
+
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		int closing, int tx_ring)
 {
@@ -2421,7 +2968,14 @@ static int packet_set_ring(struct sock *sk,
struct tpacket_req *req,
 	struct packet_ring_buffer *rb;
 	struct sk_buff_head *rb_queue;
 	__be16 num;
-	int err;
+	int err=-EINVAL;
+
+	/* Opening a Tx-ring is NOT supported post TPACKET_V2 */
+	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
+		pr_err("<%s> Tx-ring is not supported on version:%d.Dumping
stack.\n",__func__,po->tp_version);
+		dump_stack();
+		goto out;
+	}

 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -2447,6 +3001,9 @@ static int packet_set_ring(struct sock *sk,
struct tpacket_req *req,
 		case TPACKET_V2:
 			po->tp_hdrlen = TPACKET2_HDRLEN;
 			break;
+		case TPACKET_V3:
+			po->tp_hdrlen = TPACKET3_HDRLEN;
+			break;
 		}

 		err = -EINVAL;
@@ -2472,6 +3029,15 @@ static int packet_set_ring(struct sock *sk,
struct tpacket_req *req,
 		pg_vec = alloc_pg_vec(req, order);
 		if (unlikely(!pg_vec))
 			goto out;
+		switch (po->tp_version) {
+			case TPACKET_V3:
+				/* Transmit path is not supported. We checked it above but just
being paranoid */
+				if (!tx_ring)
+					init_prb_bdqc(po,rb,pg_vec,req,tx_ring);
+				break;
+			default:
+				break;
+		}
 	}
 	/* Done */
 	else {
@@ -2529,10 +3095,17 @@ static int packet_set_ring(struct sock *sk,
struct tpacket_req *req,
 	}
 	spin_unlock(&po->bind_lock);

+	if (closing && (po->tp_version > TPACKET_V2)) {
+		/* Because we don't support block-based V3 on tx-ring */
+		if (!tx_ring)	
+			prb_shutdown_retire_blk_timer(po,tx_ring,rb_queue);
+	}
+
 	release_sock(sk);

 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
+	
 out:
 	return err;
 }
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ