[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20090924004232.GA17913@localhost.localdomain>
Date: Wed, 23 Sep 2009 20:42:32 -0400
From: Neil Horman <nhorman@...driver.com>
To: netdev@...r.kernel.org
Cc: davem@...emloft.net, eric.dumazet@...il.com
Subject: Re: [PATCH] af_packet: add interframe drop cmsg (v2)
Ok, version 2 of this patch, taking Eric's change notes into account. Again,
tested by me, using the same test harness, and it works fine.
Neil
Add ancillary data to better represent loss information
I've had a few requests recently to provide more detail regarding frame loss
during an AF_PACKET packet capture session. Specifically the requestors want to
see where in a packet sequence frames were lost, i.e. they want to see that 40
frames were lost between frames 302 and 303 in a packet capture file. In order
to do this we need:
1) The kernel to export this data to user space
2) The applications to make use of it
This patch addresses item (1). It does this by doing the following:
A) attaching ancillary data to any skb enqueued to a socket receive queue for
which frames were lost between it and the previously enqueued frame. Note I use
a ring buffer with a correlator value (the skb pointer) to do this. This was
done because the skb->cb array is exhausted already for AF_PACKET
B) For any frame dequeued that has ancillary data in the ring buffer (as
determined by the correlator value), we add a cmsg structure to the msghdr that
gets copied to user space, this cmsg structure is of cmsg_level AF_PACKET, and
cmsg_type PACKET_GAPDATA. It contains a u32 value which counts the number of
frames lost between the frame currently being received and the
frame most recently preceding it. Note this creates a situation in which if we
have packet loss followed immediately by a socket close operation we might miss
some gap information. This situation is covered by the use of the
PACKET_STATISTICS socket option (struct tpacket_stats), which provides total loss stats (from which the
final gap can be computed).
I've tested this patch myself, and it works well.
Signed-off-by: Neil Horman <nhorman@...driver.com>
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index dea7d6b..e5d200f 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -48,11 +48,13 @@ struct sockaddr_ll
#define PACKET_RESERVE 12
#define PACKET_TX_RING 13
#define PACKET_LOSS 14
+#define PACKET_GAPDATA 15
struct tpacket_stats
{
unsigned int tp_packets;
unsigned int tp_drops;
+ unsigned int tp_gap;
};
struct tpacket_auxdata
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d3d52c6..ee624fe 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -179,6 +179,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static void packet_flush_mclist(struct sock *sk);
+struct packet_gap_record {
+ struct sk_buff *skb;
+ __u32 gap;
+};
+
struct packet_sock {
/* struct sock has to be the first member of packet_sock */
struct sock sk;
@@ -197,6 +202,11 @@ struct packet_sock {
int ifindex; /* bound device */
__be16 num;
struct packet_mclist *mclist;
+ struct packet_gap_record *gaps;
+ unsigned int gap_head;
+ unsigned int gap_tail;
+ unsigned int gap_list_size;
+
#ifdef CONFIG_PACKET_MMAP
atomic_t mapped;
enum tpacket_versions tp_version;
@@ -524,6 +534,56 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
}
/*
+ * If we've lost frames since the last time we queued one to the
+ * sk_receive_queue, we need to record it here.
+ * This must be called under the protection of the socket lock
+ * to prevent racing with other softirqs and user space
+ */
+/*
+ * Record a pending gap (count of frames dropped since the last successful
+ * enqueue) against @skb, using the skb pointer as the correlator that
+ * check_packet_gap() will match on at dequeue time.
+ *
+ * NOTE(review): the comment above says "socket lock", but the only caller
+ * (packet_rcv) invokes this under spin_lock(&sk->sk_receive_queue.lock),
+ * while the dequeue side (check_packet_gap via packet_recvmsg) runs under
+ * lock_sock() — two different locks. Confirm the intended synchronization.
+ *
+ * NOTE(review): the skb pointer is used as a correlator; if an skb is freed
+ * and the same address is recycled for a later frame, a stale ring entry
+ * could mis-correlate — worth confirming this cannot happen here.
+ */
+static void record_packet_gap(struct sk_buff *skb, struct packet_sock *po)
+{
+ /*
+ * do nothing if there is no gap
+ */
+ if (!po->stats.tp_gap)
+ return;
+
+ /*
+ * If there is, check the gap list tail to make sure we
+ * have an open entry
+ */
+ if (po->gaps[po->gap_tail].skb != NULL) {
+ /* Ring full: tp_gap is deliberately NOT reset here, so the
+ * accumulated loss count carries over and is recorded against
+ * a later skb once a slot frees up. */
+ pr_debug("packet socket gap list is full!\n");
+ return;
+ }
+
+ /*
+ * We have a free entry, record it
+ */
+ po->gaps[po->gap_tail].skb = skb;
+ po->gaps[po->gap_tail].gap = po->stats.tp_gap;
+ if (++po->gap_tail == po->gap_list_size)
+ po->gap_tail = 0;
+ po->stats.tp_gap = 0;  /* gap now attributed to this skb */
+ return;
+
+}
+
+/*
+ * Consume the gap record correlated with @skb, if one exists at the head
+ * of the ring. Returns the number of frames lost immediately before @skb
+ * was enqueued, or 0 if no gap was recorded for it. Called from
+ * packet_recvmsg() under lock_sock() so the ring head is not raced by
+ * concurrent receivers.
+ *
+ * NOTE(review): the enqueue side (record_packet_gap) runs under the
+ * sk_receive_queue spinlock, not lock_sock() — verify the head/tail
+ * indices cannot race between softirq enqueue and process-context dequeue.
+ */
+static __u32 check_packet_gap(struct sk_buff *skb, struct packet_sock *po)
+{
+ __u32 gap = 0;
+
+ /* Head entry not correlated with this skb: no loss preceded it. */
+ if (po->gaps[po->gap_head].skb != skb)
+ return 0;
+
+ /* Consume the entry and advance the ring head (with wraparound). */
+ gap = po->gaps[po->gap_head].gap;
+ po->gaps[po->gap_head].skb = NULL;
+ if (++po->gap_head == po->gap_list_size)
+ po->gap_head = 0;
+ return gap;
+}
+
+
+/*
This function makes lazy skb cloning in hope that most of packets
are discarded by BPF.
@@ -626,6 +686,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
spin_lock(&sk->sk_receive_queue.lock);
po->stats.tp_packets++;
+ record_packet_gap(skb, po);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, skb->len);
@@ -634,6 +695,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
spin_lock(&sk->sk_receive_queue.lock);
po->stats.tp_drops++;
+ po->stats.tp_gap++;
spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore:
@@ -811,6 +873,7 @@ drop:
ring_is_full:
po->stats.tp_drops++;
+ po->stats.tp_gap++;
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, 0);
@@ -1223,6 +1286,8 @@ static int packet_release(struct socket *sock)
skb_queue_purge(&sk->sk_receive_queue);
sk_refcnt_debug_release(sk);
+ kfree(po->gaps);
+
sock_put(sk);
return 0;
}
@@ -1350,6 +1415,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
struct packet_sock *po;
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
+ unsigned int num_records = PAGE_SIZE/sizeof(struct packet_gap_record);
if (!capable(CAP_NET_RAW))
return -EPERM;
@@ -1374,6 +1440,14 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sk->sk_family = PF_PACKET;
po->num = proto;
+ err = -ENOMEM;
+ po->gaps = kzalloc(sizeof(struct packet_gap_record)*num_records,
+ GFP_KERNEL);
+ if (!po->gaps)
+ goto out_free;
+ po->gap_tail = po->gap_head = 0;
+ po->gap_list_size = num_records;
+
sk->sk_destruct = packet_sock_destruct;
sk_refcnt_debug_inc(sk);
@@ -1402,6 +1476,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sock_prot_inuse_add(net, &packet_proto, 1);
write_unlock_bh(&net->packet.sklist_lock);
return 0;
+
+out_free:
+ sk_free(sk);
out:
return err;
}
@@ -1418,6 +1495,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
struct sk_buff *skb;
int copied, err;
struct sockaddr_ll *sll;
+ __u32 gap;
err = -EINVAL;
if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
@@ -1496,6 +1574,12 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
}
+ lock_sock(sk);
+ gap = check_packet_gap(skb, pkt_sk(sk));
+ release_sock(sk);
+ if (gap)
+ put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(u32), &gap);
+
/*
* Free or return the buffer as appropriate. Again this
* hides all the races and re-entrancy issues from us.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists