lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20171031124145.9667-6-bjorn.topel@gmail.com>
Date:   Tue, 31 Oct 2017 13:41:36 +0100
From:   Björn Töpel <bjorn.topel@...il.com>
To:     bjorn.topel@...il.com, magnus.karlsson@...el.com,
        alexander.h.duyck@...el.com, alexander.duyck@...il.com,
        john.fastabend@...il.com, ast@...com, brouer@...hat.com,
        michael.lundkvist@...csson.com, ravineet.singh@...csson.com,
        daniel@...earbox.net, netdev@...r.kernel.org
Cc:     jesse.brandeburg@...el.com, anjali.singhai@...el.com,
        rami.rosen@...el.com, jeffrey.b.shaw@...el.com,
        ferruh.yigit@...el.com, qi.z.zhang@...el.com
Subject: [RFC PATCH 05/14] packet: enable Tx support for AF_PACKET V4

From: Magnus Karlsson <magnus.karlsson@...el.com>

This commit adds egress (Tx) support for AF_PACKET V4.

Signed-off-by: Magnus Karlsson <magnus.karlsson@...el.com>
---
 include/linux/tpacket4.h | 192 +++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/af_packet.c   | 169 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 350 insertions(+), 11 deletions(-)

diff --git a/include/linux/tpacket4.h b/include/linux/tpacket4.h
index 1d4c13d472e5..ac6c721294e8 100644
--- a/include/linux/tpacket4.h
+++ b/include/linux/tpacket4.h
@@ -18,6 +18,8 @@
 #define TP4_UMEM_MIN_FRAME_SIZE 2048
 #define TP4_KERNEL_HEADROOM 256 /* Headrom for XDP */
 
+#define TP4A_FRAME_COMPLETED TP4_DESC_KERNEL /* same bit as TP4_DESC_KERNEL */
+
 enum tp4_validation {
 	TP4_VALIDATION_NONE,	/* No validation is performed */
 	TP4_VALIDATION_IDX,	/* Only address to packet buffer is validated */
@@ -402,6 +404,60 @@ static inline int tp4q_enqueue_from_array(struct tp4_packet_array *a,
 }
 
 /**
+ * tp4q_enqueue_completed_from_array - Enqueue only completed entries
+ *				       from packet array
+ *
+ * @a: Pointer to the packet array to enqueue from
+ * @dcnt: Max number of entries to enqueue
+ *
+ * Returns the number of entries successfully enqueued or a negative errno
+ * at failure.
+ **/
+static inline int tp4q_enqueue_completed_from_array(struct tp4_packet_array *a,
+						    u32 dcnt)
+{
+	struct tpacket4_desc *descs = a->items;
+	struct tp4_queue *queue = a->tp4q;
+	unsigned int next_used = queue->used_idx;
+	int done, k;
+
+	if (queue->num_free < dcnt)
+		return -ENOSPC;
+
+	/* Copy payload fields up to the first entry not yet completed */
+	for (done = 0; done < dcnt; done++) {
+		unsigned int src = (a->start + done) & a->mask;
+		unsigned int dst;
+
+		if (!(descs[src].flags & TP4A_FRAME_COMPLETED))
+			break;
+
+		dst = (next_used++) & queue->ring_mask;
+		queue->ring[dst].idx = descs[src].idx;
+		queue->ring[dst].len = descs[src].len;
+		queue->ring[dst].offset = descs[src].offset;
+		queue->ring[dst].error = descs[src].error;
+	}
+	if (done == 0)
+		return 0;
+
+	/* Order the descriptor data written above before the flag writes */
+	smp_wmb();
+
+	/* Write flags last-to-first, with TP4_DESC_KERNEL cleared */
+	for (k = done; k-- > 0; ) {
+		unsigned int dst = (queue->used_idx + k) & queue->ring_mask;
+		unsigned int src = (a->start + k) & a->mask;
+
+		queue->ring[dst].flags = descs[src].flags & ~TP4_DESC_KERNEL;
+	}
+	queue->num_free -= done;
+	queue->used_idx += done;
+
+	return done;
+}
+
+/**
  * tp4q_dequeue_to_array - Dequeue entries from tp4 queue to packet array
  *
  * @a: Pointer to the packet array to dequeue from
@@ -581,6 +637,15 @@ static inline struct tpacket4_desc *tp4q_get_desc(struct tp4_frame_set *p)
  **/
 
 /**
+ * tp4f_reset - Rewind the frame set so traversal restarts at its first frame
+ * @p: pointer to frame set; its curr index is set back to its start index
+ **/
+static inline void tp4f_reset(struct tp4_frame_set *p)
+{
+	p->curr = p->start;
+}
+
+/**
  * tp4f_next_frame - Go to next frame in frame set
  * @p: pointer to frame set
  *
@@ -597,6 +662,38 @@ static inline bool tp4f_next_frame(struct tp4_frame_set *p)
 }
 
 /**
+ * tp4f_get_frame_id - Packet buffer id of the current frame
+ * @p: pointer to frame set
+ *
+ * Returns the idx field of the descriptor the frame set is currently on
+ **/
+static inline u64 tp4f_get_frame_id(struct tp4_frame_set *p)
+{
+	return (p->pkt_arr->items + (p->curr & p->pkt_arr->mask))->idx;
+}
+
+/**
+ * tp4f_get_frame_len - Data length of the current frame
+ * @p: pointer to frame set
+ *
+ * Returns the len field of the descriptor the frame set is currently on
+ **/
+static inline u32 tp4f_get_frame_len(struct tp4_frame_set *p)
+{
+	return (p->pkt_arr->items + (p->curr & p->pkt_arr->mask))->len;
+}
+
+/**
+ * tp4f_set_error - Record an error code in the current frame's descriptor
+ * @p: pointer to frame set
+ * @errno: value stored in the descriptor's error field
+ **/
+static inline void tp4f_set_error(struct tp4_frame_set *p, int errno)
+{
+	(p->pkt_arr->items + (p->curr & p->pkt_arr->mask))->error = errno;
+}
+
+/**
  * tp4f_get_data - Gets a pointer to the frame the frame set is on
  * @p: pointer to the frame set
  *
@@ -627,6 +724,48 @@ static inline void tp4f_set_frame(struct tp4_frame_set *p, u32 len, u16 offset,
 		d->flags |= TP4_PKT_CONT;
 }
 
+/*************** PACKET OPERATIONS *******************************/
+/* A packet consists of one or more frames. Both frames and packets
+ * are represented by a tp4_frame_set. The only difference is that
+ * packet functions look at the EOP flag.
+ **/
+
+/**
+ * tp4f_get_packet_len - Length of packet
+ * @p: pointer to packet
+ *
+ * Returns the length of the packet in bytes.
+ * Resets curr pointer of packet.
+ **/
+static inline u32 tp4f_get_packet_len(struct tp4_frame_set *p)
+{
+	u32 total;
+
+	tp4f_reset(p);
+	total = tp4f_get_frame_len(p);
+	/* First frame counted above; accumulate the remaining ones */
+	while (tp4f_next_frame(p))
+		total += tp4f_get_frame_len(p);
+
+	return total;
+}
+
+/**
+ * tp4f_packet_completed - Set TP4A_FRAME_COMPLETED on every frame of a packet
+ * @p: pointer to packet; traversal is restarted from its first frame
+ **/
+static inline void tp4f_packet_completed(struct tp4_frame_set *p)
+{
+	struct tpacket4_desc *d;
+
+	tp4f_reset(p);
+
+	do {
+		d = &p->pkt_arr->items[p->curr & p->pkt_arr->mask];
+		d->flags |= TP4A_FRAME_COMPLETED;
+	} while (tp4f_next_frame(p));
+}
+
 /**************** PACKET_ARRAY FUNCTIONS ********************************/
 
 static inline struct tp4_packet_array *__tp4a_new(
@@ -815,6 +954,59 @@ static inline unsigned int tp4a_max_data_size(struct tp4_packet_array *a)
 }
 
 /**
+ * tp4a_next_packet - Get next packet in array and advance curr pointer
+ * @a: pointer to packet array
+ * @p: supplied pointer to packet structure that is filled in by function
+ *
+ * Returns true if there is a packet, false otherwise. Packet returned in *p.
+ **/
+static inline bool tp4a_next_packet(struct tp4_packet_array *a,
+				    struct tp4_frame_set *p)
+{
+	u32 remaining = a->end - a->curr;
+
+	if (!remaining)
+		return false; /* empty */
+
+	p->end = a->curr;
+	p->curr = a->curr;
+	p->start = a->curr;
+	p->pkt_arr = a;
+
+	/* XXX Sanity check for too-many-frames packets? */
+	for (;;) {
+		if (!(a->items[p->end++ & a->mask].flags & TP4_PKT_CONT))
+			break;
+		if (--remaining == 0)
+			return false; /* ran out before a non-CONT frame */
+	}
+	a->curr += p->end - p->start;
+	return true;
+}
+
+/**
+ * tp4a_flush_completed - Flushes only frames marked as completed
+ * @a: pointer to packet array
+ *
+ * Returns 0 for success and -1 for failure
+ **/
+static inline int tp4a_flush_completed(struct tp4_packet_array *a)
+{
+	u32 pending = a->curr - a->start;
+	int flushed;
+
+	if (!pending)
+		return 0; /* nothing to flush */
+
+	flushed = tp4q_enqueue_completed_from_array(a, pending);
+	if (flushed >= 0) {
+		a->start += flushed; /* skip past the enqueued entries */
+		return 0;
+	}
+	return -1;
+}
+
+/**
  * tp4a_populate - Populate an array with packets from associated tp4q
  * @a: pointer to packet array
  **/
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 830d97ff4358..444eb4834362 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2462,6 +2462,28 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	goto drop_n_restore;
 }
 
+static void packet_v4_destruct_skb(struct sk_buff *skb)
+{
+	struct packet_sock *po = pkt_sk(skb->sk);
+
+	if (likely(po->tx_ring.pg_vec)) {
+		/* Widen via unsigned long; pointers may be 32 bit */
+		u64 idx = (unsigned long)skb_shinfo(skb)->destructor_arg;
+		struct tp4_frame_set p = {.start = idx, .curr = idx,
+					  .end = idx + 1,
+					  .pkt_arr = po->tx_ring.tp4a};
+
+		spin_lock(&po->sk.sk_write_queue.lock);
+		tp4f_packet_completed(&p);
+		WARN_ON_ONCE(tp4a_flush_completed(po->tx_ring.tp4a));
+		spin_unlock(&po->sk.sk_write_queue.lock);
+
+		packet_dec_pending(&po->tx_ring);
+	}
+
+	sock_wfree(skb);
+}
+
 static void tpacket_destruct_skb(struct sk_buff *skb)
 {
 	struct packet_sock *po = pkt_sk(skb->sk);
@@ -2519,24 +2541,24 @@ static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
 }
 
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
-		void *frame, struct net_device *dev, void *data, int tp_len,
+		void *dtor_arg, struct net_device *dev, void *data, int tp_len,
 		__be16 proto, unsigned char *addr, int hlen, int copylen,
 		const struct sockcm_cookie *sockc)
 {
-	union tpacket_uhdr ph;
 	int to_write, offset, len, nr_frags, len_max;
 	struct socket *sock = po->sk.sk_socket;
 	struct page *page;
 	int err;
 
-	ph.raw = frame;
-
 	skb->protocol = proto;
 	skb->dev = dev;
 	skb->priority = po->sk.sk_priority;
 	skb->mark = po->sk.sk_mark;
-	sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
-	skb_shinfo(skb)->destructor_arg = ph.raw;
+	if (sockc) {
+		sock_tx_timestamp(&po->sk, sockc->tsflags,
+				  &skb_shinfo(skb)->tx_flags);
+	}
+	skb_shinfo(skb)->destructor_arg = dtor_arg;
 
 	skb_reserve(skb, hlen);
 	skb_reset_network_header(skb);
@@ -2840,6 +2862,126 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	return err;
 }
 
+static int packet_v4_snd(struct packet_sock *po, struct msghdr *msg)
+{
+	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
+	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+	struct packet_ring_buffer *rb = &po->tx_ring;
+	int err = 0, dlen, size_max, hlen, tlen;
+	struct tp4_frame_set p;
+	struct net_device *dev;
+	struct sk_buff *skb;
+	unsigned char *addr = NULL;
+	bool has_packet;
+	__be16 proto;
+	void *data;
+
+	/* Explicit addressing is unsupported; bail before taking the lock */
+	if (unlikely(saddr)) {
+		pr_warn("packet v4 not implemented!\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&po->pg_vec_lock);
+
+	dev = packet_cached_dev_get(po);
+	proto = po->num;
+
+	err = -ENXIO;
+	if (unlikely(!dev))
+		goto out;
+	err = -ENETDOWN;
+	if (unlikely(!(dev->flags & IFF_UP)))
+		goto out_put;
+
+	size_max = tp4a_max_data_size(rb->tp4a);
+
+	if (size_max > dev->mtu + dev->hard_header_len + VLAN_HLEN)
+		size_max = dev->mtu + dev->hard_header_len + VLAN_HLEN;
+
+	spin_lock_bh(&po->sk.sk_write_queue.lock);
+	tp4a_populate(rb->tp4a);
+	spin_unlock_bh(&po->sk.sk_write_queue.lock);
+
+	do {
+		spin_lock_bh(&po->sk.sk_write_queue.lock);
+		has_packet = tp4a_next_packet(rb->tp4a, &p);
+		spin_unlock_bh(&po->sk.sk_write_queue.lock);
+
+		if (!has_packet) {
+			if (need_wait && need_resched()) {
+				schedule();
+				continue;
+			}
+			break;
+		}
+
+		dlen = tp4f_get_packet_len(&p);
+		data = tp4f_get_data(&p);
+		hlen = LL_RESERVED_SPACE(dev);
+		tlen = dev->needed_tailroom;
+		skb = sock_alloc_send_skb(&po->sk,
+					  hlen + tlen +
+					  sizeof(struct sockaddr_ll),
+					  !need_wait, &err);
+
+		if (unlikely(!skb)) {
+			err = -EAGAIN;
+			goto out_err;
+		}
+
+		dlen = tpacket_fill_skb(po, skb,
+					(void *)(long)tp4f_get_frame_id(&p),
+					dev,
+					data, dlen, proto, addr, hlen,
+					dev->hard_header_len, NULL);
+		if (likely(dlen >= 0) &&
+		    dlen > dev->mtu + dev->hard_header_len &&
+		    !packet_extra_vlan_len_allowed(dev, skb)) {
+			dlen = -EMSGSIZE;
+		}
+
+		if (unlikely(dlen < 0)) {
+			err = dlen;
+			goto out_err;
+		}
+
+		skb->destructor = packet_v4_destruct_skb;
+		packet_inc_pending(&po->tx_ring);
+
+		err = po->xmit(skb);
+		/* Ignore NET_XMIT_CN as packet might have been sent */
+		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
+			err = -EAGAIN;
+			packet_dec_pending(&po->tx_ring);
+			skb = NULL;
+			goto out_err;
+		}
+	} while (!err ||
+		/* Note: packet_read_pending() might be slow if we have
+		 * to call it as it's per_cpu variable, but in fast-path
+		 * we already short-circuit the loop with the first
+		 * condition, and luckily don't have to go that path
+		 * anyway.
+		 */
+		 (need_wait && packet_read_pending(&po->tx_ring)));
+
+	goto out_put;
+
+out_err:
+	spin_lock_bh(&po->sk.sk_write_queue.lock);
+	tp4f_set_error(&p, -err);
+	tp4f_packet_completed(&p);
+	WARN_ON_ONCE(tp4a_flush_completed(rb->tp4a));
+	spin_unlock_bh(&po->sk.sk_write_queue.lock);
+	kfree_skb(skb);
+out_put:
+	dev_put(dev);
+out:
+	mutex_unlock(&po->pg_vec_lock);
+	return 0; /* NOTE(review): err is discarded here - confirm intent */
+}
+
 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 				        size_t reserve, size_t len,
 				        size_t linear, int noblock,
@@ -3015,10 +3157,10 @@ static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	struct packet_sock *po = pkt_sk(sk);
 
 	if (po->tx_ring.pg_vec) {
-		if (po->tp_version == TPACKET_V4)
-			return -EINVAL;
+		if (po->tp_version != TPACKET_V4)
+			return tpacket_snd(po, msg);
 
-		return tpacket_snd(po, msg);
+		return packet_v4_snd(po, msg);
 	}
 
 	return packet_snd(sock, msg, len);
@@ -4329,9 +4471,14 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 		po->pressure = 0;
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
 	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (po->tx_ring.pg_vec && po->tp_version != TPACKET_V4) {
-		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
+	if (po->tx_ring.pg_vec) {
+		if (po->tp_version == TPACKET_V4) {
+			if (tp4q_nb_avail(&po->tx_ring.tp4q, 1))
+				mask |= POLLOUT | POLLWRNORM;
+		} else if (packet_current_frame(po, &po->tx_ring,
+					 TP_STATUS_AVAILABLE)) {
 			mask |= POLLOUT | POLLWRNORM;
+		}
 	}
 	spin_unlock_bh(&sk->sk_write_queue.lock);
 	return mask;
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ