lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20171031124145.9667-5-bjorn.topel@gmail.com>
Date:   Tue, 31 Oct 2017 13:41:35 +0100
From:   Björn Töpel <bjorn.topel@...il.com>
To:     bjorn.topel@...il.com, magnus.karlsson@...el.com,
        alexander.h.duyck@...el.com, alexander.duyck@...il.com,
        john.fastabend@...il.com, ast@...com, brouer@...hat.com,
        michael.lundkvist@...csson.com, ravineet.singh@...csson.com,
        daniel@...earbox.net, netdev@...r.kernel.org
Cc:     jesse.brandeburg@...el.com, anjali.singhai@...el.com,
        rami.rosen@...el.com, jeffrey.b.shaw@...el.com,
        ferruh.yigit@...el.com, qi.z.zhang@...el.com
Subject: [RFC PATCH 04/14] packet: enable Rx for AF_PACKET V4

From: Magnus Karlsson <magnus.karlsson@...el.com>

In this commit, ingress support is implemented.

Signed-off-by: Magnus Karlsson <magnus.karlsson@...el.com>
---
 include/linux/tpacket4.h | 361 +++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/af_packet.c   |  83 +++++++----
 2 files changed, 419 insertions(+), 25 deletions(-)

diff --git a/include/linux/tpacket4.h b/include/linux/tpacket4.h
index 44ba38034133..1d4c13d472e5 100644
--- a/include/linux/tpacket4.h
+++ b/include/linux/tpacket4.h
@@ -191,6 +191,172 @@ static inline struct tp4_umem *tp4q_umem_new(unsigned long addr, size_t size,
 }
 
 /**
+ * tp4q_set_error - Sets an errno on the descriptor
+ *
+ * @desc: Pointer to the descriptor to be manipulated
+ * @errno: The errno number to write to the descriptor
+ **/
+static inline void tp4q_set_error(struct tpacket4_desc *desc,
+				  int errno)
+{
+	desc->error = errno;
+}
+
+/**
+ * tp4q_set_offset - Sets the data offset for the descriptor
+ *
+ * @desc: Pointer to the descriptor to be manipulated
+ * @offset: The data offset to write to the descriptor
+ **/
+static inline void tp4q_set_offset(struct tpacket4_desc *desc,
+				   u16 offset)
+{
+	desc->offset = offset;
+}
+
+/**
+ * tp4q_is_free - Is there a free entry on the queue?
+ *
+ * @q: Pointer to the tp4 queue to examine
+ *
+ * Returns true if there is a free entry, otherwise false
+ **/
+static inline int tp4q_is_free(struct tp4_queue *q)
+{
+	unsigned int idx = q->used_idx & q->ring_mask;
+	unsigned int prev_idx;
+
+	if (!idx)
+		prev_idx = q->ring_mask;
+	else
+		prev_idx = idx - 1;
+
+	/* previous frame is already consumed by userspace
+	 * meaning ring is free
+	 */
+	if (q->ring[prev_idx].flags & TP4_DESC_KERNEL)
+		return 1;
+
+	/* there is some data that userspace can read immediately */
+	return 0;
+}
+
+/**
+ * tp4q_get_data_headroom - How much data headroom does the queue have
+ *
+ * @q: Pointer to the tp4 queue to examine
+ *
+ * Returns the amount of data headroom that has been configured for the
+ * queue
+ **/
+static inline unsigned int tp4q_get_data_headroom(struct tp4_queue *q)
+{
+	return q->umem->data_headroom + TP4_KERNEL_HEADROOM;
+}
+
+/**
+ * tp4q_is_valid_entry - Is the entry valid?
+ *
+ * @q: Pointer to the tp4 queue the descriptor resides in
+ * @d: Pointer to the descriptor to examine
+ * @validation: The type of validation to perform
+ *
+ * Returns true if the entry is valid, otherwise false
+ **/
+static inline bool tp4q_is_valid_entry(struct tp4_queue *q,
+				       struct tpacket4_desc *d,
+				       enum tp4_validation validation)
+{
+	if (validation == TP4_VALIDATION_NONE)
+		return true;
+
+	if (unlikely(d->idx >= q->umem->nframes)) {
+		tp4q_set_error(d, EBADF);
+		return false;
+	}
+	if (validation == TP4_VALIDATION_IDX) {
+		tp4q_set_offset(d, tp4q_get_data_headroom(q));
+		return true;
+	}
+
+	/* TP4_VALIDATION_DESC */
+	if (unlikely(d->len > q->umem->frame_size ||
+		     d->len == 0 ||
+		     d->offset > q->umem->frame_size ||
+		     d->offset + d->len > q->umem->frame_size)) {
+		tp4q_set_error(d, EBADF);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * tp4q_nb_avail - Returns the number of available entries
+ *
+ * @q: Pointer to the tp4 queue to examine
+ * @dcnt: Max number of entries to check
+ *
+ * Returns the number of entries available in the queue up to dcnt
+ **/
+static inline int tp4q_nb_avail(struct tp4_queue *q, int dcnt)
+{
+	unsigned int idx, last_avail_idx = q->last_avail_idx;
+	int i, entries = 0;
+
+	for (i = 0; i < dcnt; i++) {
+		idx = (last_avail_idx++) & q->ring_mask;
+		if (!(q->ring[idx].flags & TP4_DESC_KERNEL))
+			break;
+		entries++;
+	}
+
+	return entries;
+}
+
+/**
+ * tp4q_enqueue - Enqueue entries to a tp4 queue
+ *
+ * @q: Pointer to the tp4 queue the descriptor resides in
+ * @d: Pointer to the array of descriptors to enqueue
+ * @dcnt: Number of entries to enqueue
+ *
+ * Returns 0 for success or an errno at failure
+ **/
+static inline int tp4q_enqueue(struct tp4_queue *q,
+			       const struct tpacket4_desc *d, int dcnt)
+{
+	unsigned int used_idx = q->used_idx;
+	int i;
+
+	if (q->num_free < dcnt)
+		return -ENOSPC;
+
+	q->num_free -= dcnt;
+
+	for (i = 0; i < dcnt; i++) {
+		unsigned int idx = (used_idx++) & q->ring_mask;
+
+		q->ring[idx].idx = d[i].idx;
+		q->ring[idx].len = d[i].len;
+		q->ring[idx].offset = d[i].offset;
+		q->ring[idx].error = d[i].error;
+	}
+
+	/* Order flags and data */
+	smp_wmb();
+
+	for (i = dcnt - 1; i >= 0; i--) {
+		unsigned int idx = (q->used_idx + i) & q->ring_mask;
+
+		q->ring[idx].flags = d[i].flags & ~TP4_DESC_KERNEL;
+	}
+	q->used_idx += dcnt;
+
+	return 0;
+}
+
+/**
  * tp4q_enqueue_from_array - Enqueue entries from packet array to tp4 queue
  *
  * @a: Pointer to the packet array to enqueue from
@@ -236,6 +402,45 @@ static inline int tp4q_enqueue_from_array(struct tp4_packet_array *a,
 }
 
 /**
+ * tp4q_dequeue_to_array - Dequeue entries from tp4 queue to packet array
+ *
+ * @a: Pointer to the packet array to dequeue into
+ * @dcnt: Max number of entries to dequeue
+ *
+ * Returns the number of entries dequeued. Invalid entries will be
+ * discarded.
+ **/
+static inline int tp4q_dequeue_to_array(struct tp4_packet_array *a, u32 dcnt)
+{
+	struct tpacket4_desc *d = a->items;
+	int i, entries, valid_entries = 0;
+	struct tp4_queue *q = a->tp4q;
+	u32 start = a->end;
+
+	entries = tp4q_nb_avail(q, dcnt);
+	q->num_free += entries;
+
+	/* Order flags and data */
+	smp_rmb();
+
+	for (i = 0; i < entries; i++) {
+		unsigned int d_idx = start & a->mask;
+		unsigned int idx;
+
+		idx = (q->last_avail_idx++) & q->ring_mask;
+		d[d_idx] = q->ring[idx];
+		if (!tp4q_is_valid_entry(q, &d[d_idx], a->validation)) {
+			WARN_ON_ONCE(tp4q_enqueue(a->tp4q, &d[d_idx], 1));
+			continue;
+		}
+
+		start++;
+		valid_entries++;
+	}
+	return valid_entries;
+}
+
+/**
  * tp4q_disable - Disable a tp4 queue
  *
  * @dev: Pointer to the netdevice the queue is connected to
@@ -309,6 +514,67 @@ static inline int tp4q_enable(struct device *dev,
 	return 0;
 }
 
+/**
+ * tp4q_get_page_offset - Get offset into page frame resides at
+ *
+ * @q: Pointer to the tp4 queue that this frame resides in
+ * @addr: Index of this frame in the packet buffer / umem
+ * @pg: Returns a pointer to the page of this frame
+ * @off: Returns the offset to the page of this frame
+ **/
+static inline void tp4q_get_page_offset(struct tp4_queue *q, u64 addr,
+				       u64 *pg, u64 *off)
+{
+	*pg = addr >> q->umem->nfpplog2;
+	*off = (addr - (*pg << q->umem->nfpplog2))
+	       << q->umem->frame_size_log2;
+}
+
+/**
+ * tp4q_max_data_size - Get the max packet size supported by a queue
+ *
+ * @q: Pointer to the tp4 queue to examine
+ *
+ * Returns the max packet size supported by the queue
+ **/
+static inline unsigned int tp4q_max_data_size(struct tp4_queue *q)
+{
+	return q->umem->frame_size - q->umem->data_headroom -
+		TP4_KERNEL_HEADROOM;
+}
+
+/**
+ * tp4q_get_data - Gets a pointer to the start of the packet
+ *
+ * @q: Pointer to the tp4 queue to examine
+ * @desc: Pointer to descriptor of the packet
+ *
+ * Returns a pointer to the start of the packet the descriptor is pointing
+ * to
+ **/
+static inline void *tp4q_get_data(struct tp4_queue *q,
+				  struct tpacket4_desc *desc)
+{
+	u64 pg, off;
+	u8 *pkt;
+
+	tp4q_get_page_offset(q, desc->idx, &pg, &off);
+	pkt = page_address(q->umem->pgs[pg]);
+	return (u8 *)(pkt + off) + desc->offset;
+}
+
+/**
+ * tp4q_get_desc - Get descriptor associated with frame
+ *
+ * @p: Pointer to the packet to examine
+ *
+ * Returns the descriptor of the current frame of packet p
+ **/
+static inline struct tpacket4_desc *tp4q_get_desc(struct tp4_frame_set *p)
+{
+	return &p->pkt_arr->items[p->curr & p->pkt_arr->mask];
+}
+
 /*************** FRAME OPERATIONS *******************************/
 /* A frame is always just one frame of size frame_size.
  * A frame set is one or more frames.
@@ -331,6 +597,18 @@ static inline bool tp4f_next_frame(struct tp4_frame_set *p)
 }
 
 /**
+ * tp4f_get_data - Gets a pointer to the frame the frame set is on
+ * @p: pointer to the frame set
+ *
+ * Returns a pointer to the data of the frame that the frame set is
+ * pointing to. Note that there might be configured headroom before this
+ **/
+static inline void *tp4f_get_data(struct tp4_frame_set *p)
+{
+	return tp4q_get_data(p->pkt_arr->tp4q, tp4q_get_desc(p));
+}
+
+/**
  * tp4f_set_frame - Sets the properties of a frame
  * @p: pointer to frame
  * @len: the length in bytes of the data in the frame
@@ -443,6 +721,29 @@ static inline bool tp4a_get_flushable_frame_set(struct tp4_packet_array *a,
 }
 
 /**
+ * tp4a_next_frame - Get next frame in array and advance curr pointer
+ * @a: pointer to packet array
+ * @p: supplied pointer to packet structure that is filled in by function
+ *
+ * Returns true if there is a frame, false otherwise. Frame returned in *p.
+ **/
+static inline bool tp4a_next_frame(struct tp4_packet_array *a,
+				   struct tp4_frame_set *p)
+{
+	u32 avail = a->end - a->curr;
+
+	if (avail == 0)
+		return false; /* empty */
+
+	p->pkt_arr = a;
+	p->start = a->curr;
+	p->curr = a->curr;
+	p->end = ++a->curr;
+
+	return true;
+}
+
+/**
  * tp4a_flush - Flush processed packets to associated tp4q
  * @a: pointer to packet array
  *
@@ -489,4 +790,64 @@ static inline void tp4a_free(struct tp4_packet_array *a)
 	kfree(a);
 }
 
+/**
+ * tp4a_get_data_headroom - Returns the data headroom configured for the array
+ * @a: pointer to packet array
+ *
+ * Returns the data headroom configured for the array
+ **/
+static inline unsigned int tp4a_get_data_headroom(struct tp4_packet_array *a)
+{
+	return tp4q_get_data_headroom(a->tp4q);
+}
+
+/**
+ * tp4a_max_data_size - Get the max packet size supported for the array
+ * @a: pointer to packet array
+ *
+ * Returns the maximum size of data that can be put in a frame when headroom
+ * has been accounted for.
+ **/
+static inline unsigned int tp4a_max_data_size(struct tp4_packet_array *a)
+{
+	return tp4q_max_data_size(a->tp4q);
+
+}
+
+/**
+ * tp4a_populate - Populate an array with packets from associated tp4q
+ * @a: pointer to packet array
+ **/
+static inline void tp4a_populate(struct tp4_packet_array *a)
+{
+	u32 cnt, free = a->mask + 1 - (a->end - a->start);
+
+	if (free == 0)
+		return; /* no space! */
+
+	cnt = tp4q_dequeue_to_array(a, free);
+	a->end += cnt;
+}
+
+/**
+ * tp4a_next_frame_populate - Get next frame and populate array if empty
+ * @a: pointer to packet array
+ * @p: supplied pointer to packet structure that is filled in by function
+ *
+ * Returns true if there is a frame, false otherwise. Frame returned in *p.
+ **/
+static inline bool tp4a_next_frame_populate(struct tp4_packet_array *a,
+					    struct tp4_frame_set *p)
+{
+	bool more_frames;
+
+	more_frames = tp4a_next_frame(a, p);
+	if (!more_frames) {
+		tp4a_populate(a);
+		more_frames = tp4a_next_frame(a, p);
+	}
+
+	return more_frames;
+}
+
 #endif /* _LINUX_TPACKET4_H */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 190598eb3461..830d97ff4358 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2192,7 +2192,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
 	unsigned long status = TP_STATUS_USER;
-	unsigned short macoff, netoff, hdrlen;
+	unsigned short macoff = 0, netoff = 0, hdrlen;
 	struct sk_buff *copy_skb = NULL;
 	struct timespec ts;
 	__u32 ts_status;
@@ -2212,9 +2212,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	sk = pt->af_packet_priv;
 	po = pkt_sk(sk);
 
-	if (po->tp_version == TPACKET_V4)
-		goto drop;
-
 	if (!net_eq(dev_net(dev), sock_net(sk)))
 		goto drop;
 
@@ -2246,7 +2243,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (sk->sk_type == SOCK_DGRAM) {
 		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
 				  po->tp_reserve;
-	} else {
+	} else if (po->tp_version != TPACKET_V4) {
 		unsigned int maclen = skb_network_offset(skb);
 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
 				       (maclen < 16 ? 16 : maclen)) +
@@ -2276,6 +2273,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 				do_vnet = false;
 			}
 		}
+	} else if (po->tp_version == TPACKET_V4) {
+		if (snaplen > tp4a_max_data_size(po->rx_ring.tp4a)) {
+			pr_err_once("%s: packet too big, %u, dropping.",
+				    __func__, snaplen);
+			goto drop_n_restore;
+		}
 	} else if (unlikely(macoff + snaplen >
 			    GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
 		u32 nval;
@@ -2291,8 +2294,22 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 	spin_lock(&sk->sk_receive_queue.lock);
-	h.raw = packet_current_rx_frame(po, skb,
-					TP_STATUS_KERNEL, (macoff+snaplen));
+	if (po->tp_version != TPACKET_V4) {
+		h.raw = packet_current_rx_frame(po, skb,
+						TP_STATUS_KERNEL,
+						(macoff + snaplen));
+	} else {
+		struct tp4_frame_set p;
+
+		if (tp4a_next_frame_populate(po->rx_ring.tp4a, &p)) {
+			u16 offset = tp4a_get_data_headroom(po->rx_ring.tp4a);
+
+			tp4f_set_frame(&p, snaplen, offset, true);
+			h.raw = tp4f_get_data(&p);
+		} else {
+			h.raw = NULL;
+		}
+	}
 	if (!h.raw)
 		goto drop_n_account;
 	if (po->tp_version <= TPACKET_V2) {
@@ -2371,20 +2388,25 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
 		hdrlen = sizeof(*h.h3);
 		break;
+	case TPACKET_V4:
+		hdrlen = 0;
+		break;
 	default:
 		BUG();
 	}
 
-	sll = h.raw + TPACKET_ALIGN(hdrlen);
-	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
-	sll->sll_family = AF_PACKET;
-	sll->sll_hatype = dev->type;
-	sll->sll_protocol = skb->protocol;
-	sll->sll_pkttype = skb->pkt_type;
-	if (unlikely(po->origdev))
-		sll->sll_ifindex = orig_dev->ifindex;
-	else
-		sll->sll_ifindex = dev->ifindex;
+	if (po->tp_version != TPACKET_V4) {
+		sll = h.raw + TPACKET_ALIGN(hdrlen);
+		sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
+		sll->sll_family = AF_PACKET;
+		sll->sll_hatype = dev->type;
+		sll->sll_protocol = skb->protocol;
+		sll->sll_pkttype = skb->pkt_type;
+		if (unlikely(po->origdev))
+			sll->sll_ifindex = orig_dev->ifindex;
+		else
+			sll->sll_ifindex = dev->ifindex;
+	}
 
 	smp_mb();
 
@@ -2401,11 +2423,21 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	smp_wmb();
 #endif
 
-	if (po->tp_version <= TPACKET_V2) {
+	switch (po->tp_version) {
+	case TPACKET_V1:
+	case TPACKET_V2:
 		__packet_set_status(po, h.raw, status);
 		sk->sk_data_ready(sk);
-	} else {
+		break;
+	case TPACKET_V3:
 		prb_clear_blk_fill_status(&po->rx_ring);
+		break;
+	case TPACKET_V4:
+		spin_lock(&sk->sk_receive_queue.lock);
+		WARN_ON_ONCE(tp4a_flush(po->rx_ring.tp4a));
+		spin_unlock(&sk->sk_receive_queue.lock);
+		sk->sk_data_ready(sk);
+		break;
 	}
 
 drop_n_restore:
@@ -4283,20 +4315,21 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 	struct packet_sock *po = pkt_sk(sk);
 	unsigned int mask = datagram_poll(file, sock, wait);
 
-	if (po->tp_version == TPACKET_V4)
-		return mask;
-
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
-		if (!packet_previous_rx_frame(po, &po->rx_ring,
-			TP_STATUS_KERNEL))
+		if (po->tp_version == TPACKET_V4) {
+			if (!tp4q_is_free(&po->rx_ring.tp4q))
+				mask |= POLLIN | POLLRDNORM;
+		} else if (!packet_previous_rx_frame(po, &po->rx_ring,
+					TP_STATUS_KERNEL)) {
 			mask |= POLLIN | POLLRDNORM;
+		}
 	}
 	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
 		po->pressure = 0;
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
 	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (po->tx_ring.pg_vec) {
+	if (po->tx_ring.pg_vec && po->tp_version != TPACKET_V4) {
 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
 			mask |= POLLOUT | POLLWRNORM;
 	}
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ