Message-Id: <20170207030240.31357-10-edumazet@google.com>
Date:   Mon,  6 Feb 2017 19:02:40 -0800
From:   Eric Dumazet <edumazet@...gle.com>
To:     "David S . Miller" <davem@...emloft.net>
Cc:     netdev <netdev@...r.kernel.org>,
        Tariq Toukan <tariqt@...lanox.com>,
        Martin KaFai Lau <kafai@...com>,
        Willem de Bruijn <willemb@...gle.com>,
        Jesper Dangaard Brouer <brouer@...hat.com>,
        Brenden Blanco <bblanco@...mgrid.com>,
        Alexei Starovoitov <ast@...nel.org>,
        Eric Dumazet <edumazet@...gle.com>,
        Eric Dumazet <eric.dumazet@...il.com>
Subject: [PATCH net-next 9/9] mlx4: add page recycling in receive path

Same technique as used by some Intel drivers, for arches where PAGE_SIZE = 4096

In most cases, pages are reused because they were consumed
before we could loop around the RX ring.

This brings back performance, and is even better: a single TCP flow
reaches 30Gbit on my hosts.

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 238 ++++++++-------------------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |   1 -
 2 files changed, 68 insertions(+), 171 deletions(-)
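
For readers who want the recycling rule in isolation: below is a minimal,
self-contained sketch of the half-page recycling decision this patch adds in
mlx4_en_complete_rx_desc() for the frag_stride == PAGE_SIZE / 2 case. It is
illustrative only; struct rx_frag and rx_frag_recycle() are made-up names and
are not part of the driver.

#include <linux/mm.h>		/* page_count(), page_is_pfmemalloc() */
#include <linux/page_ref.h>	/* page_ref_inc() */
#include <linux/types.h>

struct rx_frag {		/* hypothetical, mirrors mlx4_en_rx_alloc */
	struct page	*page;
	dma_addr_t	dma;
	u32		page_offset;
};

/* Called after the current half of the page has been attached to an skb.
 * Returns true if the page can stay in the RX ring (now pointing at the
 * other half), false if the caller must unmap it and allocate a new page.
 */
static bool rx_frag_recycle(struct rx_frag *frag)
{
	struct page *page = frag->page;

	/* Flip to the other half of the page for the next RX fill. */
	frag->page_offset ^= PAGE_SIZE / 2;

	/* Recycle only if the stack has already dropped its reference to
	 * the other half (refcount back to 1) and the page is not an
	 * emergency (pfmemalloc) allocation.
	 */
	if (page_count(page) != 1 || page_is_pfmemalloc(page))
		return false;

	/* Take an extra reference: the page is now used both by the skb
	 * we just built and by the RX ring.
	 */
	page_ref_inc(page);
	return true;
}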

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index be4f3491a4fcb6ee0e9fe4e71abfd2bc5373..6854a19087edbf0bc9bf29e20a82deaaf043 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -50,10 +50,9 @@
 
 #include "mlx4_en.h"
 
-static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
-			    struct mlx4_en_rx_alloc *page_alloc,
-			    const struct mlx4_en_frag_info *frag_info,
-			    gfp_t gfp)
+static int mlx4_alloc_page(const struct mlx4_en_priv *priv,
+			   struct mlx4_en_rx_alloc *frag,
+			   gfp_t gfp)
 {
 	struct page *page;
 	dma_addr_t dma;
@@ -66,142 +65,40 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 		put_page(page);
 		return -ENOMEM;
 	}
-	page_alloc->page = page;
-	page_alloc->dma = dma;
-	page_alloc->page_offset = 0;
-	/* Not doing get_page() for each frag is a big win
-	 * on asymetric workloads. Note we can not use atomic_set().
-	 */
-	page_ref_add(page, PAGE_SIZE / frag_info->frag_stride - 1);
+	frag->page = page;
+	frag->dma = dma;
+	frag->page_offset = priv->rx_headroom;
 	return 0;
 }
 
-static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
+static int mlx4_en_alloc_frags(const struct mlx4_en_priv *priv,
 			       struct mlx4_en_rx_desc *rx_desc,
 			       struct mlx4_en_rx_alloc *frags,
-			       struct mlx4_en_rx_alloc *ring_alloc,
 			       gfp_t gfp)
 {
-	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
-	const struct mlx4_en_frag_info *frag_info;
-	struct page *page;
 	int i;
 
-	for (i = 0; i < priv->num_frags; i++) {
-		frag_info = &priv->frag_info[i];
-		page_alloc[i] = ring_alloc[i];
-		page_alloc[i].page_offset += frag_info->frag_stride;
-
-		if (page_alloc[i].page_offset + frag_info->frag_stride <=
-		    PAGE_SIZE)
-			continue;
-
-		if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
-					      frag_info, gfp)))
-			goto out;
-	}
-
-	for (i = 0; i < priv->num_frags; i++) {
-		frags[i] = ring_alloc[i];
-		frags[i].page_offset += priv->rx_headroom;
-		rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
-						    frags[i].page_offset);
-		ring_alloc[i] = page_alloc[i];
-	}
-
-	return 0;
-
-out:
-	while (i--) {
-		if (page_alloc[i].page != ring_alloc[i].page) {
-			dma_unmap_page(priv->ddev, page_alloc[i].dma,
-				       PAGE_SIZE, priv->dma_dir);
-			page = page_alloc[i].page;
-			/* Revert changes done by mlx4_alloc_pages */
-			page_ref_sub(page, PAGE_SIZE /
-					   priv->frag_info[i].frag_stride - 1);
-			put_page(page);
-		}
-	}
-	return -ENOMEM;
-}
-
-static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
-			      struct mlx4_en_rx_alloc *frags,
-			      int i)
-{
-	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
-
-
-	if (next_frag_end > PAGE_SIZE)
-		dma_unmap_page(priv->ddev, frags[i].dma, PAGE_SIZE,
-			       priv->dma_dir);
-
-	if (frags[i].page)
-		put_page(frags[i].page);
-}
-
-static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
-				  struct mlx4_en_rx_ring *ring)
-{
-	int i;
-	struct mlx4_en_rx_alloc *page_alloc;
-
-	for (i = 0; i < priv->num_frags; i++) {
-		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-
-		if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
-				     frag_info, GFP_KERNEL | __GFP_COLD))
-			goto out;
-
-		en_dbg(DRV, priv, "  frag %d allocator: - frags:%d\n",
-		       i, page_ref_count(ring->page_alloc[i].page));
+	for (i = 0; i < priv->num_frags; i++, frags++) {
+		if (!frags->page && mlx4_alloc_page(priv, frags, gfp))
+			return -ENOMEM;
+		rx_desc->data[i].addr = cpu_to_be64(frags->dma +
+						    frags->page_offset);
 	}
 	return 0;
-
-out:
-	while (i--) {
-		struct page *page;
-
-		page_alloc = &ring->page_alloc[i];
-		dma_unmap_page(priv->ddev, page_alloc->dma,
-			       PAGE_SIZE, priv->dma_dir);
-		page = page_alloc->page;
-		/* Revert changes done by mlx4_alloc_pages */
-		page_ref_sub(page, PAGE_SIZE /
-				   priv->frag_info[i].frag_stride - 1);
-		put_page(page);
-		page_alloc->page = NULL;
-	}
-	return -ENOMEM;
 }
 
-static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
-				      struct mlx4_en_rx_ring *ring)
+static void mlx4_en_free_frag(const struct mlx4_en_priv *priv,
+			      struct mlx4_en_rx_alloc *frag)
 {
-	struct mlx4_en_rx_alloc *page_alloc;
-	int i;
-
-	for (i = 0; i < priv->num_frags; i++) {
-		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-
-		page_alloc = &ring->page_alloc[i];
-		en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
-		       i, page_count(page_alloc->page));
-
-		dma_unmap_page(priv->ddev, page_alloc->dma,
+	if (frag->page) {
+		dma_unmap_page(priv->ddev, frag->dma,
 			       PAGE_SIZE, priv->dma_dir);
-		while (page_alloc->page_offset + frag_info->frag_stride <
-		       PAGE_SIZE) {
-			put_page(page_alloc->page);
-			page_alloc->page_offset += frag_info->frag_stride;
-		}
-		page_alloc->page = NULL;
+		put_page(frag->page);
+		frag->page = NULL;
 	}
 }
 
-static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
+static void mlx4_en_init_rx_desc(const struct mlx4_en_priv *priv,
 				 struct mlx4_en_rx_ring *ring, int index)
 {
 	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
@@ -226,7 +123,7 @@ static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
 	}
 }
 
-static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+static int mlx4_en_prepare_rx_desc(const struct mlx4_en_priv *priv,
 				   struct mlx4_en_rx_ring *ring, int index,
 				   gfp_t gfp)
 {
@@ -235,19 +132,21 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
 					(index << priv->log_rx_info);
 
 	if (ring->page_cache.index > 0) {
-		ring->page_cache.index--;
-		frags[0].page = ring->page_cache.buf[ring->page_cache.index].page;
-		frags[0].dma  = ring->page_cache.buf[ring->page_cache.index].dma;
+		if (!frags[0].page) {
+			ring->page_cache.index--;
+			frags[0].page = ring->page_cache.buf[ring->page_cache.index].page;
+			frags[0].dma  = ring->page_cache.buf[ring->page_cache.index].dma;
+		}
 		frags[0].page_offset = XDP_PACKET_HEADROOM;
 		rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
 						    frags[0].page_offset);
 		return 0;
 	}
 
-	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
+	return mlx4_en_alloc_frags(priv, rx_desc, frags, gfp);
 }
 
-static inline bool mlx4_en_is_ring_empty(struct mlx4_en_rx_ring *ring)
+static bool mlx4_en_is_ring_empty(const struct mlx4_en_rx_ring *ring)
 {
 	return ring->prod == ring->cons;
 }
@@ -257,7 +156,8 @@ static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
 	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
 }
 
-static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
+/* slow path */
+static void mlx4_en_free_rx_desc(const struct mlx4_en_priv *priv,
 				 struct mlx4_en_rx_ring *ring,
 				 int index)
 {
@@ -267,7 +167,7 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
 	frags = ring->rx_info + (index << priv->log_rx_info);
 	for (nr = 0; nr < priv->num_frags; nr++) {
 		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
-		mlx4_en_free_frag(priv, frags, nr);
+		mlx4_en_free_frag(priv, frags + nr);
 	}
 }
 
@@ -380,9 +280,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 
 	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
 					sizeof(struct mlx4_en_rx_alloc));
-	ring->rx_info = vmalloc_node(tmp, node);
+	ring->rx_info = vzalloc_node(tmp, node);
 	if (!ring->rx_info) {
-		ring->rx_info = vmalloc(tmp);
+		ring->rx_info = vzalloc(tmp);
 		if (!ring->rx_info) {
 			err = -ENOMEM;
 			goto err_ring;
@@ -452,16 +352,6 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
 		/* Initialize all descriptors */
 		for (i = 0; i < ring->size; i++)
 			mlx4_en_init_rx_desc(priv, ring, i);
-
-		/* Initialize page allocators */
-		err = mlx4_en_init_allocator(priv, ring);
-		if (err) {
-			en_err(priv, "Failed initializing ring allocator\n");
-			if (ring->stride <= TXBB_SIZE)
-				ring->buf -= TXBB_SIZE;
-			ring_ind--;
-			goto err_allocator;
-		}
 	}
 	err = mlx4_en_fill_rx_buffers(priv);
 	if (err)
@@ -481,11 +371,9 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
 		mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);
 
 	ring_ind = priv->rx_ring_num - 1;
-err_allocator:
 	while (ring_ind >= 0) {
 		if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
 			priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
-		mlx4_en_destroy_allocator(priv, priv->rx_ring[ring_ind]);
 		ring_ind--;
 	}
 	return err;
@@ -562,50 +450,64 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
 	mlx4_en_free_rx_buf(priv, ring);
 	if (ring->stride <= TXBB_SIZE)
 		ring->buf -= TXBB_SIZE;
-	mlx4_en_destroy_allocator(priv, ring);
 }
 
 
-static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
-				    struct mlx4_en_rx_desc *rx_desc,
+static noinline int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 				    struct mlx4_en_rx_alloc *frags,
 				    struct sk_buff *skb,
 				    int length)
 {
-	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
 	struct mlx4_en_frag_info *frag_info = priv->frag_info;
 	int nr, frag_size;
+	struct page *page;
 	dma_addr_t dma;
+	bool release;
+	unsigned int truesize = 0;
 
 	/* Collect used fragments while replacing them in the HW descriptors */
 	for (nr = 0;;) {
 		frag_size = min_t(int, length, frag_info->frag_size);
 
-		if (unlikely(!frags[nr].page))
+		page = frags[nr].page;
+		if (unlikely(!page))
 			goto fail;
 
-		dma = be64_to_cpu(rx_desc->data[nr].addr);
-		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
-					DMA_FROM_DEVICE);
+		dma = frags[nr].dma;
+		dma_sync_single_range_for_cpu(priv->ddev, dma, frags[nr].page_offset,
+					      frag_info->frag_size, priv->dma_dir);
 
-		__skb_fill_page_desc(skb, nr, frags[nr].page,
-				     frags[nr].page_offset,
+		__skb_fill_page_desc(skb, nr, page, frags[nr].page_offset,
 				     frag_size);
 
-		skb->truesize += frag_info->frag_stride;
-		frags[nr].page = NULL;
+		truesize += frag_info->frag_stride;
+		if (frag_info->frag_stride == PAGE_SIZE / 2) {
+			frags[nr].page_offset ^= PAGE_SIZE / 2;
+			release = page_count(page) != 1 || page_is_pfmemalloc(page);
+		} else {
+			frags[nr].page_offset += frag_info->frag_stride;
+			release = frags[nr].page_offset + frag_info->frag_size > PAGE_SIZE;
+		}
+		if (release) {
+			dma_unmap_page(priv->ddev, dma, PAGE_SIZE, priv->dma_dir);
+			frags[nr].page = NULL;
+		} else {
+			page_ref_inc(page);
+		}
+
 		nr++;
 		length -= frag_size;
 		if (!length)
 			break;
 		frag_info++;
 	}
+	skb->truesize += truesize;
 	return nr;
 
 fail:
 	while (nr > 0) {
 		nr--;
-		__skb_frag_unref(&skb_frags_rx[nr]);
+		__skb_frag_unref(skb_shinfo(skb)->frags + nr);
 	}
 	return 0;
 }
@@ -636,7 +538,8 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 	if (length <= SMALL_PACKET_SIZE) {
 		/* We are copying all relevant data to the skb - temporarily
 		 * sync buffers for the copy */
-		dma = be64_to_cpu(rx_desc->data[0].addr);
+
+		dma = frags[0].dma + frags[0].page_offset;
 		dma_sync_single_for_cpu(priv->ddev, dma, length,
 					DMA_FROM_DEVICE);
 		skb_copy_to_linear_data(skb, va, length);
@@ -645,7 +548,7 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 		unsigned int pull_len;
 
 		/* Move relevant fragments to skb */
-		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags,
+		used_frags = mlx4_en_complete_rx_desc(priv, frags,
 							skb, length);
 		if (unlikely(!used_frags)) {
 			kfree_skb(skb);
@@ -913,8 +816,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			case XDP_TX:
 				if (likely(!mlx4_en_xmit_frame(ring, frags, dev,
 							length, cq->ring,
-							&doorbell_pending)))
-					goto consumed;
+							&doorbell_pending))) {
+					frags[0].page = NULL;
+					goto next;
+				}
 				trace_xdp_exception(dev, xdp_prog, act);
 				goto xdp_drop_no_cnt; /* Drop on xmit failure */
 			default:
@@ -924,8 +829,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			case XDP_DROP:
 				ring->xdp_drop++;
 xdp_drop_no_cnt:
-				if (likely(mlx4_en_rx_recycle(ring, frags)))
-					goto consumed;
 				goto next;
 			}
 		}
@@ -971,9 +874,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			if (!gro_skb)
 				goto next;
 
-			nr = mlx4_en_complete_rx_desc(priv,
-				rx_desc, frags, gro_skb,
-				length);
+			nr = mlx4_en_complete_rx_desc(priv, frags, gro_skb,
+						      length);
 			if (!nr)
 				goto next;
 
@@ -1081,10 +983,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 		napi_gro_receive(&cq->napi, skb);
 next:
-		for (nr = 0; nr < priv->num_frags; nr++)
-			mlx4_en_free_frag(priv, frags, nr);
-
-consumed:
 		++cq->mcq.cons_index;
 		index = (cq->mcq.cons_index) & ring->size_mask;
 		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 4016086b13539c8bd848242a3a1788eff245..4a6594325a10b238b8a4f01805493b5c6e8b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -327,7 +327,6 @@ struct mlx4_en_rx_desc {
 
 struct mlx4_en_rx_ring {
 	struct mlx4_hwq_resources wqres;
-	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
 	u32 size ;	/* number of Rx descs*/
 	u32 actual_size;
 	u32 size_mask;
-- 
2.11.0.483.g087da7b7c-goog
