lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 11 Feb 2021 16:18:29 -0800
From:   Ronak Doshi <doshir@...are.com>
To:     <netdev@...r.kernel.org>
CC:     Ronak Doshi <doshir@...are.com>, Todd Sabin <tsabin@...are.com>,
        "maintainer:VMWARE VMXNET3 ETHERNET DRIVER" <pv-drivers@...are.com>,
        "David S. Miller" <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Jonathan Lemon <jonathan.lemon@...il.com>,
        Willem de Bruijn <willemb@...gle.com>,
        Randy Dunlap <rdunlap@...radead.org>,
        Kevin Hao <haokexin@...il.com>,
        Aleksandr Nogikh <nogikh@...gle.com>,
        Pablo Neira Ayuso <pablo@...filter.org>,
        Jakub Sitnicki <jakub@...udflare.com>,
        Marco Elver <elver@...gle.com>,
        Dexuan Cui <decui@...rosoft.com>,
        Paolo Abeni <pabeni@...hat.com>,
        Miaohe Lin <linmiaohe@...wei.com>,
        Guillaume Nault <gnault@...hat.com>,
        Dongseok Yi <dseok.yi@...sung.com>,
        Yadu Kishore <kyk.segfault@...il.com>,
        open list <linux-kernel@...r.kernel.org>
Subject: [PATCH net-next] avoid fragmenting page memory with netdev_alloc_cache

From: Todd Sabin <tsabin@...are.com>

Linux network stack uses an allocation page cache for skbs.  The
purpose is to reduce the number of page allocations that it needs to
make, and it works by allocating a group of pages, and then
sub-allocating skb memory from them.  When all skbs referencing the
shared pages are freed, then the block of pages is finally freed.

When these skbs are all freed close together in time, this works fine.
However, what can happen is that there are multiple nics (or multiple
rx-queues in a single nic), and the skbs are allocated to fill the rx
ring(s). If some nics or queues are far more active than others, the
entries in the less busy nic/queue may end up referencing a page
block, while all of the other packets that referenced that block of
pages are freed.

The result of this is that the memory used by an appliance for its rx
rings can slowly grow to be much greater than it was originally.

This patch fixes that by giving each vmxnet3 device a per-rx-queue page
cache.

Signed-off-by: Todd Sabin <tsabin@...are.com>
Signed-off-by: Ronak Doshi <doshir@...are.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 30 ++++++++++++++++++++++++------
 drivers/net/vmxnet3/vmxnet3_int.h |  2 ++
 include/linux/skbuff.h            |  2 ++
 net/core/skbuff.c                 | 21 +++++++++++++++------
 4 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 6e87f1fc4874..edcbc38c3ff6 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -574,9 +574,11 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 
 		if (rbi->buf_type == VMXNET3_RX_BUF_SKB) {
 			if (rbi->skb == NULL) {
-				rbi->skb = __netdev_alloc_skb_ip_align(adapter->netdev,
-								       rbi->len,
-								       GFP_KERNEL);
+				rbi->skb = ___netdev_alloc_skb(adapter->netdev,
+							       rbi->len + NET_IP_ALIGN, GFP_KERNEL,
+							       &adapter->frag_cache[rq->qid]);
+				if (NET_IP_ALIGN && rbi->skb)
+					skb_reserve(rbi->skb, NET_IP_ALIGN);
 				if (unlikely(rbi->skb == NULL)) {
 					rq->stats.rx_buf_alloc_failure++;
 					break;
@@ -1421,8 +1423,11 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			rxDataRingUsed =
 				VMXNET3_RX_DATA_RING(adapter, rcd->rqID);
 			len = rxDataRingUsed ? rcd->len : rbi->len;
-			new_skb = netdev_alloc_skb_ip_align(adapter->netdev,
-							    len);
+			new_skb = ___netdev_alloc_skb(adapter->netdev,
+						      rbi->len + NET_IP_ALIGN, GFP_ATOMIC,
+						      &adapter->frag_cache[rq->qid]);
+			if (NET_IP_ALIGN && new_skb)
+				skb_reserve(new_skb, NET_IP_ALIGN);
 			if (new_skb == NULL) {
 				/* Skb allocation failed, do not handover this
 				 * skb to stack. Reuse it. Drop the existing pkt
@@ -1483,6 +1488,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 					     le32_to_cpu(rcd->rssHash),
 					     PKT_HASH_TYPE_L3);
 #endif
+			skb_record_rx_queue(ctx->skb, rq->qid);
 			skb_put(ctx->skb, rcd->len);
 
 			if (VMXNET3_VERSION_GE_2(adapter) &&
@@ -3652,7 +3658,7 @@ vmxnet3_remove_device(struct pci_dev *pdev)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
-	int size = 0;
+	int size = 0, i;
 	int num_rx_queues;
 
 #ifdef VMXNET3_RSS
@@ -3691,6 +3697,18 @@ vmxnet3_remove_device(struct pci_dev *pdev)
 			  adapter->shared, adapter->shared_pa);
 	dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
 			 sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
+	for (i = 0; i < VMXNET3_DEVICE_MAX_RX_QUEUES; i++) {
+		struct page *page;
+		struct page_frag_cache *nc;
+
+		nc = &adapter->frag_cache[i];
+		if (unlikely(!nc->va)) {
+			/* nothing to do */
+			continue;
+		}
+		page = virt_to_page(nc->va);
+		__page_frag_cache_drain(page, nc->pagecnt_bias);
+	}
 	free_netdev(netdev);
 }
 
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index e910596b79cf..7e8767007203 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -42,6 +42,7 @@
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 #include <asm/dma.h>
 #include <asm/page.h>
 
@@ -362,6 +363,7 @@ struct vmxnet3_adapter {
 	dma_addr_t			shared_pa;
 	dma_addr_t queue_desc_pa;
 	dma_addr_t coal_conf_pa;
+	struct page_frag_cache          frag_cache[VMXNET3_DEVICE_MAX_RX_QUEUES];
 
 	/* Wake-on-LAN */
 	u32     wol;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0a4e91a2f873..b57485016e04 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2841,6 +2841,8 @@ static inline void *netdev_alloc_frag_align(unsigned int fragsz,
 
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
 				   gfp_t gfp_mask);
+struct sk_buff *___netdev_alloc_skb(struct net_device *dev, unsigned int length,
+				    gfp_t gfp_mask, struct page_frag_cache *nc);
 
 /**
  *	netdev_alloc_skb - allocate an skbuff for rx on a specific device
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d380c7b5a12d..ee0611345f6c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -409,10 +409,11 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 EXPORT_SYMBOL(__netdev_alloc_frag_align);
 
 /**
- *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *	___netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
  *	@len: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb
+ *	@nc: page frag cache
  *
  *	Allocate a new &sk_buff and assign it a usage count of one. The
  *	buffer has NET_SKB_PAD headroom built in. Users should allocate
@@ -421,10 +422,9 @@ EXPORT_SYMBOL(__netdev_alloc_frag_align);
  *
  *	%NULL is returned if there is no free memory.
  */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
-				   gfp_t gfp_mask)
+struct sk_buff *___netdev_alloc_skb(struct net_device *dev, unsigned int len,
+				    gfp_t gfp_mask, struct page_frag_cache *nc)
 {
-	struct page_frag_cache *nc;
 	struct sk_buff *skb;
 	bool pfmemalloc;
 	void *data;
@@ -450,12 +450,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 		gfp_mask |= __GFP_MEMALLOC;
 
 	if (in_irq() || irqs_disabled()) {
-		nc = this_cpu_ptr(&netdev_alloc_cache);
+		if (!nc)
+			nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 	} else {
 		local_bh_disable();
-		nc = this_cpu_ptr(&napi_alloc_cache.page);
+		if (!nc)
+			nc = this_cpu_ptr(&napi_alloc_cache.page);
 		data = page_frag_alloc(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 		local_bh_enable();
@@ -481,6 +483,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 skb_fail:
 	return skb;
 }
+EXPORT_SYMBOL(___netdev_alloc_skb);
+
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+				   gfp_t gfp_mask)
+{
+	return ___netdev_alloc_skb(dev, len, gfp_mask, NULL);
+}
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
 /**
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ