linux-kernel - [PATCH net-next] avoid fragmenting page memory with netdev_alloc

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20210212001842.32714-1-doshir@vmware.com>
Date:   Thu, 11 Feb 2021 16:18:29 -0800
From:   Ronak Doshi <doshir@...are.com>
To:     <netdev@...r.kernel.org>
CC:     Ronak Doshi <doshir@...are.com>, Todd Sabin <tsabin@...are.com>,
        "maintainer:VMWARE VMXNET3 ETHERNET DRIVER" <pv-drivers@...are.com>,
        "David S. Miller" <davem@...emloft.net>,
        Jakub Kicinski <kuba@...nel.org>,
        Jonathan Lemon <jonathan.lemon@...il.com>,
        Willem de Bruijn <willemb@...gle.com>,
        Randy Dunlap <rdunlap@...radead.org>,
        Kevin Hao <haokexin@...il.com>,
        Aleksandr Nogikh <nogikh@...gle.com>,
        Pablo Neira Ayuso <pablo@...filter.org>,
        Jakub Sitnicki <jakub@...udflare.com>,
        Marco Elver <elver@...gle.com>,
        Dexuan Cui <decui@...rosoft.com>,
        Paolo Abeni <pabeni@...hat.com>,
        Miaohe Lin <linmiaohe@...wei.com>,
        Guillaume Nault <gnault@...hat.com>,
        Dongseok Yi <dseok.yi@...sung.com>,
        Yadu Kishore <kyk.segfault@...il.com>,
        open list <linux-kernel@...r.kernel.org>
Subject: [PATCH net-next] avoid fragmenting page memory with netdev_alloc_cache

From: Todd Sabin <tsabin@...are.com>

Linux network stack uses an allocation page cache for skbs.  The
purpose is to reduce the number of page allocations that it needs to
make, and it works by allocating a group of pages, and then
sub-allocating skb memory from them.  When all skbs referencing the
shared pages are freed, then the block of pages is finally freed.

When these skbs are all freed close together in time, this works fine.
However, what can happen is that there are multiple nics (or multiple
rx-queues in a single nic), and the skbs are allocated to fill the rx
ring(s). If some nics or queues are far more active than others, the
entries in the less busy nic/queue may end up referencing a page
block, while all of the other packets that referenced that block of
pages are freed.

The result of this is that the memory used by an appliance for its rx
rings can slowly grow to be much greater than it was originally.

This patch fixes that by giving each vmxnet3 device a per-rx-queue page
cache.

Signed-off-by: Todd Sabin <tsabin@...are.com>
Signed-off-by: Ronak Doshi <doshir@...are.com>
---
 drivers/net/vmxnet3/vmxnet3_drv.c | 30 ++++++++++++++++++++++++------
 drivers/net/vmxnet3/vmxnet3_int.h |  2 ++
 include/linux/skbuff.h            |  2 ++
 net/core/skbuff.c                 | 21 +++++++++++++++------
 4 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c
index 6e87f1fc4874..edcbc38c3ff6 100644
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -574,9 +574,11 @@ vmxnet3_rq_alloc_rx_buf(struct vmxnet3_rx_queue *rq, u32 ring_idx,
 
 		if (rbi->buf_type == VMXNET3_RX_BUF_SKB) {
 			if (rbi->skb == NULL) {
-				rbi->skb = __netdev_alloc_skb_ip_align(adapter->netdev,
-								       rbi->len,
-								       GFP_KERNEL);
+				rbi->skb = ___netdev_alloc_skb(adapter->netdev,
+							       rbi->len + NET_IP_ALIGN, GFP_KERNEL,
+							       &adapter->frag_cache[rq->qid]);
+				if (NET_IP_ALIGN && rbi->skb)
+					skb_reserve(rbi->skb, NET_IP_ALIGN);
 				if (unlikely(rbi->skb == NULL)) {
 					rq->stats.rx_buf_alloc_failure++;
 					break;
@@ -1421,8 +1423,11 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 			rxDataRingUsed =
 				VMXNET3_RX_DATA_RING(adapter, rcd->rqID);
 			len = rxDataRingUsed ? rcd->len : rbi->len;
-			new_skb = netdev_alloc_skb_ip_align(adapter->netdev,
-							    len);
+			new_skb = ___netdev_alloc_skb(adapter->netdev,
+						      rbi->len + NET_IP_ALIGN, GFP_ATOMIC,
+						      &adapter->frag_cache[rq->qid]);
+			if (NET_IP_ALIGN && new_skb)
+				skb_reserve(new_skb, NET_IP_ALIGN);
 			if (new_skb == NULL) {
 				/* Skb allocation failed, do not handover this
 				 * skb to stack. Reuse it. Drop the existing pkt
@@ -1483,6 +1488,7 @@ vmxnet3_rq_rx_complete(struct vmxnet3_rx_queue *rq,
 					     le32_to_cpu(rcd->rssHash),
 					     PKT_HASH_TYPE_L3);
 #endif
+			skb_record_rx_queue(ctx->skb, rq->qid);
 			skb_put(ctx->skb, rcd->len);
 
 			if (VMXNET3_VERSION_GE_2(adapter) &&
@@ -3652,7 +3658,7 @@ vmxnet3_remove_device(struct pci_dev *pdev)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct vmxnet3_adapter *adapter = netdev_priv(netdev);
-	int size = 0;
+	int size = 0, i;
 	int num_rx_queues;
 
 #ifdef VMXNET3_RSS
@@ -3691,6 +3697,18 @@ vmxnet3_remove_device(struct pci_dev *pdev)
 			  adapter->shared, adapter->shared_pa);
 	dma_unmap_single(&adapter->pdev->dev, adapter->adapter_pa,
 			 sizeof(struct vmxnet3_adapter), PCI_DMA_TODEVICE);
+	for (i = 0; i < VMXNET3_DEVICE_MAX_RX_QUEUES; i++) {
+		struct page *page;
+		struct page_frag_cache *nc;
+
+		nc = &adapter->frag_cache[i];
+		if (unlikely(!nc->va)) {
+			/* nothing to do */
+			continue;
+		}
+		page = virt_to_page(nc->va);
+		__page_frag_cache_drain(page, nc->pagecnt_bias);
+	}
 	free_netdev(netdev);
 }
 
diff --git a/drivers/net/vmxnet3/vmxnet3_int.h b/drivers/net/vmxnet3/vmxnet3_int.h
index e910596b79cf..7e8767007203 100644
--- a/drivers/net/vmxnet3/vmxnet3_int.h
+++ b/drivers/net/vmxnet3/vmxnet3_int.h
@@ -42,6 +42,7 @@
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 #include <asm/dma.h>
 #include <asm/page.h>
 
@@ -362,6 +363,7 @@ struct vmxnet3_adapter {
 	dma_addr_t			shared_pa;
 	dma_addr_t queue_desc_pa;
 	dma_addr_t coal_conf_pa;
+	struct page_frag_cache          frag_cache[VMXNET3_DEVICE_MAX_RX_QUEUES];
 
 	/* Wake-on-LAN */
 	u32     wol;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0a4e91a2f873..b57485016e04 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2841,6 +2841,8 @@ static inline void *netdev_alloc_frag_align(unsigned int fragsz,
 
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
 				   gfp_t gfp_mask);
+struct sk_buff *___netdev_alloc_skb(struct net_device *dev, unsigned int length,
+				    gfp_t gfp_mask, struct page_frag_cache *nc);
 
 /**
  *	netdev_alloc_skb - allocate an skbuff for rx on a specific device
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d380c7b5a12d..ee0611345f6c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -409,10 +409,11 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 EXPORT_SYMBOL(__netdev_alloc_frag_align);
 
 /**
- *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *	___netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
  *	@len: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb
+ *	@nc: page frag cache
  *
  *	Allocate a new &sk_buff and assign it a usage count of one. The
  *	buffer has NET_SKB_PAD headroom built in. Users should allocate
@@ -421,10 +422,9 @@ EXPORT_SYMBOL(__netdev_alloc_frag_align);
  *
  *	%NULL is returned if there is no free memory.
  */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
-				   gfp_t gfp_mask)
+struct sk_buff *___netdev_alloc_skb(struct net_device *dev, unsigned int len,
+				    gfp_t gfp_mask, struct page_frag_cache *nc)
 {
-	struct page_frag_cache *nc;
 	struct sk_buff *skb;
 	bool pfmemalloc;
 	void *data;
@@ -450,12 +450,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 		gfp_mask |= __GFP_MEMALLOC;
 
 	if (in_irq() || irqs_disabled()) {
-		nc = this_cpu_ptr(&netdev_alloc_cache);
+		if (!nc)
+			nc = this_cpu_ptr(&netdev_alloc_cache);
 		data = page_frag_alloc(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 	} else {
 		local_bh_disable();
-		nc = this_cpu_ptr(&napi_alloc_cache.page);
+		if (!nc)
+			nc = this_cpu_ptr(&napi_alloc_cache.page);
 		data = page_frag_alloc(nc, len, gfp_mask);
 		pfmemalloc = nc->pfmemalloc;
 		local_bh_enable();
@@ -481,6 +483,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 skb_fail:
 	return skb;
 }
+EXPORT_SYMBOL(___netdev_alloc_skb);
+
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+				   gfp_t gfp_mask)
+{
+	return ___netdev_alloc_skb(dev, len, gfp_mask, NULL);
+}
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
 /**
-- 
2.11.0