Message-Id: <20180315195329.7787-3-willy@infradead.org>
Date: Thu, 15 Mar 2018 12:53:29 -0700
From: Matthew Wilcox <willy@...radead.org>
To: Alexander Duyck <alexander.h.duyck@...el.com>
Cc: linux-mm@...r.kernel.org, netdev@...r.kernel.org,
Matthew Wilcox <mawilcox@...rosoft.com>
Subject: [RFC 2/2] page_frag_cache: Store metadata in struct page
From: Matthew Wilcox <mawilcox@...rosoft.com>
Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
currently-in-use struct page) by using the page's refcount directly
(instead of maintaining a bias) and storing our current progress through
the page in the same bits currently used for page->index. We no longer
need to cache the page's pfmemalloc state; since we store the page
directly, callers can query it themselves.
On the downside, we now call page_address() on every allocation and do
an atomic_inc() rather than a non-atomic decrement, but we should touch
the same number of cachelines, and there is far less (and simpler) code.
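For reference, a userspace sketch of the before/after layouts
(fixed-width stand-ins for the kernel types; LP64 assumed) reproduces
the 24 -> 8 byte arithmetic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct page;				/* opaque; only the pointer matters */

struct old_page_frag_cache {		/* the PAGE_SIZE < 32768 variant */
	void *va;
	uint16_t offset;
	uint16_t size;
	unsigned int pagecnt_bias;
	bool pfmemalloc;
};					/* 8 + 2 + 2 + 4 + 1, padded to 24 */

struct new_page_frag_cache {
	struct page *page;
};					/* a bare pointer: 8 */

int main(void)
{
	printf("old %zu, new %zu\n", sizeof(struct old_page_frag_cache),
	       sizeof(struct new_page_frag_cache));
	return 0;
}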
Signed-off-by: Matthew Wilcox <mawilcox@...rosoft.com>
---
include/linux/mm_types.h | 17 +-----
mm/page_alloc.c | 135 ++++++++++++++++++++++++-----------------------
net/core/skbuff.c | 4 +-
3 files changed, 74 insertions(+), 82 deletions(-)
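As an aside, the scheme the mm/page_alloc.c hunk below implements can be
modelled in userspace in a few lines. This is a sketch only: the mock_*
names are illustrative, the refcount is a plain integer rather than the
kernel's atomic page refcount, and the page-reuse fast path in
__page_frag_cache_refill() is omitted.

#include <stdlib.h>

#define MOCK_PAGE_SIZE 4096u

struct mock_page {
	unsigned int refcount;	/* one per live fragment, plus the cache */
	unsigned int offset;	/* highwater mark, counts down */
	unsigned char data[MOCK_PAGE_SIZE];
};

struct mock_frag_cache {
	struct mock_page *page;
};

static void mock_put(struct mock_page *page)
{
	if (--page->refcount == 0)
		free(page);
}

static void *mock_frag_alloc(struct mock_frag_cache *nc, unsigned int size)
{
	struct mock_page *page = nc->page;

	/* Refill when empty or when the remaining space is too small */
	if (!page || page->offset < size) {
		if (size > MOCK_PAGE_SIZE)
			return NULL;
		page = malloc(sizeof(*page));
		if (!page)
			return NULL;
		page->refcount = 1;	/* the cache's own reference */
		page->offset = MOCK_PAGE_SIZE;
		if (nc->page)
			mock_put(nc->page);	/* drop ref on the old page */
		nc->page = page;
	}

	page->refcount++;	/* one reference per fragment handed out */
	page->offset -= size;
	return page->data + page->offset;
}

int main(void)
{
	struct mock_frag_cache nc = { 0 };
	void *a = mock_frag_alloc(&nc, 256);	/* refcount 2, offset 3840 */
	void *b = mock_frag_alloc(&nc, 512);	/* refcount 3, offset 3328 */

	(void)a; (void)b;
	mock_put(nc.page);	/* fragment a done */
	mock_put(nc.page);	/* fragment b done */
	mock_put(nc.page);	/* cache drops its own ref; page is freed */
	return 0;
}

Freeing a real fragment is just a put_page() on its page; the kernel gets
from the fragment address back to the page with virt_to_head_page(), which
relies on natural page alignment that the model above does not reproduce.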
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c5dea402501..f922cb62bd91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -90,6 +90,7 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
+ unsigned int offset; /* page_frag highwater mark */
/* page_deferred_list().prev -- second tail page */
};
@@ -219,22 +220,8 @@ struct page {
#endif
} _struct_page_alignment;
-#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
struct page_frag_cache {
- void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- __u16 offset;
- __u16 size;
-#else
- __u32 offset;
-#endif
- /* we maintain a pagecount bias, so that we dont dirty cache line
- * containing page->_refcount every time we allocate a fragment.
- */
- unsigned int pagecnt_bias;
- bool pfmemalloc;
+ struct page *page;
};
typedef unsigned long vm_flags_t;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a9c14214ed2..f8a176aab287 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4319,34 +4319,72 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
- * Page Fragment:
- * An arbitrary-length arbitrary-offset area of memory which resides
- * within a 0 or higher order page. Multiple fragments within that page
- * are individually refcounted, in the page's reference counter.
+ * The page fragment allocator is simple, yet effective. It allocates
+ * pages from the page allocator, then hands out fragments of those
+ * pages to its callers. It makes no effort to track which parts of
+ * the page remain in use, always allocating fresh memory. The page
+ * reference count is used to keep track of whether any fragment is
+ * still in use; when all fragments in a page have been freed, the
+ * entire page is returned to the page allocator.
*
- * The page_frag functions below provide a simple allocation framework for
- * page fragments. This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ * The page fragment allocator performs no locking. The caller is
+ * expected to ensure that two callers cannot simultaneously allocate
+ * from the same page_frag_cache. Freeing is atomic and is permitted
+ * to happen simultaneously with other frees or an allocation.
+ *
+ * The allocator uses the struct page to store its state. The 'offset'
+ * field in struct page is used to track how far through the page the
+ * allocation has proceeded. The 'refcount' field is used to track
+ * how many fragments from this page are still in use. All other
+ * fields in struct page may be used by the owner of the page_frag_cache.
+ * The refcount is incremented by one while the page is still actively being
+ * allocated from; this prevents it from being freed prematurely.
*/
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
- gfp_t gfp_mask)
+
+#define PAGE_FRAG_ALLOC_SIZE (64 * 1024)
+#define PAGE_FRAG_ORDER get_order(PAGE_FRAG_ALLOC_SIZE)
+
+static noinline
+struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+ unsigned int size, gfp_t gfp_mask)
{
+ struct page *old = nc->page;
struct page *page = NULL;
- gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
- __GFP_NOMEMALLOC;
- page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
- PAGE_FRAG_CACHE_MAX_ORDER);
- nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
- if (unlikely(!page))
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
- nc->va = page ? page_address(page) : NULL;
+ if (size > PAGE_FRAG_ALLOC_SIZE)
+ return NULL;
+
+ /*
+ * If all the previous allocations from this page have already been
+ * freed, reuse the page if it can satisfy this allocation.
+ */
+ if (old && page_ref_count(old) == 1) {
+ unsigned int offset = PAGE_SIZE << compound_order(old);
+
+ if (offset > size) {
+ old->offset = offset;
+ return old;
+ }
+ }
+
+ if (PAGE_FRAG_ORDER > 0) {
+ gfp_t gfp = gfp_mask | __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC;
+
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, PAGE_FRAG_ORDER);
+ if (unlikely(!page) && size > PAGE_SIZE)
+ return NULL;
+ }
+ if (unlikely(!page))
+ page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
+ if (unlikely(!page))
+ return NULL;
+
+ if (old)
+ put_page(old);
+ nc->page = page;
+ page->offset = PAGE_SIZE << compound_order(page);
return page;
}
@@ -4366,56 +4404,23 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
EXPORT_SYMBOL(__page_frag_cache_drain);
void *page_frag_alloc(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask)
+ unsigned int size, gfp_t gfp_mask)
{
- unsigned int size = PAGE_SIZE;
- struct page *page;
- int offset;
+ struct page *page = nc->page;
+ unsigned int offset = page ? page->offset : 0;
- if (unlikely(!nc->va)) {
-refill:
- page = __page_frag_cache_refill(nc, gfp_mask);
+ if (unlikely(!page || offset < size)) {
+ page = __page_frag_cache_refill(nc, size, gfp_mask);
if (!page)
return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- /* if size can vary use size else just use PAGE_SIZE */
- size = nc->size;
-#endif
- /* Even if we own the page, we do not use atomic_set().
- * This would break get_page_unless_zero() users.
- */
- page_ref_add(page, size - 1);
-
- /* reset page count bias and offset to start of new frag */
- nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
- nc->offset = size;
- }
-
- offset = nc->offset - fragsz;
- if (unlikely(offset < 0)) {
- page = virt_to_page(nc->va);
-
- if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
- goto refill;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- /* if size can vary use size else just use PAGE_SIZE */
- size = nc->size;
-#endif
- /* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
-
- /* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
- offset = size - fragsz;
+ offset = page->offset;
}
- nc->pagecnt_bias--;
- nc->offset = offset;
+ page_ref_inc(page);
+ offset -= size;
+ page->offset = offset;
- return nc->va + offset;
+ return page_address(page) + offset;
}
EXPORT_SYMBOL(page_frag_alloc);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09bd89c90a71..59df4db31aed 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -412,7 +412,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_is_pfmemalloc(nc->page);
local_irq_restore(flags);
@@ -486,7 +486,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
}
/* use OR instead of assignment to avoid clearing of bits in mask */
- if (nc->page.pfmemalloc)
+ if (page_is_pfmemalloc(nc->page.page))
skb->pfmemalloc = 1;
skb->head_frag = 1;
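For completeness, the free side is untouched by this patch. In mainline
at the time it is roughly the following; it is this plain atomic put that
lets frees race safely with allocation and with each other:

void page_frag_free(void *addr)
{
	struct page *page = virt_to_head_page(addr);

	if (unlikely(put_page_testzero(page)))
		__free_pages_ok(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);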
--
2.16.2