Message-ID: <20161220132827.18788.8658.stgit@firesoul>
Date: Tue, 20 Dec 2016 14:28:28 +0100
From: Jesper Dangaard Brouer <brouer@...hat.com>
To: linux-mm@...ck.org, Alexander Duyck <alexander.duyck@...il.com>
Cc: willemdebruijn.kernel@...il.com, netdev@...r.kernel.org,
john.fastabend@...il.com, Saeed Mahameed <saeedm@...lanox.com>,
Jesper Dangaard Brouer <brouer@...hat.com>,
bjorn.topel@...el.com,
Alexei Starovoitov <alexei.starovoitov@...il.com>,
Tariq Toukan <tariqt@...lanox.com>
Subject: [RFC PATCH 4/4] page_pool: change refcnt model
This is the direction the patch is going, after Mel's comments.
Most significant change: the refcnt must now reach zero (and not 1)
before the page gets into the recycle ring. Pages on the
pp_alloc_cache keep a refcnt==1 invariant, as this allows fast direct
recycling (as in the XDP_DROP case).
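
A rough usage sketch of the two recycle paths as seen from a driver
(pseudo driver code, the xdp_action variable is just a placeholder):

	if (xdp_action == XDP_DROP) {
		/* refcnt==1 invariant: return page directly to the
		 * pp_alloc_cache (requires safe/NAPI context)
		 */
		page_pool_recycle_direct(page);
	} else {
		/* Normal path: when the last reference is dropped,
		 * __put_page()/__free_pages_ok() sees PagePool() and
		 * calls page_pool_recycle(), placing the page on the
		 * ptr_ring with refcnt==0.
		 */
		put_page(page);
	}
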
When the mlx5 page recycle cache didn't work (at net-next commit
f5f99309fa74) the benchmarks showed the gain was reduced to 14% by
this patch, an added cost of approx 133 cycles (which was a higher
cycle cost than expected).

UPDATE: at net-next commit 52f40e9d65 this patch shows no gain,
perhaps a small performance regression. The accuracy of the UDP
measurements is not good enough to conclude on: ksoftirqd +1.4% and
UDP side -0.89%. The TC ingress drop test is more significant and
shows a 4.3% slowdown.

Thus, this patch makes the page_pool slower than the driver-specific
page recycle cache. More optimizations are pending for the page_pool,
so this can likely be regained.

The page_pool will still show a benefit for use-cases where the
driver page recycle cache doesn't work (>128 outstanding
packets/pages).

Signed-off-by: Jesper Dangaard Brouer <brouer@...hat.com>
---
include/linux/mm.h | 5 --
include/linux/page_pool.h | 10 +++
mm/page_alloc.c | 16 ++---
mm/page_pool.c | 141 +++++++++++++++++++--------------------------
mm/swap.c | 3 +
5 files changed, 79 insertions(+), 96 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 11b4d8fb280b..7315c1790f7c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -766,11 +766,6 @@ static inline void put_page(struct page *page)
{
page = compound_head(page);
- if (PagePool(page)) {
- page_pool_put_page(page);
- return;
- }
-
if (put_page_testzero(page))
__put_page(page);
diff --git a/include/linux/page_pool.h b/include/linux/page_pool.h
index 6f8f2ff6d758..40da1fac573d 100644
--- a/include/linux/page_pool.h
+++ b/include/linux/page_pool.h
@@ -112,6 +112,7 @@ struct page_pool {
* wise, because free's can happen on remote CPUs, with no
* association with allocation resource.
*
+ * XXX: Mel says drop comment
* For now use ptr_ring, as it separates consumer and
* producer, which is a common use-case. The ptr_ring is not
* thought of as the final data structure, expecting this to
@@ -145,6 +146,7 @@ void page_pool_destroy(struct page_pool *pool);
/* Never call this directly, use helpers below */
void __page_pool_put_page(struct page *page, bool allow_direct);
+/* XXX: Mel: needs descriptions */
static inline void page_pool_put_page(struct page *page)
{
__page_pool_put_page(page, false);
@@ -155,4 +157,12 @@ static inline void page_pool_recycle_direct(struct page *page)
__page_pool_put_page(page, true);
}
+/*
+ * Called when refcnt reaches zero. On failure the page_pool state is
+ * cleared, and the caller can return the page to the page allocator.
+ */
+bool page_pool_recycle(struct page *page);
+// XXX: compile-out trick, let this return false at compile time,
+// or let the PagePool() check compile to false.
+
#endif /* _LINUX_PAGE_POOL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 655db05f0c1c..5a68bdbc9dc1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1240,6 +1240,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
int migratetype;
unsigned long pfn = page_to_pfn(page);
+ if (PagePool(page) && page_pool_recycle(page))
+ return;
+
if (!free_pages_prepare(page, order, true))
return;
@@ -2448,6 +2451,9 @@ void free_hot_cold_page(struct page *page, bool cold)
unsigned long pfn = page_to_pfn(page);
int migratetype;
+ if (PagePool(page) && page_pool_recycle(page))
+ return;
+
if (!free_pcp_prepare(page))
return;
@@ -3873,11 +3879,6 @@ EXPORT_SYMBOL(get_zeroed_page);
void __free_pages(struct page *page, unsigned int order)
{
- if (PagePool(page)) {
- page_pool_put_page(page);
- return;
- }
-
if (put_page_testzero(page)) {
if (order == 0)
free_hot_cold_page(page, false);
@@ -4005,11 +4006,6 @@ void __free_page_frag(void *addr)
{
struct page *page = virt_to_head_page(addr);
- if (PagePool(page)) {
- page_pool_put_page(page);
- return;
- }
-
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
}
diff --git a/mm/page_pool.c b/mm/page_pool.c
index 74138d5fe86d..064034d89f8a 100644
--- a/mm/page_pool.c
+++ b/mm/page_pool.c
@@ -21,14 +21,15 @@
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
+#include "internal.h" /* for set_page_refcounted() */
/*
* The struct page_pool (likely) cannot be embedded into another
* structure, because freeing this struct depends on outstanding pages,
* which can point back to the page_pool. Thus, don't export "init".
*/
-int page_pool_init(struct page_pool *pool,
- const struct page_pool_params *params)
+static int page_pool_init(struct page_pool *pool,
+ const struct page_pool_params *params)
{
int ring_qsize = 1024; /* Default */
int param_copy_sz;
@@ -108,40 +109,33 @@ EXPORT_SYMBOL(page_pool_create);
/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
+ struct ptr_ring *r;
struct page *page;
- /* FIXME: use another test for safe-context, caller should
- * simply provide this guarantee
- */
- if (likely(in_serving_softirq())) { // FIXME add use of PP_FLAG_NAPI
- struct ptr_ring *r;
-
- if (likely(pool->alloc.count)) {
- /* Fast-path */
- page = pool->alloc.cache[--pool->alloc.count];
- return page;
- }
- /* Slower-path: Alloc array empty, time to refill */
- r = &pool->ring;
- /* Open-coded bulk ptr_ring consumer.
- *
- * Discussion: ATM the ring consumer lock is not
- * really needed due to the softirq/NAPI protection,
- * but later MM-layer need the ability to reclaim
- * pages on the ring. Thus, keeping the locks.
- */
- spin_lock(&r->consumer_lock);
- while ((page = __ptr_ring_consume(r))) {
- if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
- break;
- pool->alloc.cache[pool->alloc.count++] = page;
- }
- spin_unlock(&r->consumer_lock);
+ /* Caller guarantees safe context for accessing alloc.cache */
+ if (likely(pool->alloc.count)) {
+ /* Fast-path */
+ page = pool->alloc.cache[--pool->alloc.count];
return page;
}
- /* Slow-path: Get page from locked ring queue */
- page = ptr_ring_consume(&pool->ring);
+ /* Slower-path: Alloc array empty, time to refill */
+ r = &pool->ring;
+ /* Open-coded bulk ptr_ring consumer.
+ *
+ * Discussion: ATM ring *consumer* lock is not really needed
+ * due to caller protection, but later the MM-layer needs the
+ * ability to reclaim pages from the ring. Thus, keeping the locks.
+ */
+ spin_lock(&r->consumer_lock);
+ while ((page = __ptr_ring_consume(r))) {
+ /* Pages on ring refcnt==0, on alloc.cache refcnt==1 */
+ set_page_refcounted(page);
+ if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
+ break;
+ pool->alloc.cache[pool->alloc.count++] = page;
+ }
+ spin_unlock(&r->consumer_lock);
return page;
}
@@ -290,15 +284,9 @@ static void __page_pool_clean_page(struct page *page)
/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page *page)
{
- struct page_pool *pool = page->pool;
-
+ VM_BUG_ON_PAGE(page_ref_count(page) != 0, page);
__page_pool_clean_page(page);
- /*
- * Given page pool state and flags were just cleared, the page
- * must be freed here. Thus, code invariant assumes
- * refcnt==1, as __free_pages() call put_page_testzero().
- */
- __free_pages(page, pool->p.order);
+ __put_page(page);
}
bool __page_pool_recycle_into_ring(struct page_pool *pool,
@@ -332,70 +320,61 @@ bool __page_pool_recycle_into_ring(struct page_pool *pool,
*
* Caller must provide appropriate safe context.
*/
-static bool __page_pool_recycle_direct(struct page *page,
+// noinline /* hack for perf-record test */
+static
+bool __page_pool_recycle_direct(struct page *page,
struct page_pool *pool)
{
- // BUG_ON(!in_serving_softirq());
+ VM_BUG_ON_PAGE(page_ref_count(page) != 1, page);
+ /* page refcnt==1 invariant on alloc.cache */
if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
return false;
- /* Caller MUST have verified/know (page_ref_count(page) == 1) */
pool->alloc.cache[pool->alloc.count++] = page;
return true;
}
-void __page_pool_put_page(struct page *page, bool allow_direct)
+/*
+ * Called when refcnt reaches zero. On failure the page_pool state is
+ * cleared, and the caller can return the page to the page allocator.
+ */
+bool page_pool_recycle(struct page *page)
{
struct page_pool *pool = page->pool;
- /* This is a fast-path optimization, that avoids an atomic
- * operation, in the case where a single object is (refcnt)
- * using the page.
- *
- * refcnt == 1 means page_pool owns page, and can recycle it.
- */
- if (likely(page_ref_count(page) == 1)) {
- /* Read barrier implicit paired with full MB of atomic ops */
- smp_rmb();
-
- if (allow_direct)
- if (__page_pool_recycle_direct(page, pool))
- return;
+ VM_BUG_ON_PAGE(page_ref_count(page) != 0, page);
- if (!__page_pool_recycle_into_ring(pool, page)) {
- /* Cache full, do real __free_pages() */
- __page_pool_return_page(page);
- }
- return;
- }
- /*
- * Many drivers splitting up the page into fragments, and some
- * want to keep doing this to save memory. The put_page_testzero()
- * function as a refcnt decrement, and should not return true.
- */
- if (unlikely(put_page_testzero(page))) {
- /*
- * Reaching refcnt zero should not be possible,
- * indicate code error. Don't crash but warn, handle
- * case by not-recycling, but return page to page
- * allocator.
- */
- WARN(1, "%s() violating page_pool invariance refcnt:%d\n",
- __func__, page_ref_count(page));
- /* Cleanup state before directly returning page */
+ /* Pages on recycle ring have refcnt==0 */
+ if (!__page_pool_recycle_into_ring(pool, page)) {
__page_pool_clean_page(page);
- __put_page(page);
+ return false;
}
+ return true;
+}
+EXPORT_SYMBOL(page_pool_recycle);
+
+void __page_pool_put_page(struct page *page, bool allow_direct)
+{
+ struct page_pool *pool = page->pool;
+
+ if (allow_direct && (page_ref_count(page) == 1))
+ if (__page_pool_recycle_direct(page, pool))
+ return;
+
+ if (put_page_testzero(page))
+ if (!page_pool_recycle(page))
+ __put_page(page);
+
}
EXPORT_SYMBOL(__page_pool_put_page);
-static void __destructor_put_page(void *ptr)
+void __destructor_return_page(void *ptr)
{
struct page *page = ptr;
/* Verify the refcnt invariant of cached pages */
- if (!(page_ref_count(page) == 1)) {
+ if (page_ref_count(page) != 0) {
pr_crit("%s() page_pool refcnt %d violation\n",
__func__, page_ref_count(page));
BUG();
@@ -407,7 +386,7 @@ static void __destructor_put_page(void *ptr)
void page_pool_destroy(struct page_pool *pool)
{
/* Empty recycle ring */
- ptr_ring_cleanup(&pool->ring, __destructor_put_page);
+ ptr_ring_cleanup(&pool->ring, __destructor_return_page);
/* FIXME-mem-leak: cleanup array/stack cache
* pool->alloc. Driver usually will destroy RX ring after
diff --git a/mm/swap.c b/mm/swap.c
index 4dcf852e1e6d..d71c896cb1a1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -96,6 +96,9 @@ static void __put_compound_page(struct page *page)
void __put_page(struct page *page)
{
+ if (PagePool(page) && page_pool_recycle(page))
+ return;
+
if (unlikely(PageCompound(page)))
__put_compound_page(page);
else