[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <d9814d6628599b7b28ed29c71d6fb6631123fdef.camel@gmail.com>
Date: Wed, 14 Aug 2024 14:00:21 -0700
From: Alexander H Duyck <alexander.duyck@...il.com>
To: Yunsheng Lin <linyunsheng@...wei.com>, davem@...emloft.net,
kuba@...nel.org, pabeni@...hat.com
Cc: netdev@...r.kernel.org, linux-kernel@...r.kernel.org, Andrew Morton
<akpm@...ux-foundation.org>, linux-mm@...ck.org
Subject: Re: [PATCH net-next v13 11/14] mm: page_frag: introduce
prepare/probe/commit API
On Thu, 2024-08-08 at 20:37 +0800, Yunsheng Lin wrote:
> There are many use cases that need a minimum amount of memory
> in order to make forward progress, but are more performant if
> more memory is available, or that need to probe the cache info
> to use any available memory for frag coalescing reasons.
>
> Currently skb_page_frag_refill() API is used to solve the
> above use cases, but caller needs to know about the internal
> detail and access the data field of 'struct page_frag' to
> meet the requirement of the above use cases and its
> implementation is similar to the one in mm subsystem.
>
> To unify those two page_frag implementations, introduce a
> prepare API to ensure minimum memory is satisfied and return
> how much the actual memory is available to the caller and a
> probe API to report the current available memory to caller
> without doing cache refilling. The caller needs to either call
> the commit API to report how much memory it actually uses, or
> not do so if deciding to not use any memory.
>
> CC: Alexander Duyck <alexander.duyck@...il.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
> ---
> include/linux/page_frag_cache.h | 75 ++++++++++++++++
> mm/page_frag_cache.c | 152 ++++++++++++++++++++++++++++----
> 2 files changed, 212 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 0abffdd10a1c..ba5d7f8a03cd 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -7,6 +7,8 @@
> #include <linux/build_bug.h>
> #include <linux/log2.h>
> #include <linux/types.h>
> +#include <linux/mm.h>
> +#include <linux/mmdebug.h>
> #include <linux/mm_types_task.h>
>
> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> @@ -67,6 +69,9 @@ static inline unsigned int page_frag_cache_page_size(unsigned long encoded_va)
>
> void page_frag_cache_drain(struct page_frag_cache *nc);
> void __page_frag_cache_drain(struct page *page, unsigned int count);
> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> + unsigned int *offset, unsigned int fragsz,
> + gfp_t gfp);
> void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
> unsigned int fragsz, gfp_t gfp_mask,
> unsigned int align_mask);
> @@ -79,12 +84,82 @@ static inline void *page_frag_alloc_va_align(struct page_frag_cache *nc,
> return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, -align);
> }
>
> +static inline unsigned int page_frag_cache_page_offset(const struct page_frag_cache *nc)
> +{
> + return page_frag_cache_page_size(nc->encoded_va) - nc->remaining;
> +}
> +
> static inline void *page_frag_alloc_va(struct page_frag_cache *nc,
> unsigned int fragsz, gfp_t gfp_mask)
> {
> return __page_frag_alloc_va_align(nc, fragsz, gfp_mask, ~0u);
> }
>
> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc, unsigned int *fragsz,
> + gfp_t gfp);
> +
> +static inline void *page_frag_alloc_va_prepare_align(struct page_frag_cache *nc,
> + unsigned int *fragsz,
> + gfp_t gfp,
> + unsigned int align)
> +{
> + WARN_ON_ONCE(!is_power_of_2(align) || align > PAGE_SIZE);
> + nc->remaining = nc->remaining & -align;
> + return page_frag_alloc_va_prepare(nc, fragsz, gfp);
> +}
> +
> +struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
> + unsigned int *offset,
> + unsigned int *fragsz, gfp_t gfp);
> +
> +struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
> + unsigned int *offset,
> + unsigned int *fragsz,
> + void **va, gfp_t gfp);
> +
> +static inline struct page *page_frag_alloc_probe(struct page_frag_cache *nc,
> + unsigned int *offset,
> + unsigned int *fragsz,
> + void **va)
> +{
> + unsigned long encoded_va = nc->encoded_va;
> + struct page *page;
> +
> + VM_BUG_ON(!*fragsz);
> + if (unlikely(nc->remaining < *fragsz))
> + return NULL;
> +
> + *va = encoded_page_address(encoded_va);
> + page = virt_to_page(*va);
> + *fragsz = nc->remaining;
> + *offset = page_frag_cache_page_size(encoded_va) - *fragsz;
> + *va += *offset;
> +
> + return page;
> +}
> +
I still think this should be populating a bio_vec instead of passing
multiple arguments by pointer. With that you would be able to get all
the fields without as many arguments having to be passed.
> +static inline void page_frag_alloc_commit(struct page_frag_cache *nc,
> + unsigned int fragsz)
> +{
> + VM_BUG_ON(fragsz > nc->remaining || !nc->pagecnt_bias);
> + nc->pagecnt_bias--;
> + nc->remaining -= fragsz;
> +}
> +
I would really like to see this accept a bio_vec as well. With that you
could verify the page and offset matches the expected value before
applying fragsz.
> +static inline void page_frag_alloc_commit_noref(struct page_frag_cache *nc,
> + unsigned int fragsz)
> +{
> + VM_BUG_ON(fragsz > nc->remaining);
> + nc->remaining -= fragsz;
> +}
> +
Same here.
> +static inline void page_frag_alloc_abort(struct page_frag_cache *nc,
> + unsigned int fragsz)
> +{
> + nc->pagecnt_bias++;
> + nc->remaining += fragsz;
> +}
> +
This doesn't add up. Why would you need abort if you have commit? Isn't
this more of a revert? I wouldn't think that would be valid as it is
possible you took some sort of action that might have resulted in this
memory already being shared. We shouldn't allow rewinding the offset
pointer without knowing that there are no other entities sharing the
page.
> void page_frag_free_va(void *addr);
>
> #endif
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index 27596b84b452..f8fad7d2cca8 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -19,27 +19,27 @@
> #include <linux/page_frag_cache.h>
> #include "internal.h"
>
> -static bool __page_frag_cache_reuse(unsigned long encoded_va,
> - unsigned int pagecnt_bias)
> +static struct page *__page_frag_cache_reuse(unsigned long encoded_va,
> + unsigned int pagecnt_bias)
> {
> struct page *page;
>
> page = virt_to_page((void *)encoded_va);
> if (!page_ref_sub_and_test(page, pagecnt_bias))
> - return false;
> + return NULL;
>
> if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> free_unref_page(page, encoded_page_order(encoded_va));
> - return false;
> + return NULL;
> }
>
> /* OK, page count is 0, we can safely set it */
> set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
> - return true;
> + return page;
> }
>
> -static bool __page_frag_cache_refill(struct page_frag_cache *nc,
> - gfp_t gfp_mask)
> +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
> + gfp_t gfp_mask)
> {
> unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
> struct page *page = NULL;
> @@ -55,7 +55,7 @@ static bool __page_frag_cache_refill(struct page_frag_cache *nc,
> page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
> if (unlikely(!page)) {
> memset(nc, 0, sizeof(*nc));
> - return false;
> + return NULL;
> }
>
> order = 0;
> @@ -69,29 +69,151 @@ static bool __page_frag_cache_refill(struct page_frag_cache *nc,
> */
> page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>
> - return true;
> + return page;
> }
>
> /* Reload cache by reusing the old cache if it is possible, or
> * refilling from the page allocator.
> */
> -static bool __page_frag_cache_reload(struct page_frag_cache *nc,
> - gfp_t gfp_mask)
> +static struct page *__page_frag_cache_reload(struct page_frag_cache *nc,
> + gfp_t gfp_mask)
> {
> + struct page *page;
> +
> if (likely(nc->encoded_va)) {
> - if (__page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias))
> + page = __page_frag_cache_reuse(nc->encoded_va, nc->pagecnt_bias);
> + if (page)
> goto out;
> }
>
> - if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
> - return false;
> + page = __page_frag_cache_refill(nc, gfp_mask);
> + if (unlikely(!page))
> + return NULL;
>
> out:
> /* reset page count bias and remaining to start of new frag */
> nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> nc->remaining = page_frag_cache_page_size(nc->encoded_va);
> - return true;
> + return page;
> +}
> +
None of the functions above need to return a page.
> +void *page_frag_alloc_va_prepare(struct page_frag_cache *nc,
> + unsigned int *fragsz, gfp_t gfp)
> +{
> + unsigned int remaining = nc->remaining;
> +
> + VM_BUG_ON(!*fragsz);
> + if (likely(remaining >= *fragsz)) {
> + unsigned long encoded_va = nc->encoded_va;
> +
> + *fragsz = remaining;
> +
> + return encoded_page_address(encoded_va) +
> + (page_frag_cache_page_size(encoded_va) - remaining);
> + }
> +
> + if (unlikely(*fragsz > PAGE_SIZE))
> + return NULL;
> +
> + /* When reload fails, nc->encoded_va and nc->remaining are both reset
> + * to zero, so there is no need to check the return value here.
> + */
> + __page_frag_cache_reload(nc, gfp);
> +
> + *fragsz = nc->remaining;
> + return encoded_page_address(nc->encoded_va);
> +}
> +EXPORT_SYMBOL(page_frag_alloc_va_prepare);
> +
> +struct page *page_frag_alloc_pg_prepare(struct page_frag_cache *nc,
> + unsigned int *offset,
> + unsigned int *fragsz, gfp_t gfp)
> +{
> + unsigned int remaining = nc->remaining;
> + struct page *page;
> +
> + VM_BUG_ON(!*fragsz);
> + if (likely(remaining >= *fragsz)) {
> + unsigned long encoded_va = nc->encoded_va;
> +
> + *offset = page_frag_cache_page_size(encoded_va) - remaining;
> + *fragsz = remaining;
> +
> + return virt_to_page((void *)encoded_va);
> + }
> +
> + if (unlikely(*fragsz > PAGE_SIZE))
> + return NULL;
> +
> + page = __page_frag_cache_reload(nc, gfp);
> + *offset = 0;
> + *fragsz = nc->remaining;
> + return page;
> +}
> +EXPORT_SYMBOL(page_frag_alloc_pg_prepare);
> +
> +struct page *page_frag_alloc_prepare(struct page_frag_cache *nc,
> + unsigned int *offset,
> + unsigned int *fragsz,
> + void **va, gfp_t gfp)
> +{
> + unsigned int remaining = nc->remaining;
> + struct page *page;
> +
> + VM_BUG_ON(!*fragsz);
> + if (likely(remaining >= *fragsz)) {
> + unsigned long encoded_va = nc->encoded_va;
> +
> + *offset = page_frag_cache_page_size(encoded_va) - remaining;
> + *va = encoded_page_address(encoded_va) + *offset;
> + *fragsz = remaining;
> +
> + return virt_to_page((void *)encoded_va);
> + }
> +
> + if (unlikely(*fragsz > PAGE_SIZE))
> + return NULL;
> +
> + page = __page_frag_cache_reload(nc, gfp);
> + *offset = 0;
> + *fragsz = nc->remaining;
> + *va = encoded_page_address(nc->encoded_va);
> +
> + return page;
> +}
> +EXPORT_SYMBOL(page_frag_alloc_prepare);
> +
> +struct page *page_frag_alloc_pg(struct page_frag_cache *nc,
> + unsigned int *offset, unsigned int fragsz,
> + gfp_t gfp)
> +{
> + unsigned int remaining = nc->remaining;
> + struct page *page;
> +
> + VM_BUG_ON(!fragsz);
> + if (likely(remaining >= fragsz)) {
> + unsigned long encoded_va = nc->encoded_va;
> +
> + *offset = page_frag_cache_page_size(encoded_va) -
> + remaining;
> +
> + return virt_to_page((void *)encoded_va);
> + }
> +
> + if (unlikely(fragsz > PAGE_SIZE))
> + return NULL;
> +
> + page = __page_frag_cache_reload(nc, gfp);
> + if (unlikely(!page))
> + return NULL;
> +
> + *offset = 0;
> + nc->remaining = remaining - fragsz;
> + nc->pagecnt_bias--;
> +
> + return page;
> }
> +EXPORT_SYMBOL(page_frag_alloc_pg);
Again, this isn't returning a page. It is essentially returning a
bio_vec without calling it as such. You might as well pass the bio_vec
pointer as an argument and just have it populate it directly.
It would be identical to the existing page_frag for all intents and
purposes. In addition you could use it as an intermediate value
between the prepare and commit calls on the page_frag_cache, as you
could make the size/bv_len the only item that can be adjusted,
specifically reduced, between the prepare and commit calls.
Powered by blists - more mailing lists