netdev - Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space for 'size' and 'pfmemalloc'

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <12a8b9ddbcb2da8431f77c5ec952ccfb2a77b7ec.camel@gmail.com>
Date: Mon, 01 Jul 2024 17:08:28 -0700
From: Alexander H Duyck <alexander.duyck@...il.com>
To: Yunsheng Lin <linyunsheng@...wei.com>, davem@...emloft.net,
 kuba@...nel.org,  pabeni@...hat.com
Cc: netdev@...r.kernel.org, linux-kernel@...r.kernel.org, Andrew Morton
	 <akpm@...ux-foundation.org>, linux-mm@...ck.org
Subject: Re: [PATCH net-next v9 06/13] mm: page_frag: reuse existing space
 for 'size' and 'pfmemalloc'

On Tue, 2024-06-25 at 21:52 +0800, Yunsheng Lin wrote:
> Currently there is one 'struct page_frag' for every 'struct
> sock' and 'struct task_struct', we are about to replace the
> 'struct page_frag' with 'struct page_frag_cache' for them.
> Before begin the replacing, we need to ensure the size of
> 'struct page_frag_cache' is not bigger than the size of
> 'struct page_frag', as there may be tens of thousands of
> 'struct sock' and 'struct task_struct' instances in the
> system.
> 
> By or'ing the page order & pfmemalloc with lower bits of
> 'va' instead of using 'u16' or 'u32' for page size and 'u8'
> for pfmemalloc, we are able to avoid 3 or 5 bytes space waste.
> And page address & pfmemalloc & order is unchanged for the
> same page in the same 'page_frag_cache' instance, it makes
> sense to fit them together.
> 
> Also, it is better to replace 'offset' with 'remaining', which
> is the remaining size for the cache in a 'page_frag_cache'
> instance, we are able to do a single 'fragsz > remaining'
> checking for the case of cache not being enough, which should be
> the fast path if we ensure size is zoro when 'va' == NULL by
> memset'ing 'struct page_frag_cache' in page_frag_cache_init()
> and page_frag_cache_drain().
> 
> After this patch, the size of 'struct page_frag_cache' should be
> the same as the size of 'struct page_frag'.
> 
> CC: Alexander Duyck <alexander.duyck@...il.com>
> Signed-off-by: Yunsheng Lin <linyunsheng@...wei.com>
> ---
>  include/linux/page_frag_cache.h | 76 +++++++++++++++++++++++-----
>  mm/page_frag_cache.c            | 90 ++++++++++++++++++++-------------
>  2 files changed, 118 insertions(+), 48 deletions(-)
> 
> diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h
> index 6ac3a25089d1..b33904d4494f 100644
> --- a/include/linux/page_frag_cache.h
> +++ b/include/linux/page_frag_cache.h
> @@ -8,29 +8,81 @@
>  #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
>  #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
>  
> -struct page_frag_cache {
> -	void *va;
> +/*
> + * struct encoded_va - a nonexistent type marking this pointer
> + *
> + * An 'encoded_va' pointer is a pointer to a aligned virtual address, which is
> + * at least aligned to PAGE_SIZE, that means there are at least 12 lower bits
> + * space available for other purposes.
> + *
> + * Currently we use the lower 8 bits and bit 9 for the order and PFMEMALLOC
> + * flag of the page this 'va' is corresponding to.
> + *
> + * Use the supplied helper functions to endcode/decode the pointer and bits.
> + */
> +struct encoded_va;
> +

Why did you create a struct for this? The way you use it below it is
just a pointer. No point in defining a struct that doesn't exist
anywhere.

> +#define PAGE_FRAG_CACHE_ORDER_MASK		GENMASK(7, 0)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_BIT		BIT(8)
> +#define PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT	8
> +
> +static inline struct encoded_va *encode_aligned_va(void *va,
> +						   unsigned int order,
> +						   bool pfmemalloc)
> +{
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -	__u16 offset;
> -	__u16 size;
> +	return (struct encoded_va *)((unsigned long)va | order |
> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
>  #else
> -	__u32 offset;
> +	return (struct encoded_va *)((unsigned long)va |
> +			pfmemalloc << PAGE_FRAG_CACHE_PFMEMALLOC_SHIFT);
> +#endif
> +}
> +
> +static inline unsigned long encoded_page_order(struct encoded_va *encoded_va)
> +{
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> +	return PAGE_FRAG_CACHE_ORDER_MASK & (unsigned long)encoded_va;
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static inline bool encoded_page_pfmemalloc(struct encoded_va *encoded_va)
> +{
> +	return PAGE_FRAG_CACHE_PFMEMALLOC_BIT & (unsigned long)encoded_va;
> +}
> +

My advice is that if you just make encoded_va an unsigned long this
just becomes some FIELD_GET and bit operations.

> +static inline void *encoded_page_address(struct encoded_va *encoded_va)
> +{
> +	return (void *)((unsigned long)encoded_va & PAGE_MASK);
> +}
> +
> +struct page_frag_cache {
> +	struct encoded_va *encoded_va;

This should be an unsigned long, not a pointer since you are storing
data other than just a pointer in here. The pointer is just one of the
things you extract out of it.

> +
> +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) && (BITS_PER_LONG <= 32)
> +	u16 pagecnt_bias;
> +	u16 remaining;
> +#else
> +	u32 pagecnt_bias;
> +	u32 remaining;
>  #endif
> -	/* we maintain a pagecount bias, so that we dont dirty cache line
> -	 * containing page->_refcount every time we allocate a fragment.
> -	 */
> -	unsigned int		pagecnt_bias;
> -	bool pfmemalloc;
>  };
>  
>  static inline void page_frag_cache_init(struct page_frag_cache *nc)
>  {
> -	nc->va = NULL;
> +	memset(nc, 0, sizeof(*nc));

Shouldn't need to memset 0 the whole thing. Just setting page and order
to 0 should be enough to indicate that there isn't anything there.

>  }
>  
>  static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc)
>  {
> -	return !!nc->pfmemalloc;
> +	return encoded_page_pfmemalloc(nc->encoded_va);
> +}
> +
> +static inline unsigned int page_frag_cache_page_size(struct encoded_va *encoded_va)
> +{
> +	return PAGE_SIZE << encoded_page_order(encoded_va);
>  }
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc);
> diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
> index dd640af5607a..a3316dd50eff 100644
> --- a/mm/page_frag_cache.c
> +++ b/mm/page_frag_cache.c
> @@ -18,34 +18,61 @@
>  #include <linux/page_frag_cache.h>
>  #include "internal.h"
>  
> +static void *page_frag_cache_current_va(struct page_frag_cache *nc)
> +{
> +	struct encoded_va *encoded_va = nc->encoded_va;
> +
> +	return (void *)(((unsigned long)encoded_va & PAGE_MASK) |
> +		(page_frag_cache_page_size(encoded_va) - nc->remaining));
> +}
> +

Rather than an OR here I would rather see this just use addition.
Otherwise this logic becomes overly complicated.

>  static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
>  					     gfp_t gfp_mask)
>  {
>  	struct page *page = NULL;
>  	gfp_t gfp = gfp_mask;
> +	unsigned int order;
>  
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>  	gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) |  __GFP_COMP |
>  		   __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
>  	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
>  				PAGE_FRAG_CACHE_MAX_ORDER);
> -	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
>  #endif
> -	if (unlikely(!page))
> +	if (unlikely(!page)) {
>  		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
> +		if (unlikely(!page)) {
> +			memset(nc, 0, sizeof(*nc));
> +			return NULL;
> +		}
> +
> +		order = 0;
> +		nc->remaining = PAGE_SIZE;
> +	} else {
> +		order = PAGE_FRAG_CACHE_MAX_ORDER;
> +		nc->remaining = PAGE_FRAG_CACHE_MAX_SIZE;
> +	}
>  
> -	nc->va = page ? page_address(page) : NULL;
> +	/* Even if we own the page, we do not use atomic_set().
> +	 * This would break get_page_unless_zero() users.
> +	 */
> +	page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
>  
> +	/* reset page count bias of new frag */
> +	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;

I would rather keep the pagecnt_bias, page reference addition, and
resetting of remaining outside of this. The only fields we should be
setting are order, the virtual address, and pfmemalloc since those are
what is encoded in your unsigned long variable.

> +	nc->encoded_va = encode_aligned_va(page_address(page), order,
> +					   page_is_pfmemalloc(page));
>  	return page;
>  }
>  
>  void page_frag_cache_drain(struct page_frag_cache *nc)
>  {
> -	if (!nc->va)
> +	if (!nc->encoded_va)
>  		return;
>  
> -	__page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
> -	nc->va = NULL;
> +	__page_frag_cache_drain(virt_to_head_page(nc->encoded_va),
> +				nc->pagecnt_bias);
> +	memset(nc, 0, sizeof(*nc));

Again, no need for memset when "nv->encoded_va = 0" will do.

>  }
>  EXPORT_SYMBOL(page_frag_cache_drain);
>  
> @@ -62,51 +89,41 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  				 unsigned int fragsz, gfp_t gfp_mask,
>  				 unsigned int align_mask)
>  {
> -	unsigned int size = PAGE_SIZE;
> +	struct encoded_va *encoded_va = nc->encoded_va;
>  	struct page *page;
> -	int offset;
> +	int remaining;
> +	void *va;
>  
> -	if (unlikely(!nc->va)) {
> +	if (unlikely(!encoded_va)) {
>  refill:
> -		page = __page_frag_cache_refill(nc, gfp_mask);
> -		if (!page)
> +		if (unlikely(!__page_frag_cache_refill(nc, gfp_mask)))
>  			return NULL;
>  
> -		/* Even if we own the page, we do not use atomic_set().
> -		 * This would break get_page_unless_zero() users.
> -		 */
> -		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
> -
> -		/* reset page count bias and offset to start of new frag */
> -		nc->pfmemalloc = page_is_pfmemalloc(page);
> -		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		nc->offset = 0;
> +		encoded_va = nc->encoded_va;
>  	}
>  
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -	/* if size can vary use size else just use PAGE_SIZE */
> -	size = nc->size;
> -#endif
> -
> -	offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
> -	if (unlikely(offset + fragsz > size)) {
> -		page = virt_to_page(nc->va);
> -
> +	remaining = nc->remaining & align_mask;
> +	remaining -= fragsz;
> +	if (unlikely(remaining < 0)) {

Now this is just getting confusing. You essentially just added an
additional addition step and went back to the countdown approach I was
using before except for the fact that you are starting at 0 whereas I
was actually moving down through the page.

What I would suggest doing since "remaining" is a negative offset
anyway would be to look at just storing it as a signed negative number.
At least with that you can keep to your original approach and would
only have to change your check to be for "remaining + fragsz <= 0".
With that you can still do your math but it becomes an addition instead
of a subtraction.

> +		page = virt_to_page(encoded_va);
>  		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
>  			goto refill;
>  
> -		if (unlikely(nc->pfmemalloc)) {
> -			free_unref_page(page, compound_order(page));
> +		if (unlikely(encoded_page_pfmemalloc(encoded_va))) {
> +			VM_BUG_ON(compound_order(page) !=
> +				  encoded_page_order(encoded_va));
> +			free_unref_page(page, encoded_page_order(encoded_va));
>  			goto refill;
>  		}
>  
>  		/* OK, page count is 0, we can safely set it */
>  		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
>  
> -		/* reset page count bias and offset to start of new frag */
> +		/* reset page count bias and remaining of new frag */
>  		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
> -		offset = 0;
> -		if (unlikely(fragsz > PAGE_SIZE)) {
> +		nc->remaining = remaining = page_frag_cache_page_size(encoded_va);
> +		remaining -= fragsz;
> +		if (unlikely(remaining < 0)) {
>  			/*
>  			 * The caller is trying to allocate a fragment
>  			 * with fragsz > PAGE_SIZE but the cache isn't big

I find it really amusing that you went to all the trouble of flipping
the logic just to flip it back to being a countdown setup. If you were
going to bother with all that then why not just make the remaining
negative instead? You could save yourself a ton of trouble that way and
all you would need to do is flip a few signs.

> @@ -120,10 +137,11 @@ void *__page_frag_alloc_va_align(struct page_frag_cache *nc,
>  		}
>  	}
>  
> +	va = page_frag_cache_current_va(nc);
>  	nc->pagecnt_bias--;
> -	nc->offset = offset + fragsz;
> +	nc->remaining = remaining;
>  
> -	return nc->va + offset;
> +	return va;
>  }
>  EXPORT_SYMBOL(__page_frag_alloc_va_align);
>  

Not sure I am huge fan of the way the order of operations has to get so
creative for this to work.  Not that I see a better way to do it, but
my concern is that this is going to add technical debt as I can easily
see somebody messing up the order of things at some point in the future
and generating a bad pointer.