linux-kernel - Re: [PATCH v2 8/9] mm/swap: introduce a helper for swapin without vmfault

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <875y039utw.fsf@yhuang6-desk2.ccr.corp.intel.com>
Date: Tue, 09 Jan 2024 09:08:59 +0800
From: "Huang, Ying" <ying.huang@...el.com>
To: Kairui Song <ryncsn@...il.com>
Cc: linux-mm@...ck.org,  Kairui Song <kasong@...cent.com>,  Andrew Morton
 <akpm@...ux-foundation.org>,  Chris Li <chrisl@...nel.org>,  Hugh Dickins
 <hughd@...gle.com>,  Johannes Weiner <hannes@...xchg.org>,  Matthew Wilcox
 <willy@...radead.org>,  Michal Hocko <mhocko@...e.com>,  Yosry Ahmed
 <yosryahmed@...gle.com>,  David Hildenbrand <david@...hat.com>,
  linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 8/9] mm/swap: introduce a helper for swapin without
 vmfault

Kairui Song <ryncsn@...il.com> writes:

> From: Kairui Song <kasong@...cent.com>
>
> There are two places where swapin is not caused by direct anon page fault:
> - shmem swapin, invoked indirectly through shmem mapping
> - swapoff
>
> They used to construct a pseudo vmfault struct for swapin function.
> Shmem has dropped the pseudo vmfault recently in commit ddc1a5cbc05d
> ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"). Swapoff
> path is still using one.
>
> Introduce a helper for them both, this help save stack usage for swapoff
> path, and help apply a unified swapin cache and readahead policy check.
>
> Due to missing vmfault info, the caller have to pass in mempolicy
> explicitly, make it different from swapin_entry and name it
> swapin_entry_mpol.
>
> This commit convert swapoff to use this helper, follow-up commits will
> convert shmem to use it too.
>
> Signed-off-by: Kairui Song <kasong@...cent.com>
> ---
>  mm/swap.h       |  9 +++++++++
>  mm/swap_state.c | 40 ++++++++++++++++++++++++++++++++--------
>  mm/swapfile.c   | 15 ++++++---------
>  3 files changed, 47 insertions(+), 17 deletions(-)
>
> diff --git a/mm/swap.h b/mm/swap.h
> index 9180411afcfe..8f790a67b948 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -73,6 +73,9 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
>  		struct mempolicy *mpol, pgoff_t ilx);
>  struct folio *swapin_entry(swp_entry_t entry, gfp_t flag,
>  			    struct vm_fault *vmf, enum swap_cache_result *result);
> +struct folio *swapin_entry_mpol(swp_entry_t entry, gfp_t gfp_mask,
> +				struct mempolicy *mpol, pgoff_t ilx,
> +				enum swap_cache_result *result);
>  
>  static inline unsigned int folio_swap_flags(struct folio *folio)
>  {
> @@ -109,6 +112,12 @@ static inline struct folio *swapin_entry(swp_entry_t swp, gfp_t gfp_mask,
>  	return NULL;
>  }
>  
> +static inline struct page *swapin_entry_mpol(swp_entry_t entry, gfp_t gfp_mask,
> +		struct mempolicy *mpol, pgoff_t ilx, enum swap_cache_result *result)
> +{
> +	return NULL;
> +}
> +
>  static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
>  {
>  	return 0;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 21badd4f0fc7..3edf4b63158d 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -880,14 +880,13 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
>   * in.
>   */
>  static struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
> -				  struct vm_fault *vmf, void *shadow)
> +				   struct mempolicy *mpol, pgoff_t ilx,
> +				   void *shadow)
>  {
> -	struct vm_area_struct *vma = vmf->vma;
>  	struct folio *folio;
>  
> -	/* skip swapcache */
> -	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
> -				vma, vmf->address, false);
> +	folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
> +			mpol, ilx, numa_node_id());
>  	if (folio) {
>  		if (mem_cgroup_swapin_charge_folio(folio, NULL,
>  						   GFP_KERNEL, entry)) {
> @@ -943,18 +942,18 @@ struct folio *swapin_entry(swp_entry_t entry, gfp_t gfp_mask,
>  		goto done;
>  	}
>  
> +	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
>  	if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
> -		folio = swapin_direct(entry, gfp_mask, vmf, shadow);
> +		folio = swapin_direct(entry, gfp_mask, mpol, ilx, shadow);
>  		cache_result = SWAP_CACHE_BYPASS;
>  	} else {
> -		mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
>  		if (swap_use_vma_readahead())
>  			folio = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
>  		else
>  			folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> -		mpol_cond_put(mpol);
>  		cache_result = SWAP_CACHE_MISS;
>  	}
> +	mpol_cond_put(mpol);
>  done:
>  	if (result)
>  		*result = cache_result;
> @@ -962,6 +961,31 @@ struct folio *swapin_entry(swp_entry_t entry, gfp_t gfp_mask,
>  	return folio;
>  }
>  
> +struct folio *swapin_entry_mpol(swp_entry_t entry, gfp_t gfp_mask,
> +				struct mempolicy *mpol, pgoff_t ilx,
> +				enum swap_cache_result *result)
> +{
> +	enum swap_cache_result cache_result;
> +	void *shadow = NULL;
> +	struct folio *folio;
> +
> +	folio = swap_cache_get_folio(entry, NULL, 0, &shadow);
> +	if (folio) {
> +		cache_result = SWAP_CACHE_HIT;
> +	} else if (swap_use_no_readahead(swp_swap_info(entry), entry)) {
> +		folio = swapin_direct(entry, gfp_mask, mpol, ilx, shadow);
> +		cache_result = SWAP_CACHE_BYPASS;
> +	} else {
> +		folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
> +		cache_result = SWAP_CACHE_MISS;
> +	}
> +
> +	if (result)
> +		*result = cache_result;
> +
> +	return folio;
> +}
> +
>  #ifdef CONFIG_SYSFS
>  static ssize_t vma_ra_enabled_show(struct kobject *kobj,
>  				     struct kobj_attribute *attr, char *buf)
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 5aa44de11edc..2f77bf143af8 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1840,18 +1840,13 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  	do {
>  		struct folio *folio;
>  		unsigned long offset;
> +		struct mempolicy *mpol;
>  		unsigned char swp_count;
>  		swp_entry_t entry;
> +		pgoff_t ilx;
>  		int ret;
>  		pte_t ptent;
>  
> -		struct vm_fault vmf = {
> -			.vma = vma,
> -			.address = addr,
> -			.real_address = addr,
> -			.pmd = pmd,
> -		};
> -
>  		if (!pte++) {
>  			pte = pte_offset_map(pmd, addr);
>  			if (!pte)
> @@ -1871,8 +1866,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  		pte_unmap(pte);
>  		pte = NULL;
>  
> -		folio = swapin_entry(entry, GFP_HIGHUSER_MOVABLE,
> -				     &vmf, NULL);
> +		mpol = get_vma_policy(vma, addr, 0, &ilx);
> +		folio = swapin_entry_mpol(entry, GFP_HIGHUSER_MOVABLE,
> +					  mpol, ilx, NULL);
> +		mpol_cond_put(mpol);
>  		if (!folio) {
>  			/*
>  			 * The entry could have been freed, and will not

IIUC, after this change, we will always use cluster readahead for
swapoff.  This may be OK, but we need at least some test results
showing that this behavior change does not cause any issues.  The
behavior change should also be described explicitly in the patch
description.

Also, I don't think it's a good abstraction for swapin_entry_mpol() to
always use cluster swapin while swapin_entry() tries to use VMA swapin.
I think we can add "struct mempolicy *mpol" and "pgoff_t ilx" to
swapin_entry() as parameters, and use them if vmf == NULL.  If we want
to enforce cluster swapin in the swapoff path, it would be better to add
some comments describing why.

--
Best Regards,
Huang, Ying