linux-kernel - Re: [PATCH v2 9/9] mm/swap, shmem: use new swapin helper to skip readahead conditionally

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <871qar9sb2.fsf@yhuang6-desk2.ccr.corp.intel.com>
Date: Tue, 09 Jan 2024 10:03:29 +0800
From: "Huang, Ying" <ying.huang@...el.com>
To: Kairui Song <ryncsn@...il.com>
Cc: linux-mm@...ck.org,  Kairui Song <kasong@...cent.com>,  Andrew Morton
 <akpm@...ux-foundation.org>,  Chris Li <chrisl@...nel.org>,  Hugh Dickins
 <hughd@...gle.com>,  Johannes Weiner <hannes@...xchg.org>,  Matthew Wilcox
 <willy@...radead.org>,  Michal Hocko <mhocko@...e.com>,  Yosry Ahmed
 <yosryahmed@...gle.com>,  David Hildenbrand <david@...hat.com>,
  linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 9/9] mm/swap, shmem: use new swapin helper to skip
 readahead conditionally

Kairui Song <ryncsn@...il.com> writes:

> From: Kairui Song <kasong@...cent.com>
>
> Currently, shmem uses cluster readahead for all swap backends. Cluster
> readahead is not a good solution for ramdisk based device (ZRAM) at all.
>
> After switching to the new helper, most benchmarks showed a good result:
>
> - Single file sequence read:
>   perf stat --repeat 20 dd if=/tmpfs/test of=/dev/null bs=1M count=8192
>   (/tmpfs/test is a zero filled file, using brd as swap, 4G memcg limit)
>   Before: 22.248 +- 0.549
>   After:  22.021 +- 0.684 (-1.1%)
>
> - Random read stress test:
>   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>   --size=256m --ioengine=mmap --rw=randread --random_distribution=random \
>   --time_based --ramp_time=1m --runtime=5m --group_reporting
>   (using brd as swap, 2G memcg limit)
>
>   Before: 1818MiB/s
>   After:  1888MiB/s (+3.85%)
>
> - Zipf biased random read stress test:
>   fio -name=tmpfs --numjobs=16 --directory=/tmpfs \
>   --size=256m --ioengine=mmap --rw=randread --random_distribution=zipf:1.2 \
>   --time_based --ramp_time=1m --runtime=5m --group_reporting
>   (using brd as swap, 2G memcg limit)
>
>   Before: 31.1GiB/s
>   After:  32.3GiB/s (+3.86%)
>
> So cluster readahead doesn't help much even for single sequence read,
> and for random stress test, the performance is better without it.
>
> Considering both memory and swap device will get more fragmented
> slowly, and commonly used ZRAM consumes much more CPU than plain
> ramdisk, false readahead could occur more frequently and waste
> more CPU. Direct SWAP is cheaper, so use the new helper and skip
> read ahead for SWP_SYNCHRONOUS_IO device.

It's good to take advantage of swap_direct (no readahead).  I also hopes
we can take advantage of VMA based swapin if shmem is accessed via mmap.
That appears possible.

--
Best Regards,
Huang, Ying

> Signed-off-by: Kairui Song <kasong@...cent.com>
> ---
>  mm/shmem.c      | 67 +++++++++++++++++++++++++------------------------
>  mm/swap.h       |  9 -------
>  mm/swap_state.c | 11 ++++++--
>  3 files changed, 43 insertions(+), 44 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9da9f7a0e620..3c0729fe934d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1564,20 +1564,6 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
>  static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
>  			pgoff_t index, unsigned int order, pgoff_t *ilx);
>  
> -static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
> -			struct shmem_inode_info *info, pgoff_t index)
> -{
> -	struct mempolicy *mpol;
> -	pgoff_t ilx;
> -	struct folio *folio;
> -
> -	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> -	folio = swap_cluster_readahead(swap, gfp, mpol, ilx);
> -	mpol_cond_put(mpol);
> -
> -	return folio;
> -}
> -
>  /*
>   * Make sure huge_gfp is always more limited than limit_gfp.
>   * Some of the flags set permissions, while others set limitations.
> @@ -1851,9 +1837,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  {
>  	struct address_space *mapping = inode->i_mapping;
>  	struct shmem_inode_info *info = SHMEM_I(inode);
> +	enum swap_cache_result cache_result;
>  	struct swap_info_struct *si;
>  	struct folio *folio = NULL;
> +	struct mempolicy *mpol;
>  	swp_entry_t swap;
> +	pgoff_t ilx;
>  	int error;
>  
>  	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
> @@ -1871,36 +1860,40 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  			return -EINVAL;
>  	}
>  
> -	/* Look it up and read it in.. */
> -	folio = swap_cache_get_folio(swap, NULL, 0, NULL);
> +	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
> +	folio = swapin_entry_mpol(swap, gfp, mpol, ilx, &cache_result);
> +	mpol_cond_put(mpol);
> +
>  	if (!folio) {
> -		/* Or update major stats only when swapin succeeds?? */
> +		error = -ENOMEM;
> +		goto failed;
> +	}
> +	if (cache_result != SWAP_CACHE_HIT) {
>  		if (fault_type) {
>  			*fault_type |= VM_FAULT_MAJOR;
>  			count_vm_event(PGMAJFAULT);
>  			count_memcg_event_mm(fault_mm, PGMAJFAULT);
>  		}
> -		/* Here we actually start the io */
> -		folio = shmem_swapin_cluster(swap, gfp, info, index);
> -		if (!folio) {
> -			error = -ENOMEM;
> -			goto failed;
> -		}
>  	}
>  
>  	/* We have to do this with folio locked to prevent races */
>  	folio_lock(folio);
> -	if (!folio_test_swapcache(folio) ||
> -	    folio->swap.val != swap.val ||
> -	    !shmem_confirm_swap(mapping, index, swap)) {
> +	if (cache_result != SWAP_CACHE_BYPASS) {
> +		/* With cache bypass, folio is new allocated, sync, and not in cache */
> +		if (!folio_test_swapcache(folio) || folio->swap.val != swap.val) {
> +			error = -EEXIST;
> +			goto unlock;
> +		}
> +		if (!folio_test_uptodate(folio)) {
> +			error = -EIO;
> +			goto failed;
> +		}
> +		folio_wait_writeback(folio);
> +	}
> +	if (!shmem_confirm_swap(mapping, index, swap)) {
>  		error = -EEXIST;
>  		goto unlock;
>  	}
> -	if (!folio_test_uptodate(folio)) {
> -		error = -EIO;
> -		goto failed;
> -	}
> -	folio_wait_writeback(folio);
>  
>  	/*
>  	 * Some architectures may have to restore extra metadata to the
> @@ -1908,12 +1901,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	 */
>  	arch_swap_restore(swap, folio);
>  
> -	if (shmem_should_replace_folio(folio, gfp)) {
> +	/* With cache bypass, folio is new allocated and always respect gfp flags */
> +	if (cache_result != SWAP_CACHE_BYPASS && shmem_should_replace_folio(folio, gfp)) {
>  		error = shmem_replace_folio(&folio, gfp, info, index);
>  		if (error)
>  			goto failed;
>  	}
>  
> +	/*
> +	 * The expected value checking below should be enough to ensure
> +	 * only one up-to-date swapin success. swap_free() is called after
> +	 * this, so the entry can't be reused. As long as the mapping still
> +	 * has the old entry value, it's never swapped in or modified.
> +	 */
>  	error = shmem_add_to_page_cache(folio, mapping, index,
>  					swp_to_radix_entry(swap), gfp);
>  	if (error)
> @@ -1924,7 +1924,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
>  	if (sgp == SGP_WRITE)
>  		folio_mark_accessed(folio);
>  
> -	delete_from_swap_cache(folio);
> +	if (cache_result != SWAP_CACHE_BYPASS)
> +		delete_from_swap_cache(folio);
>  	folio_mark_dirty(folio);
>  	swap_free(swap);
>  	put_swap_device(si);
> diff --git a/mm/swap.h b/mm/swap.h
> index 8f790a67b948..20f4048c971c 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -57,9 +57,6 @@ void __delete_from_swap_cache(struct folio *folio,
>  void delete_from_swap_cache(struct folio *folio);
>  void clear_shadow_from_swap_cache(int type, unsigned long begin,
>  				  unsigned long end);
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> -		struct vm_area_struct *vma, unsigned long addr,
> -		void **shadowp);
>  struct folio *filemap_get_incore_folio(struct address_space *mapping,
>  		pgoff_t index);
>  
> @@ -123,12 +120,6 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
>  	return 0;
>  }
>  
> -static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
> -		struct vm_area_struct *vma, unsigned long addr)
> -{
> -	return NULL;
> -}
> -
>  static inline
>  struct folio *filemap_get_incore_folio(struct address_space *mapping,
>  		pgoff_t index)
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 3edf4b63158d..10eec68475dd 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -318,7 +318,14 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
>  
>  static inline bool swap_use_no_readahead(struct swap_info_struct *si, swp_entry_t entry)
>  {
> -	return data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1;
> +	int count;
> +
> +	if (!data_race(si->flags & SWP_SYNCHRONOUS_IO))
> +		return false;
> +
> +	count = __swap_count(entry);
> +
> +	return (count == 1 || count == SWAP_MAP_SHMEM);
>  }
>  
>  static inline bool swap_use_vma_readahead(void)
> @@ -334,7 +341,7 @@ static inline bool swap_use_vma_readahead(void)
>   *
>   * Caller must lock the swap device or hold a reference to keep it valid.
>   */
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> +static struct folio *swap_cache_get_folio(swp_entry_t entry,
>  		struct vm_area_struct *vma, unsigned long addr, void **shadowp)
>  {
>  	struct folio *folio;