Message-ID: <ZdkXpcAB6xWj2geh@google.com>
Date: Fri, 23 Feb 2024 14:09:41 -0800
From: Minchan Kim <minchan@...nel.org>
To: Barry Song <21cnbao@...il.com>
Cc: sj@...nel.org, akpm@...ux-foundation.org, damon@...ts.linux.dev,
	linux-mm@...ck.org, linux-kernel@...r.kernel.org, mhocko@...e.com,
	hannes@...xchg.org, Barry Song <v-songbaohua@...o.com>
Subject: Re: [PATCH RFC] mm: madvise: pageout: ignore references rather than
 clearing young

Hi Barry,

On Fri, Feb 23, 2024 at 05:15:50PM +1300, Barry Song wrote:
> From: Barry Song <v-songbaohua@...o.com>
> 
> While doing MADV_PAGEOUT, the current code clears the PTE young bit
> so that vmscan won't see the young flag, allowing the reclamation
> of madvised folios to go ahead.

Isn't it good to accelerate reclaiming? vmscan checks whether the
page was accessed recently via the young bit in the PTE and, if it
was, it doesn't reclaim the page. Since we have cleared the young bit
in the PTE in madvise_pageout, vmscan is likely to reclaim the page
since it wouldn't see referenced_ptes from folio_check_references.
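
To spell out what I mean, here is a tiny userspace model of that decision
(purely illustrative, not the kernel code; check_references_model and the
enum are made-up stand-ins for folio_referenced()/folio_check_references()
in mm/vmscan.c):

#include <stdio.h>

enum ref_result { REF_RECLAIM, REF_ACTIVATE };

/* referenced_ptes stands in for the count folio_referenced() returns after
 * walking the rmap and test-and-clearing the young bit in each mapping PTE. */
static enum ref_result check_references_model(int referenced_ptes)
{
	if (referenced_ptes)
		return REF_ACTIVATE;	/* still referenced: keep the folio */
	return REF_RECLAIM;		/* no young PTEs seen: reclaim it */
}

int main(void)
{
	/* madvise_pageout() clears the young bit first, so vmscan then sees
	 * no referenced PTEs and goes ahead with the reclaim. */
	printf("%s\n", check_references_model(0) == REF_RECLAIM ?
			"reclaim" : "activate");
	return 0;
}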

Could you clarify if I'm missing something here?


> It seems we can do this by directly ignoring references, which lets
> us remove the TLB flush in madvise and the rmap overhead in vmscan.
> 
> Regarding the side effect: with the original code, if a parallel
> thread accesses the madvised memory while another thread is doing
> the madvise, the folios get a chance to be re-activated by vmscan.
> With the patch, they will be reclaimed anyway. But doing PAGEOUT
> and accessing the same memory at the same time is a rather silly,
> DoS-like behaviour, so we probably don't need to care.
> 
> The microbenchmark below shows about a 6% reduction in the latency of
> MADV_PAGEOUT:
> 
>  #include <sys/mman.h>
> 
>  #define PGSIZE 4096
>  int main(void)
>  {
>  	int i;
>  #define SIZE 512*1024*1024
>  	volatile long *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
>  			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> 
>  	for (i = 0; i < SIZE/sizeof(long); i += PGSIZE / sizeof(long))
>  		p[i] = 0x11;
> 
>  	madvise((void *)p, SIZE, MADV_PAGEOUT);
>  	return 0;
>  }
> 
> w/o patch                    w/ patch
> root@10:~# time ./a.out      root@10:~# time ./a.out
> real	0m49.634s            real   0m46.334s
> user	0m0.637s             user   0m0.648s
> sys	0m47.434s            sys    0m44.265s
> 
> Signed-off-by: Barry Song <v-songbaohua@...o.com>
> ---
>  mm/damon/paddr.c |  2 +-
>  mm/internal.h    |  2 +-
>  mm/madvise.c     |  8 ++++----
>  mm/vmscan.c      | 12 +++++++-----
>  4 files changed, 13 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
> index 081e2a325778..5e6dc312072c 100644
> --- a/mm/damon/paddr.c
> +++ b/mm/damon/paddr.c
> @@ -249,7 +249,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
>  put_folio:
>  		folio_put(folio);
>  	}
> -	applied = reclaim_pages(&folio_list);
> +	applied = reclaim_pages(&folio_list, false);
>  	cond_resched();
>  	return applied * PAGE_SIZE;
>  }
> diff --git a/mm/internal.h b/mm/internal.h
> index 93e229112045..36c11ea41f47 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -868,7 +868,7 @@ extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
>          unsigned long, unsigned long);
>  
>  extern void set_pageblock_order(void);
> -unsigned long reclaim_pages(struct list_head *folio_list);
> +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references);
>  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>  					    struct list_head *folio_list);
>  /* The ALLOC_WMARK bits are used as an index to zone->watermark */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index abde3edb04f0..44a498c94158 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -386,7 +386,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  			return 0;
>  		}
>  
> -		if (pmd_young(orig_pmd)) {
> +		if (!pageout && pmd_young(orig_pmd)) {
>  			pmdp_invalidate(vma, addr, pmd);
>  			orig_pmd = pmd_mkold(orig_pmd);
>  
> @@ -410,7 +410,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  huge_unlock:
>  		spin_unlock(ptl);
>  		if (pageout)
> -			reclaim_pages(&folio_list);
> +			reclaim_pages(&folio_list, true);
>  		return 0;
>  	}
>  
> @@ -490,7 +490,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  
>  		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
>  
> -		if (pte_young(ptent)) {
> +		if (!pageout && pte_young(ptent)) {
>  			ptent = ptep_get_and_clear_full(mm, addr, pte,
>  							tlb->fullmm);
>  			ptent = pte_mkold(ptent);
> @@ -524,7 +524,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
>  		pte_unmap_unlock(start_pte, ptl);
>  	}
>  	if (pageout)
> -		reclaim_pages(&folio_list);
> +		reclaim_pages(&folio_list, true);
>  	cond_resched();
>  
>  	return 0;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 402c290fbf5a..ba2f37f46a73 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2102,7 +2102,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  }
>  
>  static unsigned int reclaim_folio_list(struct list_head *folio_list,
> -				      struct pglist_data *pgdat)
> +				      struct pglist_data *pgdat,
> +				      bool ignore_references)
>  {
>  	struct reclaim_stat dummy_stat;
>  	unsigned int nr_reclaimed;
> @@ -2115,7 +2116,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
>  		.no_demotion = 1,
>  	};
>  
> -	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
> +	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
>  	while (!list_empty(folio_list)) {
>  		folio = lru_to_folio(folio_list);
>  		list_del(&folio->lru);
> @@ -2125,7 +2126,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
>  	return nr_reclaimed;
>  }
>  
> -unsigned long reclaim_pages(struct list_head *folio_list)
> +unsigned long reclaim_pages(struct list_head *folio_list, bool ignore_references)
>  {
>  	int nid;
>  	unsigned int nr_reclaimed = 0;
> @@ -2147,11 +2148,12 @@ unsigned long reclaim_pages(struct list_head *folio_list)
>  			continue;
>  		}
>  
> -		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
> +		nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid),
> +						   ignore_references);
>  		nid = folio_nid(lru_to_folio(folio_list));
>  	} while (!list_empty(folio_list));
>  
> -	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
> +	nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid), ignore_references);
>  
>  	memalloc_noreclaim_restore(noreclaim_flag);
>  
> -- 
> 2.34.1
> 
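
As an aside, the racy scenario described in the commit message can be sketched
from userspace along these lines (purely illustrative and not from the patch;
the region size, touch pattern, and thread structure are made up):

#include <pthread.h>
#include <stddef.h>
#include <sys/mman.h>

#define PGSIZE 4096
#define SIZE (64UL * 1024 * 1024)

static long *region;
static volatile int done;

static void *toucher(void *arg)
{
	size_t i;

	(void)arg;
	while (!done)		/* keep setting the young bit on the PTEs */
		for (i = 0; i < SIZE / sizeof(long); i += PGSIZE / sizeof(long))
			region[i]++;
	return NULL;
}

int main(void)
{
	pthread_t t;

	region = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
		return 1;

	pthread_create(&t, NULL, toucher, NULL);

	/* Before the patch, the toucher's accesses can re-activate folios in
	 * vmscan; with the patch, MADV_PAGEOUT reclaims them regardless. */
	madvise(region, SIZE, MADV_PAGEOUT);

	done = 1;
	pthread_join(t, NULL);
	return 0;
}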
