linux-kernel - Re: [PATCH v4 1/2] mm: store zero pages to be swapped out in a bitmap

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <ZmoBf6RPJzC2RaqM@google.com>
Date: Wed, 12 Jun 2024 20:13:51 +0000
From: Yosry Ahmed <yosryahmed@...gle.com>
To: Usama Arif <usamaarif642@...il.com>
Cc: akpm@...ux-foundation.org, hannes@...xchg.org, shakeel.butt@...ux.dev, 
	david@...hat.com, ying.huang@...el.com, hughd@...gle.com, willy@...radead.org, 
	nphamcs@...il.com, chengming.zhou@...ux.dev, linux-mm@...ck.org, 
	linux-kernel@...r.kernel.org, kernel-team@...a.com
Subject: Re: [PATCH v4 1/2] mm: store zero pages to be swapped out in a bitmap

On Wed, Jun 12, 2024 at 01:43:35PM +0100, Usama Arif wrote:
[..]

Hi Usama,

A few more comments/questions, sorry for not looking closely earlier.

> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index f1e559e216bd..48d8dca0b94b 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -453,6 +453,8 @@ static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
>  static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>  		unsigned int idx)
>  {
> +	unsigned int i;
> +
>  	/*
>  	 * If scan_swap_map_slots() can't find a free cluster, it will check
>  	 * si->swap_map directly. To make sure the discarding cluster isn't
> @@ -461,6 +463,13 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>  	 */
>  	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
> +	/*
> +	 * zeromap can see updates from concurrent swap_writepage() and swap_read_folio()
> +	 * call on other slots, hence use atomic clear_bit for zeromap instead of the
> +	 * non-atomic bitmap_clear.
> +	 */

I don't think this is accurate. swap_read_folio() does not update the
zeromap. I think the need for an atomic operation here is because we may
be updating adjacent bits simulatenously, so we may cause lost updates
otherwise (i.e. corrupting adjacent bits).

> +	for (i = 0; i < SWAPFILE_CLUSTER; i++)
> +		clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap);

Could you explain why we need to clear the zeromap here?

swap_cluster_schedule_discard() is called from:
- swap_free_cluster() -> free_cluster()

This is already covered below.

- swap_entry_free() -> dec_cluster_info_page() -> free_cluster()

Each entry in the cluster should have its zeromap bit cleared in
swap_entry_free() before the entire cluster is free and we call
free_cluster().

Am I missing something?

>  
>  	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
>  
> @@ -482,7 +491,7 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
>  static void swap_do_scheduled_discard(struct swap_info_struct *si)
>  {
>  	struct swap_cluster_info *info, *ci;
> -	unsigned int idx;
> +	unsigned int idx, i;
>  
>  	info = si->cluster_info;
>  
> @@ -498,6 +507,8 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si)
>  		__free_cluster(si, idx);
>  		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>  				0, SWAPFILE_CLUSTER);
> +		for (i = 0; i < SWAPFILE_CLUSTER; i++)
> +			clear_bit(idx * SWAPFILE_CLUSTER + i, si->zeromap);

Same here. I didn't look into the specific code paths, but shouldn't the
cluster be unused (and hence its zeromap bits already cleared?).

>  		unlock_cluster(ci);
>  	}
>  }
> @@ -1059,9 +1070,12 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
>  {
>  	unsigned long offset = idx * SWAPFILE_CLUSTER;
>  	struct swap_cluster_info *ci;
> +	unsigned int i;
>  
>  	ci = lock_cluster(si, offset);
>  	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
> +	for (i = 0; i < SWAPFILE_CLUSTER; i++)
> +		clear_bit(offset + i, si->zeromap);
>  	cluster_set_count_flag(ci, 0, 0);
>  	free_cluster(si, idx);
>  	unlock_cluster(ci);
> @@ -1336,6 +1350,7 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
>  	count = p->swap_map[offset];
>  	VM_BUG_ON(count != SWAP_HAS_CACHE);
>  	p->swap_map[offset] = 0;
> +	clear_bit(offset, p->zeromap);

I think instead of clearing the zeromap in swap_free_cluster() and here
separately, we can just do it in swap_range_free(). I suspect this may
be the only place we really need to clear the zero in the swapfile code.

>  	dec_cluster_info_page(p, p->cluster_info, offset);
>  	unlock_cluster(ci);
>  
> @@ -2597,6 +2612,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	free_percpu(p->cluster_next_cpu);
>  	p->cluster_next_cpu = NULL;
>  	vfree(swap_map);
> +	bitmap_free(p->zeromap);
>  	kvfree(cluster_info);
>  	/* Destroy swap account information */
>  	swap_cgroup_swapoff(p->type);
> @@ -3123,6 +3139,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  		goto bad_swap_unlock_inode;
>  	}
>  
> +	p->zeromap = bitmap_zalloc(maxpages, GFP_KERNEL);
> +	if (!p->zeromap) {
> +		error = -ENOMEM;
> +		goto bad_swap_unlock_inode;
> +	}
> +
>  	if (p->bdev && bdev_stable_writes(p->bdev))
>  		p->flags |= SWP_STABLE_WRITES;
>  
> -- 
> 2.43.0
>