Message-ID: <84d74937-30ed-d0fe-c7cd-a813f61cbb96@yandex-team.ru>
Date: Tue, 9 Apr 2019 10:50:45 +0300
From: Konstantin Khlebnikov <khlebnikov@...dex-team.ru>
To: Hugh Dickins <hughd@...gle.com>,
Andrew Morton <akpm@...ux-foundation.org>
Cc: "Alex Xu (Hello71)" <alex_y_xu@...oo.ca>,
Vineeth Pillai <vpillai@...italocean.com>,
Kelley Nielsen <kelleynnn@...il.com>,
Rik van Riel <riel@...riel.com>,
Huang Ying <ying.huang@...el.com>,
linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH 4/4] mm: swapoff: shmem_unuse() stop eviction without
igrab()
On 08.04.2019 23:01, Hugh Dickins wrote:
> The igrab() in shmem_unuse() looks good, but we forgot that it gives no
> protection against concurrent unmounting: a point made by Konstantin
> Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78
> ("tmpfs: fix race between umount and swapoff"). The current 5.1-rc
> swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs.
> Self-destruct in 5 seconds. Have a nice day..." followed by GPF.
>
> Once again, give up on using igrab(); but don't go back to making such
> heavy-handed use of shmem_swaplist_mutex as last time: that would spoil
> the new design, and I expect could deadlock inside shmem_swapin_page().
>
> Instead, shmem_unuse() just raises a "stop_eviction" count in the shmem-
> specific inode, and shmem_evict_inode() waits for that to go down to 0.
> Call it "stop_eviction" rather than "swapoff_busy" because it can be
> put to use for others later (huge tmpfs patches expect to use it).
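
Side note for anyone not familiar with wait_var_event(): as I read it, the
contract is just a plain hold count, roughly (simplified sketch of the two
sides, not the patch itself):

	/* holder side (shmem_unuse): count raised under shmem_swaplist_mutex */
	atomic_inc(&info->stop_eviction);
	/* ... mutex dropped, inode scanned for swap, mutex retaken ... */
	if (atomic_dec_and_test(&info->stop_eviction))
		wake_up_var(&info->stop_eviction);

	/* evict side (shmem_evict_inode): wait for all holders to drop out */
	wait_var_event(&info->stop_eviction,
		       !atomic_read(&info->stop_eviction));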
>
> That simplifies shmem_unuse(), protecting it from both unlink and unmount;
> and in practice lets it locate all the swap in its first try. But do not
> rely on that: there's still a theoretical case, when shmem_writepage()
> might have been preempted after its get_swap_page(), before making the
> swap entry visible to swapoff.
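
For reference, the window being described is roughly this interleaving
(simplified, not actual code):

    shmem_writepage()                     swapoff -> try_to_unuse()
    -----------------                     -------------------------
    swap = get_swap_page(page)
    <preempted>                           shmem_unuse() finds no trace of
                                          that entry yet, but
                                          si->inuse_pages != 0, so it
                                          simply retries
    makes the entry visible
    (swaplist / swap cache)               next pass finds and unuses it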
>
> Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity")
> Signed-off-by: Hugh Dickins <hughd@...gle.com>
> ---
>
> include/linux/shmem_fs.h | 1 +
> mm/shmem.c | 39 ++++++++++++++++++---------------------
> mm/swapfile.c | 11 +++++------
> 3 files changed, 24 insertions(+), 27 deletions(-)
>
> --- 5.1-rc4/include/linux/shmem_fs.h 2019-03-17 16:18:15.181820820 -0700
> +++ linux/include/linux/shmem_fs.h 2019-04-07 19:18:43.248639711 -0700
> @@ -21,6 +21,7 @@ struct shmem_inode_info {
> struct list_head swaplist; /* chain of maybes on swap */
> struct shared_policy policy; /* NUMA memory alloc policy */
> struct simple_xattrs xattrs; /* list of xattrs */
> + atomic_t stop_eviction; /* hold when working on inode */
> struct inode vfs_inode;
> };
>
> --- 5.1-rc4/mm/shmem.c 2019-04-07 19:12:23.603858531 -0700
> +++ linux/mm/shmem.c 2019-04-07 19:18:43.248639711 -0700
> @@ -1081,9 +1081,15 @@ static void shmem_evict_inode(struct ino
> }
> spin_unlock(&sbinfo->shrinklist_lock);
> }
> - if (!list_empty(&info->swaplist)) {
> + while (!list_empty(&info->swaplist)) {
> + /* Wait while shmem_unuse() is scanning this inode... */
> + wait_var_event(&info->stop_eviction,
> + !atomic_read(&info->stop_eviction));
> mutex_lock(&shmem_swaplist_mutex);
> list_del_init(&info->swaplist);
Obviously, the unconditional list_del_init() above should have been deleted
(turned into a '-' line): it is superseded by the conditional delete added
just below, and leaving it in would drop the inode from the swaplist
regardless of stop_eviction.
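
With that line gone, I assume the loop was meant to end up as:

	while (!list_empty(&info->swaplist)) {
		/* Wait while shmem_unuse() is scanning this inode... */
		wait_var_event(&info->stop_eviction,
			       !atomic_read(&info->stop_eviction));
		mutex_lock(&shmem_swaplist_mutex);
		/* ...but beware of the race if we peeked too early */
		if (!atomic_read(&info->stop_eviction))
			list_del_init(&info->swaplist);
		mutex_unlock(&shmem_swaplist_mutex);
	}
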
> + /* ...but beware of the race if we peeked too early */
> + if (!atomic_read(&info->stop_eviction))
> + list_del_init(&info->swaplist);
> mutex_unlock(&shmem_swaplist_mutex);
> }
> }
> @@ -1227,36 +1233,27 @@ int shmem_unuse(unsigned int type, bool
> unsigned long *fs_pages_to_unuse)
> {
> struct shmem_inode_info *info, *next;
> - struct inode *inode;
> - struct inode *prev_inode = NULL;
> int error = 0;
>
> if (list_empty(&shmem_swaplist))
> return 0;
>
> mutex_lock(&shmem_swaplist_mutex);
> -
> - /*
> - * The extra refcount on the inode is necessary to safely dereference
> - * p->next after re-acquiring the lock. New shmem inodes with swap
> - * get added to the end of the list and we will scan them all.
> - */
> list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
> if (!info->swapped) {
> list_del_init(&info->swaplist);
> continue;
> }
> -
> - inode = igrab(&info->vfs_inode);
> - if (!inode)
> - continue;
> -
> + /*
> + * Drop the swaplist mutex while searching the inode for swap;
> + * but before doing so, make sure shmem_evict_inode() will not
> + * remove placeholder inode from swaplist, nor let it be freed
> + * (igrab() would protect from unlink, but not from unmount).
> + */
> + atomic_inc(&info->stop_eviction);
> mutex_unlock(&shmem_swaplist_mutex);
> - if (prev_inode)
> - iput(prev_inode);
> - prev_inode = inode;
>
> - error = shmem_unuse_inode(inode, type, frontswap,
> + error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
> fs_pages_to_unuse);
> cond_resched();
>
> @@ -1264,14 +1261,13 @@ int shmem_unuse(unsigned int type, bool
> next = list_next_entry(info, swaplist);
> if (!info->swapped)
> list_del_init(&info->swaplist);
> + if (atomic_dec_and_test(&info->stop_eviction))
> + wake_up_var(&info->stop_eviction);
> if (error)
> break;
> }
> mutex_unlock(&shmem_swaplist_mutex);
>
> - if (prev_inode)
> - iput(prev_inode);
> -
> return error;
> }
>
> @@ -2238,6 +2234,7 @@ static struct inode *shmem_get_inode(str
> info = SHMEM_I(inode);
> memset(info, 0, (char *)inode - (char *)info);
> spin_lock_init(&info->lock);
> + atomic_set(&info->stop_eviction, 0);
> info->seals = F_SEAL_SEAL;
> info->flags = flags & VM_NORESERVE;
> INIT_LIST_HEAD(&info->shrinklist);
> --- 5.1-rc4/mm/swapfile.c 2019-04-07 19:17:13.291957539 -0700
> +++ linux/mm/swapfile.c 2019-04-07 19:18:43.248639711 -0700
> @@ -2116,12 +2116,11 @@ retry:
> * Under global memory pressure, swap entries can be reinserted back
> * into process space after the mmlist loop above passes over them.
> *
> - * Limit the number of retries? No: when shmem_unuse()'s igrab() fails,
> - * a shmem inode using swap is being evicted; and when mmget_not_zero()
> - * above fails, that mm is likely to be freeing swap from exit_mmap().
> - * Both proceed at their own independent pace: we could move them to
> - * separate lists, and wait for those lists to be emptied; but it's
> - * easier and more robust (though cpu-intensive) just to keep retrying.
> + * Limit the number of retries? No: when mmget_not_zero() above fails,
> + * that mm is likely to be freeing swap from exit_mmap(), which proceeds
> + * at its own independent pace; and even shmem_writepage() could have
> + * been preempted after get_swap_page(), temporarily hiding that swap.
> + * It's easy and robust (though cpu-intensive) just to keep retrying.
> */
> if (si->inuse_pages) {
> if (!signal_pending(current))
>