Message-ID: <20230605152952.GA221380@cmpxchg.org>
Date:   Mon, 5 Jun 2023 11:29:52 -0400
From:   Johannes Weiner <hannes@...xchg.org>
To:     Domenico Cerasuolo <cerasuolodomenico@...il.com>
Cc:     vitaly.wool@...sulko.com, minchan@...nel.org,
        senozhatsky@...omium.org, yosryahmed@...gle.com,
        linux-mm@...ck.org, ddstreet@...e.org, sjenning@...hat.com,
        nphamcs@...il.com, akpm@...ux-foundation.org,
        linux-kernel@...r.kernel.org, kernel-team@...a.com
Subject: Re: [RFC PATCH 1/7] mm: zswap: add pool shrinking mechanism

Hi Domenico,

On Mon, Jun 05, 2023 at 10:54:13AM +0200, Domenico Cerasuolo wrote:
> @@ -364,6 +369,9 @@ static void zswap_free_entry(struct zswap_entry *entry)
>  	if (!entry->length)
>  		atomic_dec(&zswap_same_filled_pages);
>  	else {
> +		spin_lock(&entry->pool->lock);
> +		list_del_init(&entry->lru);
> +		spin_unlock(&entry->pool->lock);

This should be list_del(): the entry is freed right after this, so the
list head is never reused.

The slab memory is recycled, but the allocation site (commented on
below) doesn't need the list initialized.

However, I think it also needs to check !zpool_evictable(). If
alloc/store doesn't do the list_add(), this is a delete without a
matching add.
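
Something like this, perhaps (untested sketch, going from memory of the
surrounding code in zswap_free_entry()):

	else {
		/* zpool_evictable will be removed once all 3 backends have migrated */
		if (!zpool_evictable(entry->pool->zpool)) {
			spin_lock(&entry->pool->lock);
			list_del(&entry->lru);
			spin_unlock(&entry->pool->lock);
		}
		zpool_free(entry->pool->zpool, entry->handle);
		zswap_pool_put(entry->pool);
	}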

> @@ -584,14 +592,65 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>  	return NULL;
>  }
>  
> +static int zswap_shrink(struct zswap_pool *pool)
> +{
> +	struct zswap_entry *lru_entry, *tree_entry = NULL;
> +	struct zswap_header *zhdr;
> +	struct zswap_tree *tree;
> +	swp_entry_t swpentry;
> +	int ret;
> +
> +	/* get a reclaimable entry from LRU */
> +	spin_lock(&pool->lock);
> +	if (list_empty(&pool->lru)) {
> +		spin_unlock(&pool->lock);
> +		return -EINVAL;
> +	}
> +	lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> +	list_del_init(&lru_entry->lru);
> +	zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> +	tree = zswap_trees[swp_type(zhdr->swpentry)];
> +	zpool_unmap_handle(pool->zpool, lru_entry->handle);
> +	swpentry = zhdr->swpentry;
> +	spin_unlock(&pool->lock);

Once the pool lock is dropped, the lru_entry might get freed. But the
swpentry is copied to the stack, and lru_entry isn't deref'd again
until the entry is verified to still be alive in the tree.

This could use a comment.
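
E.g. something along these lines, above the tree lookup (wording up to
you, of course):

	/*
	 * Once pool->lock is dropped, lru_entry may be invalidated and
	 * freed. Don't dereference it again; work from the stack copy
	 * of swpentry until the tree lookup below confirms the entry
	 * is still alive and takes a reference on it.
	 */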

> +	/* hold a reference from tree so it won't be freed during writeback */
> +	spin_lock(&tree->lock);
> +	tree_entry = zswap_entry_find_get(&tree->rbroot, swp_offset(swpentry));
> +	if (tree_entry != lru_entry) {
> +		if (tree_entry)
> +			zswap_entry_put(tree, tree_entry);
> +		spin_unlock(&tree->lock);
> +		return -EAGAIN;
> +	}
> +	spin_unlock(&tree->lock);

It's pretty outrageous how much simpler this is compared to the
<backend>_reclaim_page() functions! The backends have to jump through
a lot of hoops to serialize against freeing, whereas zswap can simply
hold a reference. This is clearly a much better design.

> +	ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +
> +	spin_lock(&tree->lock);
> +	if (ret) {
> +		spin_lock(&pool->lock);
> +		list_move(&lru_entry->lru, &pool->lru);
> +		spin_unlock(&pool->lock);
> +	}
> +	zswap_entry_put(tree, tree_entry);
> +	spin_unlock(&tree->lock);
> +
> +	return ret ? -EAGAIN : 0;
> +}
> +
>  static void shrink_worker(struct work_struct *w)
>  {
>  	struct zswap_pool *pool = container_of(w, typeof(*pool),
>  						shrink_work);
>  	int ret, failures = 0;
>  
> +	/* zpool_evictable will be removed once all 3 backends have migrated*/

Missing space between text and */
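
i.e.

	/* zpool_evictable will be removed once all 3 backends have migrated */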

>  	do {
> -		ret = zpool_shrink(pool->zpool, 1, NULL);
> +		if (zpool_evictable(pool->zpool))
> +			ret = zpool_shrink(pool->zpool, 1, NULL);
> +		else
> +			ret = zswap_shrink(pool);
>  		if (ret) {
>  			zswap_reject_reclaim_fail++;
>  			if (ret != -EAGAIN)
> @@ -655,6 +714,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  	 */
>  	kref_init(&pool->kref);
>  	INIT_LIST_HEAD(&pool->list);
> +	INIT_LIST_HEAD(&pool->lru);
> +	spin_lock_init(&pool->lock);
>  	INIT_WORK(&pool->shrink_work, shrink_worker);
>  
>  	zswap_pool_debug("created", pool);
> @@ -1270,7 +1331,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>  	}
>  
>  	/* store */
> -	hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> +	hlen = sizeof(zhdr);
>  	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
>  	if (zpool_malloc_support_movable(entry->pool->zpool))
>  		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> @@ -1313,6 +1374,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>  			zswap_entry_put(tree, dupentry);
>  		}
>  	} while (ret == -EEXIST);
> +	INIT_LIST_HEAD(&entry->lru);

The list_add() below initializes the entry's list head, so this
shouldn't be needed; see the sketch after the quoted hunk.

> +	/* zpool_evictable will be removed once all 3 backends have migrated*/
> +	if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> +		spin_lock(&entry->pool->lock);
> +		list_add(&entry->lru, &entry->pool->lru);
> +		spin_unlock(&entry->pool->lock);
> +	}
>  	spin_unlock(&tree->lock);
>  
>  	/* update stats */
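
With the INIT_LIST_HEAD() dropped and the comment spacing fixed, the
tail of the store path would then look something like this (untested,
just to illustrate):

	} while (ret == -EEXIST);

	/* zpool_evictable will be removed once all 3 backends have migrated */
	if (entry->length && !zpool_evictable(entry->pool->zpool)) {
		spin_lock(&entry->pool->lock);
		list_add(&entry->lru, &entry->pool->lru);
		spin_unlock(&entry->pool->lock);
	}
	spin_unlock(&tree->lock);

	/* update stats */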
