[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <bc2c6314-7c94-8096-19f0-0c11882479e4@huawei.com>
Date: Wed, 23 Nov 2022 16:25:00 +0800
From: Chen Wandun <chenwandun@...wei.com>
To: Alexey Romanov <avromanov@...rdevices.ru>, <minchan@...nel.org>,
<senozhatsky@...omium.org>, <ngupta@...are.org>,
<akpm@...ux-foundation.org>
CC: <linux-mm@...ck.org>, <linux-kernel@...r.kernel.org>,
<kernel@...rdevices.ru>, <ddrokosov@...rdevices.ru>
Subject: Re: [RFC PATCH v1 1/4] zram: introduce merge identical pages
mechanism
在 2022/11/22 3:00, Alexey Romanov 写道:
> zram maps each page (struct page) to a zsmalloc object that
> contains a compressed buffer of that page's data. This fact
> generates data redundancy: for example, two identical pages
> will be stored in compressed form in zsmalloc allocator twice.
>
> This patch adds a mechanism to scan zram_table_entry array
> and frees all identical objects in zsmalloc allocator, leaving
> only one. All zram_table_entry elements which reference this
> freed objects now refer to the same, not freed, object.
>
> To implement this mechanism, we sequentially scan the
> zram_table_entry array, counting the hash from the contents of
> the compressed pages (zsmalloc objects) and enter the index of
> the object into the hash table (hlist_head). If the hash matches,
> we remove the identical zsmalloc (zs_free) objects and update
> the link rbtree.
>
> Also, we can't just call zs_free() function during zram_free_page().
> Before calling this function we have to make sure that no one else
> refers to this zsmalloc object.
>
> To implement the mechanism for merging identical compressed
> pages (zsmalloc objects), a rbtree is needed.
>
> The tree node key is a reference to the zsmalloc object
> (handle), and the value is the number of references to
> this object (atomic counter). This is necessary for data
> consistency so that we do not zs_free the object referenced
> by any element of the zram_table_entry array.
>
> Signed-off-by: Alexey Romanov <avromanov@...rdevices.ru>
> ---
> drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++-
> drivers/block/zram/zram_drv.h | 6 +
> 2 files changed, 283 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index e290d6d97047..716c2f72805e 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -33,12 +33,15 @@
> #include <linux/debugfs.h>
> #include <linux/cpuhotplug.h>
> #include <linux/part_stat.h>
> +#include <linux/hashtable.h>
> +#include <linux/xxhash.h>
>
> #include "zram_drv.h"
>
> static DEFINE_IDR(zram_index_idr);
> /* idr index must be protected */
> static DEFINE_MUTEX(zram_index_mutex);
> +static DEFINE_MUTEX(zram_rbtree_mutex);
>
> static int zram_major;
> static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
> @@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index);
> static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
> u32 index, int offset, struct bio *bio);
>
> +struct zram_rbtree_node {
> + struct rb_node node;
> + unsigned long key;
> + unsigned long cnt;
> +};
> +
> +struct zram_hash_node {
> + unsigned long index;
> + struct hlist_node next;
> +};
>
> static int zram_slot_trylock(struct zram *zram, u32 index)
> {
> @@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat);
> #endif
> static DEVICE_ATTR_RO(debug_stat);
>
> +static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data)
> +{
> + struct rb_node **new = &(root->rb_node), *parent = NULL;
> + struct zram_rbtree_node *this;
> +
> + while (*new) {
> + this = rb_entry(*new, struct zram_rbtree_node, node);
> + parent = *new;
> + if (data->key < this->key)
> + new = &((*new)->rb_left);
> + else if (data->key > this->key)
> + new = &((*new)->rb_right);
> + else
> + return false;
> + }
> +
> + rb_link_node(&data->node, parent, new);
> + rb_insert_color(&data->node, root);
> + return true;
> +}
> +
> +static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root,
> + unsigned long key)
> +{
> + struct rb_node *node = root->rb_node;
> + struct zram_rbtree_node *data;
> +
> + while (node) {
> + data = rb_entry(node, struct zram_rbtree_node, node);
> + if (key < data->key)
> + node = node->rb_left;
> + else if (key > data->key)
> + node = node->rb_right;
> + else
> + return data;
> + }
> +
> + return NULL;
> +}
> +
> +static unsigned long zram_calc_hash(void *src, size_t len)
> +{
> + return xxhash(src, len, 0);
> +}
> +
> +static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable,
> + size_t htable_size, size_t index)
> +{
> + struct zram_rbtree_node *rb_node;
> + struct zram_hash_node *node;
> + unsigned long handle, cur_handle;
> + size_t obj_size;
> + char *src, *buf;
> + unsigned long hash;
> + int ret = 0;
> +
> + handle = zram_get_handle(zram, index);
> + if (!handle)
> + return ret;
> +
> + obj_size = zram_get_obj_size(zram, index);
> + buf = kmalloc(obj_size, GFP_KERNEL);
> + if (!buf) {
> + pr_err("Failed to allocate zs_map_object buffer\n");
> + return -ENOMEM;
> + }
> +
> + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
> + memcpy(buf, src, obj_size);
> + zs_unmap_object(zram->mem_pool, handle);
> + hash = zram_calc_hash(buf, obj_size);
> +
> + mutex_lock(&zram_rbtree_mutex);
> + hlist_for_each_entry(node, &htable[hash % htable_size], next) {
> + int cmp;
> +
> + zram_slot_lock(zram, node->index);
> +
> + /*
> + * Page may change as the hash table is being formed,
> + * so the checks below are necessary.
> + */
> + cur_handle = zram_get_handle(zram, node->index);
> + if (handle == cur_handle ||
> + obj_size != zram_get_obj_size(zram, node->index)) {
> + zram_slot_unlock(zram, node->index);
> + continue;
> + }
> +
> + src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO);
> + cmp = memcmp(buf, src, obj_size);
> + zs_unmap_object(zram->mem_pool, cur_handle);
> +
> + if (!cmp) {
> + rb_node = zram_rbtree_search(&zram->sph_rbtree, handle);
> +
> + /*
> + * This check is necessary in order not to zs_free an object
> + * that someone already refers to. This situation is possible
> + * when with repeated calls to zram_do_scan(). For example:
> + *
> + * [slot0] [slot1] [slot2] [slot3] [slot4]
> + * [obj0] [obj1] [obj2] [obj3] [obj4]
> + *
> + * Let's imagine that obj2 and obj3 are equal, and we called
> + * zram_do_scan() function:
> + *
> + * [slot0] [slot1] [slot2] [slot3] [slot4]
> + * [obj0] [obj1] [obj2] [obj2] [obj4]
> + *
> + * Now, slot2 and slot3 refers to obj2 zsmalloc object.
> + * Time passed, now slot0 refres to obj0_n, which is equal
> + * to obj2:
> + *
> + * [slot0] [slot1] [slot2] [slot3] [slot4]
> + * [obj0_n] [obj1] [obj2] [obj2] [obj4]
> + *
> + * Now we call zram_do_scan() function again. We get to slot2,
> + * and we understand that obj2 and obj0_n hashes are the same. We
> + * try to zs_free(obj2), but slot3 also already refers to it.
> + *
> + * This is not correct!
> + */
In this case, it seems like we can't merge all identical objects — is that right?
How about letting slot2 point to obj0_n and decreasing the rb_node refcount of
obj2 (referenced by slot2/slot3) in the first loop iteration, then letting slot3
point to obj0_n and decreasing the refcount again in the next iteration? After
that, obj2 could be freed.
> + if (unlikely(rb_node))
> + if (rb_node->cnt > 1) {
> + zram_slot_unlock(zram, node->index);
> + continue;
> + }
> +
> + zram_set_handle(zram, index, cur_handle);
> + zs_free(zram->mem_pool, handle);
> +
> + rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle);
> +
> + if (!rb_node) {
> + rb_node = kzalloc(sizeof(struct zram_rbtree_node),
> + GFP_KERNEL);
> + if (!rb_node) {
> + pr_err("Failed to allocate rb_node\n");
> + ret = -ENOMEM;
> + zram_slot_unlock(zram, node->index);
> + mutex_unlock(&zram_rbtree_mutex);
> + goto merged_or_err;
> + }
> +
> + rb_node->key = cur_handle;
> + /* Two slots refers to an zsmalloc object with cur_handle key */
> + rb_node->cnt = 2;
> + zram_rbtree_insert(&zram->sph_rbtree, rb_node);
> + } else {
> + rb_node->cnt++;
> + }
> +
> + atomic64_sub(obj_size, &zram->stats.compr_data_size);
> + zram_set_flag(zram, index, ZRAM_MERGED);
> + zram_set_flag(zram, node->index, ZRAM_MERGED);
> +
> + zram_slot_unlock(zram, node->index);
> + mutex_unlock(&zram_rbtree_mutex);
> + goto merged_or_err;
> + }
> +
> + zram_slot_unlock(zram, node->index);
> + }
> +
> + mutex_unlock(&zram_rbtree_mutex);
> +
> + node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL);
> + if (!node) {
> + ret = -ENOMEM;
> + goto merged_or_err;
> + }
> +
> + node->index = index;
> + hlist_add_head(&node->next, &htable[hash % htable_size]);
> +
> +merged_or_err:
> + kfree(buf);
> + return ret;
> +}
> +
> +static void zram_free_htable_entries(struct hlist_head *htable,
> + size_t htable_size)
> +{
> + struct hlist_node *n;
> + struct zram_hash_node *node;
> +
> + hlist_for_each_entry_safe(node, n, htable, next) {
> + hlist_del(&node->next);
> + kfree(node);
> + }
> +}
> +
> +static int zram_do_scan(struct zram *zram)
> +{
> + size_t num_pages = zram->disksize >> PAGE_SHIFT;
> + size_t htable_size = num_pages;
> + size_t index;
> + struct hlist_head *htable;
> + int i, ret = 0;
> +
> + htable = vzalloc(htable_size * sizeof(struct hlist_head));
> + if (!htable) {
> + pr_err("Failed to allocate hash table\n");
> + return -ENOMEM;
> + }
> +
> + for (i = 0; i < htable_size; i++)
> + INIT_HLIST_HEAD(&htable[i]);
> +
> + for (index = 0; index < num_pages; index++) {
> + zram_slot_lock(zram, index);
> +
> + if (!zram_allocated(zram, index)) {
> + zram_slot_unlock(zram, index);
> + continue;
> + }
> +
> + if (zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
> + zram_test_flag(zram, index, ZRAM_WB) ||
> + zram_test_flag(zram, index, ZRAM_SAME)) {
> + zram_slot_unlock(zram, index);
> + continue;
> + }
> +
> + /* Ignore pages that have been recompressed */
> + if (zram_get_priority(zram, index) != 0)
> + continue;
> +
> + ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index);
> + zram_slot_unlock(zram, index);
> + if (ret != 0)
> + goto out;
> + }
> +
> +out:
> + zram_free_htable_entries(htable, htable_size);
> + vfree(htable);
> + return ret;
> +}
> +
> static void zram_meta_free(struct zram *zram, u64 disksize)
> {
> size_t num_pages = disksize >> PAGE_SHIFT;
> @@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
> static void zram_free_page(struct zram *zram, size_t index)
> {
> unsigned long handle;
> + struct zram_rbtree_node *node;
>
> #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> zram->table[index].ac_time = 0;
> @@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index)
> if (!handle)
> return;
>
> - zs_free(zram->mem_pool, handle);
> + if (zram_test_flag(zram, index, ZRAM_MERGED)) {
> + zram_clear_flag(zram, index, ZRAM_MERGED);
> + mutex_lock(&zram_rbtree_mutex);
> +
> + node = zram_rbtree_search(&zram->sph_rbtree, handle);
> + BUG_ON(!node);
> +
> + node->cnt--;
> + if (node->cnt == 0) {
> + rb_erase(&node->node, &zram->sph_rbtree);
> + mutex_unlock(&zram_rbtree_mutex);
> +
> + zs_free(zram->mem_pool, handle);
> + kfree(node);
> + } else {
> + mutex_unlock(&zram_rbtree_mutex);
> + }
> + } else {
> + zs_free(zram->mem_pool, handle);
> + }
>
> atomic64_sub(zram_get_obj_size(zram, index),
> &zram->stats.compr_data_size);
> @@ -2421,6 +2695,8 @@ static int zram_add(void)
>
> comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
>
> + zram->sph_rbtree = RB_ROOT;
> +
> zram_debugfs_register(zram);
> pr_info("Added device: %s\n", zram->disk->disk_name);
> return device_id;
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index c5254626f051..4a7151c94523 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -56,6 +56,7 @@ enum zram_pageflags {
>
> ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */
> ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */
> + ZRAM_MERGED, /* page was merged */
>
> __NR_ZRAM_PAGEFLAGS,
> };
> @@ -140,5 +141,10 @@ struct zram {
> #ifdef CONFIG_ZRAM_MEMORY_TRACKING
> struct dentry *debugfs_dir;
> #endif
> + /*
> + * This is same pages handle's rb tree, where the key is a handle
> + * to same pages and the value is a link counter
> + */
> + struct rb_root sph_rbtree;
> };
> #endif
Powered by blists - more mailing lists