Message-ID: <tencent_78FC2C4FE16BA1EBAF0897DB60FCD675ED05@qq.com>
Date: Tue, 4 Nov 2025 17:27:17 +0800
From: Yuwen Chen <ywen.chen@...mail.com>
To: minchan@...nel.org,
senozhatsky@...omium.org,
axboe@...nel.dk
Cc: linux-kernel@...r.kernel.org,
linux-block@...r.kernel.org,
licayy@...look.com,
Yuwen Chen <ywen.chen@...mail.com>
Subject: [PATCH] zram: Implement multi-page write-back

For block devices, sequential writes perform significantly better
than random writes. zram's write-back path currently submits one
synchronous single-page bio at a time, so it cannot benefit from
sequential writes and write-back performance suffers.

Implement multi-page batched write-back: submit the per-page bios
asynchronously under a block plug and reap completions from a small
pool of in-flight write-back works, so that the block layer can
merge adjacent writes into larger sequential requests.

After applying this patch, a large number of bio merges can be
observed with the commands below, confirming that write-back
requests are batched and write-back performance improves:
mount -t debugfs none /sys/kernel/debug/
echo "block:block_bio_frontmerge" >> /sys/kernel/debug/tracing/set_event
echo "block:block_bio_backmerge" >> /sys/kernel/debug/tracing/set_event
cat /sys/kernel/debug/tracing/trace_pipe &
echo "page_indexes=1-10000" > /sys/block/zram0/writeback
Signed-off-by: Yuwen Chen <ywen.chen@...mail.com>
Reviewed-by: Fengyu Lian <licayy@...look.com>
---
drivers/block/zram/zram_drv.c | 214 +++++++++++++++++++++++++---------
1 file changed, 161 insertions(+), 53 deletions(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a43074657531..85ef07455eda 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -734,21 +734,125 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
submit_bio(bio);
}
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
- unsigned long blk_idx = 0;
- struct page *page = NULL;
+enum {
+ ZRAM_WB_WORK_ALLOCATED = 0,
+ ZRAM_WB_WORK_COMPLETED,
+};
+struct zram_writeback_work {
+ struct completion *done;
+ unsigned long blk_idx;
+ struct page *page;
struct zram_pp_slot *pps;
struct bio_vec bio_vec;
struct bio bio;
- int ret = 0, err;
- u32 index;
+ unsigned long flags;
+};
- page = alloc_page(GFP_KERNEL);
- if (!page)
+static int zram_writeback_complete(struct zram *zram, struct zram_writeback_work *work)
+{
+ u32 index = 0;
+ int err;
+
+ if (!test_and_clear_bit(ZRAM_WB_WORK_COMPLETED, &work->flags))
+ return 0;
+
+ err = blk_status_to_errno(work->bio.bi_status);
+ if (err)
+ return err;
+
+ index = work->pps->index;
+ atomic64_inc(&zram->stats.bd_writes);
+ zram_slot_lock(zram, index);
+	/*
+	 * We dropped the slot lock while the write-back bio was in
+	 * flight, so the slot can change under us: slot_free() or
+	 * slot_free() and reallocation (zram_write_page()). In both
+	 * cases the slot loses the ZRAM_PP_SLOT flag. No concurrent
+	 * post-processing can set ZRAM_PP_SLOT on such slots until
+	 * the current post-processing finishes.
+	 */
+ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+ goto next;
+
+ zram_free_page(zram, index);
+ zram_set_flag(zram, index, ZRAM_WB);
+ zram_set_handle(zram, index, work->blk_idx);
+ work->blk_idx = 0;
+ atomic64_inc(&zram->stats.pages_stored);
+ spin_lock(&zram->wb_limit_lock);
+ if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+ zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+ spin_unlock(&zram->wb_limit_lock);
+
+next:
+ zram_slot_unlock(zram, index);
+ release_pp_slot(zram, work->pps);
+ work->pps = NULL;
+ return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+ struct zram_writeback_work *work = bio->bi_private;
+
+ set_bit(ZRAM_WB_WORK_COMPLETED, &work->flags);
+ clear_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags);
+ complete(work->done);
+}
+
+static struct zram_writeback_work *zram_writeback_next_work(struct zram_writeback_work **pool,
+ int size, int *off)
+{
+ struct zram_writeback_work *work = NULL;
+ int i = 0;
+
+ for (i = *off; i < size + *off; i++) {
+ work = pool[i % size];
+ if (!work->page)
+ continue;
+
+ if (!test_and_set_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags)) {
+ *off = (i + 1) % size;
+ return work;
+ }
+ }
+ return NULL;
+}
+
+#define ZRAM_WRITEBACK_BIO_SIZE (32)
+static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
+{
+ int ret = 0, err, i = 0, off = 0;
+ int work_pool_size = 0;
+ struct zram_writeback_work work_prealloc[2] = {0};
+ struct zram_writeback_work *work_pool[ZRAM_WRITEBACK_BIO_SIZE] = {NULL};
+ struct zram_writeback_work *work = NULL;
+ DECLARE_COMPLETION_ONSTACK(done);
+ u32 index = 0;
+ struct blk_plug plug;
+
+ for (i = 0; i < ARRAY_SIZE(work_pool); i++) {
+ if (i < ARRAY_SIZE(work_prealloc)) {
+ work_pool[i] = &work_prealloc[i];
+ } else {
+ work_pool[i] = kzalloc(sizeof(*work), GFP_KERNEL);
+ if (!work_pool[i])
+ break;
+ }
+ work_pool[i]->done = &done;
+ work_pool[i]->flags = 0;
+ work_pool[i]->page = alloc_page(GFP_KERNEL);
+ if (!work_pool[i]->page)
+ break;
+ work = work_pool[i];
+ }
+ if (!work)
return -ENOMEM;
+ work_pool_size = i;
+ set_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags);
- while ((pps = select_pp_slot(ctl))) {
+ blk_start_plug(&plug);
+ while ((work->pps = select_pp_slot(ctl))) {
spin_lock(&zram->wb_limit_lock);
if (zram->wb_limit_enable && !zram->bd_wb_limit) {
spin_unlock(&zram->wb_limit_lock);
@@ -757,15 +861,15 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
}
spin_unlock(&zram->wb_limit_lock);
- if (!blk_idx) {
- blk_idx = alloc_block_bdev(zram);
- if (!blk_idx) {
+ if (!work->blk_idx) {
+ work->blk_idx = alloc_block_bdev(zram);
+ if (!work->blk_idx) {
ret = -ENOSPC;
break;
}
}
- index = pps->index;
+ index = work->pps->index;
zram_slot_lock(zram, index);
/*
* scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -775,22 +879,32 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
goto next;
- if (zram_read_from_zspool(zram, page, index))
+ if (zram_read_from_zspool(zram, work->page, index))
goto next;
zram_slot_unlock(zram, index);
- bio_init(&bio, zram->bdev, &bio_vec, 1,
+ bio_init(&work->bio, zram->bdev, &work->bio_vec, 1,
REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
- /*
- * XXX: A single page IO would be inefficient for write
- * but it would be not bad as starter.
- */
- err = submit_bio_wait(&bio);
+ work->bio.bi_iter.bi_sector = work->blk_idx * (PAGE_SIZE >> 9);
+ work->bio.bi_end_io = zram_writeback_endio;
+ work->bio.bi_private = work;
+ __bio_add_page(&work->bio, work->page, PAGE_SIZE, 0);
+
+ list_del_init(&work->pps->entry);
+ submit_bio(&work->bio);
+
+ do {
+ work = zram_writeback_next_work(work_pool, work_pool_size, &off);
+ if (!work) {
+ blk_finish_plug(&plug);
+ wait_for_completion_io(&done);
+ blk_start_plug(&plug);
+ }
+ } while (!work);
+ err = zram_writeback_complete(zram, work);
if (err) {
- release_pp_slot(zram, pps);
+ release_pp_slot(zram, work->pps);
+ work->pps = NULL;
/*
* BIO errors are not fatal, we continue and simply
* attempt to writeback the remaining objects (pages).
@@ -800,43 +914,37 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
* the most recent BIO error.
*/
ret = err;
- continue;
}
+ cond_resched();
+ continue;
- atomic64_inc(&zram->stats.bd_writes);
- zram_slot_lock(zram, index);
- /*
- * Same as above, we release slot lock during writeback so
- * slot can change under us: slot_free() or slot_free() and
- * reallocation (zram_write_page()). In both cases slot loses
- * ZRAM_PP_SLOT flag. No concurrent post-processing can set
- * ZRAM_PP_SLOT on such slots until current post-processing
- * finishes.
- */
- if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
- goto next;
-
- zram_free_page(zram, index);
- zram_set_flag(zram, index, ZRAM_WB);
- zram_set_handle(zram, index, blk_idx);
- blk_idx = 0;
- atomic64_inc(&zram->stats.pages_stored);
- spin_lock(&zram->wb_limit_lock);
- if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
- zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
- spin_unlock(&zram->wb_limit_lock);
next:
zram_slot_unlock(zram, index);
- release_pp_slot(zram, pps);
-
+ release_pp_slot(zram, work->pps);
+ work->pps = NULL;
cond_resched();
}
+ blk_finish_plug(&plug);
- if (blk_idx)
- free_block_bdev(zram, blk_idx);
- if (page)
- __free_page(page);
+ if (work)
+ clear_bit(ZRAM_WB_WORK_ALLOCATED, &work->flags);
+ for (i = 0; i < work_pool_size; i++) {
+ while (test_bit(ZRAM_WB_WORK_ALLOCATED, &work_pool[i]->flags))
+ wait_for_completion_io(&done);
+ err = zram_writeback_complete(zram, work_pool[i]);
+ if (err) {
+ release_pp_slot(zram, work_pool[i]->pps);
+			work_pool[i]->pps = NULL;
+ ret = err;
+ }
+ if (work_pool[i]->blk_idx)
+ free_block_bdev(zram, work_pool[i]->blk_idx);
+ if (work_pool[i]->page)
+ __free_page(work_pool[i]->page);
+ if (i >= ARRAY_SIZE(work_prealloc))
+ kfree(work_pool[i]);
+ }
return ret;
}
--
2.34.1