Message-Id: <20250428073445.24108-1-zhoujifeng@kylinos.com.cn>
Date: Mon, 28 Apr 2025 15:34:45 +0800
From: Zhou Jifeng <zhoujifeng@...inos.com.cn>
To: colyli@...nel.org
Cc: dengwangbo@...inos.com.cn,
kent.overstreet@...ux.dev,
linux-bcache@...r.kernel.org,
linux-kernel@...r.kernel.org,
xiahua@...inos.com.cn,
zhoujifeng@...inos.com.cn
Subject: [PATCH v2] bcache: add the deferred_flush IO processing path in the writeback mode
In some scenarios with high requirements for both data reliability and
write performance, none of the current bcache cache modes is a good
fit. deferred_flush aims to increase the reliability of dirty data
write-back while reducing the number of PREFLUSH requests sent to the
backing device, improving both data safety and dsync write performance
in writeback mode.
When cache_mode is switched away from writeback, deferred_flush is
automatically turned off. When another mode is switched to
writeback+deferred_flush, a PREFLUSH request is first sent to the
backing device to make sure that previously submitted data is not
lost.
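As a minimal sketch, this enable-time flush is a single synchronous
empty PREFLUSH bio (it mirrors the sysfs.c hunk below):

	struct bio flush;

	bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
	/* wait until everything the backing device accepted is durable */
	submit_bio_wait(&flush);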
deferred_flush supports three selectable modes:
none: do nothing (default)
normal: sequential I/O bypasses the cache disk
force: sequential I/O cannot bypass the cache disk
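As a rough sketch of the dsync fast path (mirroring the request.c hunk
below): every write that reaches the backing device sets
dc->need_flush, and a PREFLUSH is forwarded only if a write actually
happened since the last flush:

	if (bio->bi_opf & REQ_PREFLUSH) {
		/* nothing reached the backing device since the last flush */
		if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
		    !atomic_cmpxchg(&dc->need_flush, 1, 0))
			goto insert_data;
		/* otherwise forward the flush to the backing device */
	}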
Signed-off-by: Zhou Jifeng <zhoujifeng@...inos.com.cn>
---
v1->v2: Version v2 mainly addresses the inefficient dirty data
write-back of v1. When writing back dirty data, it no longer uses FUA;
instead it writes back batches of no more than 500 dirty bkeys and
then sends a single PREFLUSH for the whole batch. More test data will
follow later.
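For illustration, the batched flush in flush_backing_device() works
roughly as follows; clear_dirty_in_btree() is a hypothetical stand-in
for the bch_btree_insert() loop in the actual hunk:

	struct bio flush;
	unsigned int i;

	if (dc->preflush_keys.nr_keys == 0)
		return;

	bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
	if (submit_bio_wait(&flush)) {
		/* flush failed: keys stay dirty, retried on the next scan */
		dc->preflush_keys.nr_keys = 0;
		return;
	}

	/* data is durable on the backing device; clear the dirty marks */
	for (i = 0; i < dc->preflush_keys.nr_keys; i++)
		clear_dirty_in_btree(dc, &dc->preflush_keys.data[i].key);

	dc->preflush_keys.nr_keys = 0;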
drivers/md/bcache/bcache.h | 20 +++++++
drivers/md/bcache/bcache_ondisk.h | 5 ++
drivers/md/bcache/request.c | 32 +++++++++-
drivers/md/bcache/sysfs.c | 54 +++++++++++++++++
drivers/md/bcache/writeback.c | 98 +++++++++++++++++++++++++++----
drivers/md/bcache/writeback.h | 4 ++
6 files changed, 199 insertions(+), 14 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 785b0d9008fa..75110fbe6656 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -247,6 +247,14 @@ struct keybuf {
DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
};
+struct keybuf_preflush {
+ spinlock_t lock;
+#define KEYBUF_NR 500
+ struct keybuf_key data[KEYBUF_NR];
+ unsigned int nr_keys;
+};
+
+
struct bcache_device {
struct closure cl;
@@ -346,6 +354,12 @@ struct cached_dev {
struct keybuf writeback_keys;
+ /*
+ * Bkeys already written back but still marked dirty in the btree;
+ * they are held here until the backing device has been flushed
+ */
+ struct keybuf_preflush preflush_keys;
+
struct task_struct *status_update_thread;
/*
* Order the write-half of writeback operations strongly in dispatch
@@ -405,6 +419,12 @@ struct cached_dev {
*/
#define BCH_WBRATE_UPDATE_MAX_SKIPS 15
unsigned int rate_update_retry;
+
+ /*
+ * In deferred flush mode, 0 means there is no need to send
+ * a flush to the backing device.
+ */
+ atomic_t need_flush;
};
enum alloc_reserve {
diff --git a/drivers/md/bcache/bcache_ondisk.h b/drivers/md/bcache/bcache_ondisk.h
index 6620a7f8fffc..822dcdc0caaf 100644
--- a/drivers/md/bcache/bcache_ondisk.h
+++ b/drivers/md/bcache/bcache_ondisk.h
@@ -294,6 +294,11 @@ BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4);
#define CACHE_MODE_WRITEBACK 1U
#define CACHE_MODE_WRITEAROUND 2U
#define CACHE_MODE_NONE 3U
+BITMASK(BDEV_DEFERRED_FLUSH, struct cache_sb, flags, 4, 3);
+#define DEFERRED_FLUSH_NONE 0U
+#define DEFERRED_FLUSH_NORMAL 1U
+#define DEFERRED_FLUSH_FORCE 2U
+
BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2);
#define BDEV_STATE_NONE 0U
#define BDEV_STATE_CLEAN 1U
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index af345dc6fde1..8dc17d9c5f75 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1026,16 +1026,28 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
bio->bi_end_io = backing_request_endio;
closure_bio_submit(s->iop.c, bio, cl);
+ if (BDEV_DEFERRED_FLUSH(&dc->sb))
+ atomic_set(&dc->need_flush, 1);
+
} else if (s->iop.writeback) {
bch_writeback_add(dc);
s->iop.bio = bio;
if (bio->bi_opf & REQ_PREFLUSH) {
+ struct bio *flush;
+
+ /*
+ * When DEFERRED_FLUSH is enabled, if need_flush is 0,
+ * there is no need to send a flush to the backing device.
+ */
+ if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
+ (!atomic_cmpxchg(&dc->need_flush, 1, 0)))
+ goto insert_data;
+
/*
* Also need to send a flush to the backing
* device.
*/
- struct bio *flush;
flush = bio_alloc_bioset(bio->bi_bdev, 0,
REQ_OP_WRITE | REQ_PREFLUSH,
@@ -1050,6 +1062,9 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s)
closure_bio_submit(s->iop.c, flush, cl);
}
} else {
+ if (BDEV_DEFERRED_FLUSH(&dc->sb))
+ atomic_set(&dc->need_flush, 1);
+
s->iop.bio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO,
&dc->disk.bio_split);
/* I/O request sent to backing device */
@@ -1066,14 +1081,27 @@ static CLOSURE_CALLBACK(cached_dev_nodata)
{
closure_type(s, struct search, cl);
struct bio *bio = &s->bio.bio;
+ struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- if (s->iop.flush_journal)
+ if (s->iop.flush_journal) {
bch_journal_meta(s->iop.c, cl);
+ /*
+ * When deferred flush is enabled, check whether the flush
+ * request actually needs to be sent to the backing device.
+ */
+ if (BDEV_DEFERRED_FLUSH(&dc->sb) &&
+ (!atomic_cmpxchg(&dc->need_flush, 1, 0))) {
+ s->iop.status = BLK_STS_OK;
+ goto end;
+ }
+ }
+
/* If it's a flush, we send the flush to the backing device too */
bio->bi_end_io = backing_request_endio;
closure_bio_submit(s->iop.c, bio, cl);
+end:
continue_at(cl, cached_dev_bio_complete, NULL);
}
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e8f696cb58c0..3f343fba2f96 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -28,6 +28,25 @@ static const char * const bch_cache_modes[] = {
NULL
};
+/*
+ * Deferred flush: in writeback mode, avoid unnecessary PREFLUSH
+ * requests to the backing device to speed up dsync requests.
+ * Write-back safety is preserved by flushing the backing device
+ * before dirty keys are marked clean in the btree.
+ *
+ * Default is 0 ("none")
+ * none: Do nothing
+ * normal: Sequential I/O bypasses the cache disk
+ * force: Sequential I/O cannot bypass the cache disk
+ */
+static const char * const bch_deferred_flush[] = {
+ "none",
+ "normal",
+ "force",
+ NULL
+};
+
+
static const char * const bch_reada_cache_policies[] = {
"all",
"meta-only",
@@ -151,6 +170,7 @@ rw_attribute(copy_gc_enabled);
rw_attribute(idle_max_writeback_rate);
rw_attribute(gc_after_writeback);
rw_attribute(size);
+rw_attribute(deferred_flush);
static ssize_t bch_snprint_string_list(char *buf,
size_t size,
@@ -283,6 +303,11 @@ SHOW(__bch_cached_dev)
return strlen(buf);
}
+ if (attr == &sysfs_deferred_flush)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_deferred_flush,
+ BDEV_DEFERRED_FLUSH(&dc->sb));
+
#undef var
return 0;
}
@@ -295,6 +320,7 @@ STORE(__cached_dev)
ssize_t v;
struct cache_set *c;
struct kobj_uevent_env *env;
+ struct bio flush;
/* no user space access if system is rebooting */
if (bcache_is_reboot)
@@ -383,6 +409,12 @@ STORE(__cached_dev)
SET_BDEV_CACHE_MODE(&dc->sb, v);
bch_write_bdev_super(dc, NULL);
}
+
+ /* deferred_flush is only valid in writeback mode, clear it otherwise */
+ if (BDEV_DEFERRED_FLUSH(&dc->sb) && ((unsigned int) v != CACHE_MODE_WRITEBACK)) {
+ SET_BDEV_DEFERRED_FLUSH(&dc->sb, 0);
+ bch_write_bdev_super(dc, NULL);
+ }
}
if (attr == &sysfs_readahead_cache_policy) {
@@ -451,6 +483,27 @@ STORE(__cached_dev)
if (attr == &sysfs_stop)
bcache_device_stop(&dc->disk);
+ if (attr == &sysfs_deferred_flush) {
+ v = __sysfs_match_string(bch_deferred_flush, -1, buf);
+ if (v < 0)
+ return v;
+
+ if ((unsigned int) v != BDEV_DEFERRED_FLUSH(&dc->sb)) {
+ if (v && (BDEV_CACHE_MODE(&dc->sb) != CACHE_MODE_WRITEBACK)) {
+ pr_err("It's not the writeback mode that can't enable deferred_flush.\n");
+ return -EINVAL;
+ }
+
+ SET_BDEV_DEFERRED_FLUSH(&dc->sb, v);
+ bch_write_bdev_super(dc, NULL);
+ if (v) {
+ bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
+ /* I/O request sent to backing device */
+ submit_bio_wait(&flush);
+ }
+ }
+ }
+
return size;
}
@@ -541,6 +594,7 @@ static struct attribute *bch_cached_dev_attrs[] = {
#endif
&sysfs_backing_dev_name,
&sysfs_backing_dev_uuid,
+ &sysfs_deferred_flush,
NULL
};
ATTRIBUTE_GROUPS(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 453efbbdc8ee..ce31d1535d90 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -362,23 +362,31 @@ static CLOSURE_CALLBACK(write_dirty_finish)
unsigned int i;
struct keylist keys;
- bch_keylist_init(&keys);
+ if (!BDEV_DEFERRED_FLUSH(&dc->sb)) {
+ bch_keylist_init(&keys);
- bkey_copy(keys.top, &w->key);
- SET_KEY_DIRTY(keys.top, false);
- bch_keylist_push(&keys);
+ bkey_copy(keys.top, &w->key);
+ SET_KEY_DIRTY(keys.top, false);
+ bch_keylist_push(&keys);
- for (i = 0; i < KEY_PTRS(&w->key); i++)
- atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+ for (i = 0; i < KEY_PTRS(&w->key); i++)
+ atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
- ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
+ ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);
- if (ret)
- trace_bcache_writeback_collision(&w->key);
+ if (ret)
+ trace_bcache_writeback_collision(&w->key);
- atomic_long_inc(ret
- ? &dc->disk.c->writeback_keys_failed
- : &dc->disk.c->writeback_keys_done);
+ atomic_long_inc(ret
+ ? &dc->disk.c->writeback_keys_failed
+ : &dc->disk.c->writeback_keys_done);
+ } else {
+ /* After flushing the backing device, update the btree */
+ spin_lock(&dc->preflush_keys.lock);
+ dc->preflush_keys.data[dc->preflush_keys.nr_keys] = *w;
+ dc->preflush_keys.nr_keys++;
+ spin_unlock(&dc->preflush_keys.lock);
+ }
}
bch_keybuf_del(&dc->writeback_keys, w);
@@ -435,6 +443,7 @@ static CLOSURE_CALLBACK(write_dirty)
if (KEY_DIRTY(&w->key)) {
dirty_init(w);
io->bio.bi_opf = REQ_OP_WRITE;
+
io->bio.bi_iter.bi_sector = KEY_START(&w->key);
bio_set_dev(&io->bio, io->dc->bdev);
io->bio.bi_end_io = dirty_endio;
@@ -471,6 +480,66 @@ static CLOSURE_CALLBACK(read_dirty_submit)
continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}
+static void flush_backing_device(struct cached_dev *dc)
+{
+ int ret;
+ unsigned int i;
+ struct bio flush;
+ struct keybuf_key *p;
+
+ if (dc->preflush_keys.nr_keys == 0)
+ return;
+
+ bio_init(&flush, dc->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
+ ret = submit_bio_wait(&flush);
+ if (ret) {
+ pr_warn("flush backing device error, ret=%d!\n", ret);
+ dc->preflush_keys.nr_keys = 0;
+ /*
+ * Because the flush failed, the bkeys stay dirty and will be
+ * written back again on the next scheduling pass
+ */
+ return;
+ }
+
+ /*
+ * The dirty data has been written back and confirmed to be on
+ * disk, so the bkeys can now be marked clean in the btree.
+ */
+ for (i = 0; i < dc->preflush_keys.nr_keys; i++) {
+ int ret;
+ unsigned int j;
+ struct keylist keys;
+
+ bch_keylist_init(&keys);
+
+ p = &dc->preflush_keys.data[i];
+ bkey_copy(keys.top, &p->key);
+ SET_KEY_DIRTY(keys.top, false);
+ bch_keylist_push(&keys);
+
+ for (j = 0; j < KEY_PTRS(&p->key); j++)
+ atomic_inc(&PTR_BUCKET(dc->disk.c, &p->key, j)->pin);
+
+ ret = bch_btree_insert(dc->disk.c, &keys, NULL, &p->key);
+
+ if (ret)
+ trace_bcache_writeback_collision(&p->key);
+
+ atomic_long_inc(ret
+ ? &dc->disk.c->writeback_keys_failed
+ : &dc->disk.c->writeback_keys_done);
+
+ /*
+ * Bkeys that failed to be inserted are simply skipped; they
+ * will be processed again by the next write-back scan.
+ */
+ }
+
+ dc->preflush_keys.nr_keys = 0;
+
+}
+
static void read_dirty(struct cached_dev *dc)
{
unsigned int delay = 0;
@@ -819,6 +888,8 @@ static int bch_writeback_thread(void *arg)
read_dirty(dc);
+ flush_backing_device(dc);
+
if (searched_full_index) {
unsigned int delay = dc->writeback_delay * HZ;
@@ -1072,6 +1143,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
/* For dc->writeback_lock contention in update_writeback_rate() */
dc->rate_update_retry = 0;
+ spin_lock_init(&dc->preflush_keys.lock);
+ dc->preflush_keys.nr_keys = 0;
+
WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 31df716951f6..0c92a607a875 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -117,6 +117,10 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
bio_sectors(bio)))
return true;
+ /* In force mode, I/O must not bypass the cache device */
+ if (BDEV_DEFERRED_FLUSH(&dc->sb) == DEFERRED_FLUSH_FORCE)
+ return true;
+
if (would_skip)
return false;
--
2.18.1