[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANubcdX7eNbH_bo4-f94DUbdiEbt04Vxy1MPyhm+CZyXB01FuQ@mail.gmail.com>
Date: Tue, 20 Jan 2026 10:48:42 +0800
From: Stephen Zhang <starzhangzsd@...il.com>
To: Coly Li <colyli@...as.com>, Kent Overstreet <kent.overstreet@...ux.dev>, axboe@...nel.dk,
Sasha Levin <sashal@...nel.org>, Christoph Hellwig <hch@...radead.org>
Cc: linux-bcache@...r.kernel.org,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>, zhangshida <zhangshida@...inos.cn>
Subject: Re: [PATCH v2] bcache: use bio cloning for detached device requests
Stephen Zhang <starzhangzsd@...il.com> 于2026年1月20日周二 10:39写道:
>
> ---------- Forwarded message ---------
> 发件人: zhangshida <starzhangzsd@...il.com>
> Date: 2026年1月20日周二 10:35
> Subject: [PATCH v2] bcache: use bio cloning for detached device requests
> To: <colyli@...as.com>, <kent.overstreet@...ux.dev>,
> <axboe@...nel.dk>, <sashal@...nel.org>, <hch@...radead.org>
> Cc: <linux-bcache@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
> <zhangshida@...inos.cn>, <starzhangzsd@...il.com>, Christoph Hellwig
> <hch@....de>
>
>
> From: Shida Zhang <zhangshida@...inos.cn>
>
> Previously, bcache hijacked the bi_end_io and bi_private fields of
> the incoming bio when the backing device was in a detached state.
> This is fragile and breaks if the bio needs to be processed by
> other layers.
>
> This patch transitions to using a cloned bio embedded within a private
> structure. This ensures the original bio's metadata remains untouched.
>
> Fixes: 53280e398471 ("bcache: fix improper use of bi_end_io")
> Co-developed-by: Christoph Hellwig <hch@....de>
> Signed-off-by: Christoph Hellwig <hch@....de>
> Signed-off-by: Shida Zhang <zhangshida@...inos.cn>
> ---
>
> Changelog:
> v1:
> https://lore.kernel.org/all/20260115074811.230807-1-zhangshida@kylinos.cn/
>
> drivers/md/bcache/bcache.h | 9 +++++
> drivers/md/bcache/request.c | 79 ++++++++++++++++---------------------
> drivers/md/bcache/super.c | 12 +++++-
> 3 files changed, 54 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 8ccacba8554..54ff4e0238a 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -273,6 +273,8 @@ struct bcache_device {
>
> struct bio_set bio_split;
>
> + struct bio_set bio_detach;
> +
> unsigned int data_csum:1;
>
> int (*cache_miss)(struct btree *b, struct search *s,
> @@ -753,6 +755,13 @@ struct bbio {
> struct bio bio;
> };
>
> +struct detached_dev_io_private {
> + struct bcache_device *d;
> + unsigned long start_time;
> + struct bio *orig_bio;
> + struct bio bio;
> +};
> +
> #define BTREE_PRIO USHRT_MAX
> #define INITIAL_PRIO 32768U
>
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index 82fdea7dea7..e0b12cb622b 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -1077,68 +1077,58 @@ static CLOSURE_CALLBACK(cached_dev_nodata)
> continue_at(cl, cached_dev_bio_complete, NULL);
> }
>
> -struct detached_dev_io_private {
> - struct bcache_device *d;
> - unsigned long start_time;
> - bio_end_io_t *bi_end_io;
> - void *bi_private;
> - struct block_device *orig_bdev;
> -};
> -
> static void detached_dev_end_io(struct bio *bio)
> {
> - struct detached_dev_io_private *ddip;
> -
> - ddip = bio->bi_private;
> - bio->bi_end_io = ddip->bi_end_io;
> - bio->bi_private = ddip->bi_private;
> + struct detached_dev_io_private *ddip =
> + container_of(bio, struct detached_dev_io_private, bio);
> + struct bio *orig_bio = ddip->orig_bio;
>
> /* Count on the bcache device */
> - bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev);
> + bio_end_io_acct(orig_bio, ddip->start_time);
>
> if (bio->bi_status) {
> - struct cached_dev *dc = container_of(ddip->d,
> - struct cached_dev, disk);
> + struct cached_dev *dc = bio->bi_private;
> +
> /* should count I/O error for backing device here */
> bch_count_backing_io_errors(dc, bio);
> + orig_bio->bi_status = bio->bi_status;
> }
>
> - kfree(ddip);
> - bio_endio(bio);
> + bio_put(bio);
> + bio_endio(orig_bio);
> }
>
> -static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
> - struct block_device *orig_bdev, unsigned long start_time)
> +static void detached_dev_do_request(struct bcache_device *d,
> + struct bio *orig_bio, unsigned long start_time)
> {
> struct detached_dev_io_private *ddip;
> struct cached_dev *dc = container_of(d, struct cached_dev, disk);
> + struct bio *clone_bio;
>
> - /*
> - * no need to call closure_get(&dc->disk.cl),
> - * because upper layer had already opened bcache device,
> - * which would call closure_get(&dc->disk.cl)
> - */
> - ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
> - if (!ddip) {
> - bio->bi_status = BLK_STS_RESOURCE;
> - bio_endio(bio);
> + if (bio_op(orig_bio) == REQ_OP_DISCARD &&
> + !bdev_max_discard_sectors(dc->bdev)) {
> + bio_endio(orig_bio);
> return;
> }
>
> - ddip->d = d;
> + clone_bio = bio_alloc_clone(dc->bdev, orig_bio, GFP_NOIO,
> + &d->bio_detach);
> + if (!clone_bio) {
> + orig_bio->bi_status = BLK_STS_RESOURCE;
> + bio_endio(orig_bio);
> + return;
> + }
> +
> + ddip = container_of(clone_bio, struct detached_dev_io_private, bio);
> /* Count on the bcache device */
> - ddip->orig_bdev = orig_bdev;
> + ddip->d = d;
> ddip->start_time = start_time;
> - ddip->bi_end_io = bio->bi_end_io;
> - ddip->bi_private = bio->bi_private;
> - bio->bi_end_io = detached_dev_end_io;
> - bio->bi_private = ddip;
> -
> - if ((bio_op(bio) == REQ_OP_DISCARD) &&
> - !bdev_max_discard_sectors(dc->bdev))
> - detached_dev_end_io(bio);
> - else
> - submit_bio_noacct(bio);
> + ddip->orig_bio = orig_bio;
> +
> + clone_bio->bi_end_io = detached_dev_end_io;
> + clone_bio->bi_private = dc;
> +
> + submit_bio_noacct(clone_bio);
> }
>
> static void quit_max_writeback_rate(struct cache_set *c,
> @@ -1214,10 +1204,10 @@ void cached_dev_submit_bio(struct bio *bio)
>
> start_time = bio_start_io_acct(bio);
>
> - bio_set_dev(bio, dc->bdev);
> bio->bi_iter.bi_sector += dc->sb.data_offset;
>
> if (cached_dev_get(dc)) {
> + bio_set_dev(bio, dc->bdev);
> s = search_alloc(bio, d, orig_bdev, start_time);
> trace_bcache_request_start(s->d, bio);
>
> @@ -1237,9 +1227,10 @@ void cached_dev_submit_bio(struct bio *bio)
> else
> cached_dev_read(dc, s);
> }
> - } else
> + } else {
> /* I/O request sent to backing device */
> - detached_dev_do_request(d, bio, orig_bdev, start_time);
> + detached_dev_do_request(d, bio, start_time);
> + }
> }
>
> static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index c17d4517af2..d4b798668c8 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -887,6 +887,7 @@ static void bcache_device_free(struct bcache_device *d)
> }
>
> bioset_exit(&d->bio_split);
> + bioset_exit(&d->bio_detach);
> kvfree(d->full_dirty_stripes);
> kvfree(d->stripe_sectors_dirty);
>
> @@ -949,6 +950,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
> BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
> goto out_ida_remove;
>
> + if (bioset_init(&d->bio_detach, 4,
> + offsetof(struct detached_dev_io_private, bio),
> + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
> + goto out_bioset_split_exit;
> +
> if (lim.logical_block_size > PAGE_SIZE && cached_bdev) {
> /*
> * This should only happen with BCACHE_SB_VERSION_BDEV.
> @@ -964,7 +970,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
>
> d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
> if (IS_ERR(d->disk))
> - goto out_bioset_exit;
> + goto out_bioset_detach_exit;
>
> set_capacity(d->disk, sectors);
> snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
> @@ -976,7 +982,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
> d->disk->private_data = d;
> return 0;
>
> -out_bioset_exit:
> +out_bioset_detach_exit:
> + bioset_exit(&d->bio_detach);
> +out_bioset_split_exit:
> bioset_exit(&d->bio_split);
> out_ida_remove:
> ida_free(&bcache_device_idx, idx);
> --
> 2.34.1
I’ve tested this patch with the script below, and the results look good.
I couldn't find a standard test suite for the project, but I’d be happy to
integrate these tests into it if needed. Just let me know.
Thanks,
Shida
-----
#!/bin/bash
# cycle_test.sh - Automation for bcache detached bio-cloning patch
# --- CONFIGURATION ---
# Device paths and run parameters. Each value may be overridden from the
# environment (e.g. BACKING_DEV=/dev/sdc1 ./cycle_test.sh); the defaults
# below are used when the variable is unset or empty.
# WARNING: the script runs wipefs on BACKING_DEV and CACHE_DEV.
BACKING_DEV="${BACKING_DEV:-/dev/sdb1}"
CACHE_DEV="${CACHE_DEV:-/dev/nvme0n1p1}"
BCACHE_DEV="${BCACHE_DEV:-/dev/bcache0}"
# Number of create/detach/stress cycles to run.
ITERATIONS="${ITERATIONS:-3}"
# Duration of each fio run, in seconds.
FIO_RUNTIME="${FIO_RUNTIME:-60}"
# Abort on the first failing command.
set -e
# Print a status message prefixed with the current wall-clock time (HH:MM:SS)
# and preceded by a blank line.
#   $1 - message text (backslash escapes are interpreted by echo -e)
log() {
    local message=$1
    echo -e "\n[$(date +%T)] ${message}"
}
# --- CLEANUP HANDLER ---
# Best-effort teardown of everything a test cycle may have left behind:
# running fio jobs, the bcache device, the backing device registration and
# all registered cache sets.
#
# Errexit is switched off for the duration and is NOT restored on return;
# callers that want it must run `set -e` again afterwards.
cleanup() {
    local bdev_name backing_name cset

    set +e
    log "CLEANING UP RESOURCES..."

    if pgrep fio > /dev/null; then
        sudo pkill -9 fio
    fi

    # 1. Stop the logical bcache device
    if [ -b "$BCACHE_DEV" ]; then
        bdev_name=$(basename "$BCACHE_DEV")
        echo 1 | sudo tee "/sys/block/$bdev_name/bcache/stop" > /dev/null 2>&1 || true
    fi

    # 2. Unregister the backing device specifically if it's still active.
    # This is often the cause of "Device busy" during wipefs.
    # /sys/class/block is used because /sys/block only has entries for whole
    # disks, so a partition such as sdb1 would never be found there.
    backing_name=$(basename "$BACKING_DEV")
    if [ -d "/sys/class/block/$backing_name/bcache" ]; then
        echo 1 | sudo tee "/sys/class/block/$backing_name/bcache/stop" > /dev/null 2>&1 || true
    fi

    # 3. Unregister all bcache cache sets (directories named by UUID)
    for cset in /sys/fs/bcache/*-*-*-*-*; do
        if [ -d "$cset" ]; then
            echo 1 | sudo tee "$cset/unregister" > /dev/null 2>&1 || true
        fi
    done

    # 4. Wait for kernel/udev to catch up
    sudo udevadm settle
    sleep 2
    log "Cleanup complete."
}
# Run cleanup on any exit. SIGINT/SIGTERM are turned into a regular exit so
# that the EXIT trap fires once and the script actually stops; using cleanup
# directly as the signal handler would let execution resume after it returns.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Verify that every external tool the script relies on is available in PATH.
# Prints an error naming the first missing tool and exits with status 1.
check_deps() {
    local tool
    for tool in make-bcache fio iostat wipefs bc pgrep udevadm; do
        # Tool present: move on to the next one.
        command -v "$tool" &> /dev/null && continue
        echo "Error: $tool not found."
        exit 1
    done
}
# Run one full test cycle: wipe both devices, create the bcache device,
# detach the backing device, stress it with fio, check that the bcache
# device reports no residual utilisation, then tear everything down.
#   $1 - cycle number, used only in log messages
# Exits the script with status 1 if the bcache device does not appear, if
# %util cannot be read, or if %util is above 1.0 after the run.
run_cycle() {
    local i=$1
    local bdev_name state stats_line util

    log ">>> STARTING CYCLE #$i"

    # 1. Clean and Initialize
    log "Wiping devices..."
    # If wipefs fails, we try a quick dd to clear the headers
    sudo wipefs -a "$BACKING_DEV" ||
        (sudo dd if=/dev/zero of="$BACKING_DEV" bs=1M count=10 &&
            sudo wipefs -a "$BACKING_DEV")
    sudo wipefs -a "$CACHE_DEV"

    log "Creating bcache..."
    sudo make-bcache -B "$BACKING_DEV" -C "$CACHE_DEV"
    sleep 3
    if [ ! -b "$BCACHE_DEV" ]; then
        echo "Error: $BCACHE_DEV did not initialize."
        exit 1
    fi

    # 2. Detach
    log "Detaching backing device..."
    bdev_name=$(basename "$BCACHE_DEV")
    echo 1 | sudo tee "/sys/block/$bdev_name/bcache/detach" > /dev/null
    state=$(cat "/sys/block/$bdev_name/bcache/state")
    log "Device state: $state"

    # 3. Stress Test
    log "Running fio stress test (${FIO_RUNTIME}s)..."
    sudo fio --name=bcache_test --filename="$BCACHE_DEV" --rw=randrw --bs=4k \
        --direct=1 --ioengine=libaio --iodepth=64 --runtime="$FIO_RUNTIME" \
        --numjobs=4 --group_reporting --minimal > /dev/null

    # 4. Validation
    log "Validating I/O accounting..."
    sleep 5
    # Take the last of two iostat samples for the bcache device; %util is
    # read from the final column of that line.
    stats_line=$(iostat -x 1 2 "$bdev_name" | grep -w "$bdev_name" | tail -n 1)
    util=$(echo "$stats_line" | awk '{print $NF}')
    log "Final Stats -> %util: $util"
    # An empty value would make the numeric check below pass vacuously.
    if [ -z "$util" ]; then
        echo "!!! FAILURE: could not read %util for $bdev_name from iostat."
        exit 1
    fi
    if (( $(echo "$util > 1.0" | bc -l) )); then
        echo "!!! FAILURE: Accounting leak detected!"
        exit 1
    fi

    # 5. Cycle Teardown
    log "Cycle $i complete. Teardown..."
    # We call our cleanup function logic manually to ensure a clean
    # slate for the next iteration
    cleanup
    # Re-enable 'exit on error' for the next loop (cleanup turns it off)
    set -e
    log ">>> CYCLE #$i FINISHED"
}
# --- Main ---
check_deps
# Validate sudo credentials before the first cycle starts.
sudo -v
c=1
while (( c <= ITERATIONS )); do
    run_cycle "$c"
    # Extra breather between cycles
    sleep 2
    c=$((c + 1))
done
log "ALL CYCLES PASSED."
# Every cycle already ran cleanup itself, so drop the EXIT handler.
trap - EXIT
----
Powered by blists - more mailing lists