linux-kernel - Re: [PATCH v2] bcache: use bio cloning for detached device requests

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANubcdX7eNbH_bo4-f94DUbdiEbt04Vxy1MPyhm+CZyXB01FuQ@mail.gmail.com>
Date: Tue, 20 Jan 2026 10:48:42 +0800
From: Stephen Zhang <starzhangzsd@...il.com>
To: Coly Li <colyli@...as.com>, Kent Overstreet <kent.overstreet@...ux.dev>, axboe@...nel.dk, 
	Sasha Levin <sashal@...nel.org>, Christoph Hellwig <hch@...radead.org>
Cc: linux-bcache@...r.kernel.org, 
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>, zhangshida <zhangshida@...inos.cn>
Subject: Re: [PATCH v2] bcache: use bio cloning for detached device requests

Stephen Zhang <starzhangzsd@...il.com> 于2026年1月20日周二 10:39写道：
>
> ---------- Forwarded message ---------
> 发件人： zhangshida <starzhangzsd@...il.com>
> Date: 2026年1月20日周二 10:35
> Subject: [PATCH v2] bcache: use bio cloning for detached device requests
> To: <colyli@...as.com>, <kent.overstreet@...ux.dev>,
> <axboe@...nel.dk>, <sashal@...nel.org>, <hch@...radead.org>
> Cc: <linux-bcache@...r.kernel.org>, <linux-kernel@...r.kernel.org>,
> <zhangshida@...inos.cn>, <starzhangzsd@...il.com>, Christoph Hellwig
> <hch@....de>
>
>
> From: Shida Zhang <zhangshida@...inos.cn>
>
> Previously, bcache hijacked the bi_end_io and bi_private fields of
> the incoming bio when the backing device was in a detached state.
> This is fragile and breaks if the bio needs to be processed by
> other layers.
>
> This patch transitions to using a cloned bio embedded within a private
> structure. This ensures the original bio's metadata remains untouched.
>
> Fixes: 53280e398471 ("bcache: fix improper use of bi_end_io")
> Co-developed-by: Christoph Hellwig <hch@....de>
> Signed-off-by: Christoph Hellwig <hch@....de>
> Signed-off-by: Shida Zhang <zhangshida@...inos.cn>
> ---
>
> Changelog:
> v1:
> https://lore.kernel.org/all/20260115074811.230807-1-zhangshida@kylinos.cn/
>
>  drivers/md/bcache/bcache.h  |  9 +++++
>  drivers/md/bcache/request.c | 79 ++++++++++++++++---------------------
>  drivers/md/bcache/super.c   | 12 +++++-
>  3 files changed, 54 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 8ccacba8554..54ff4e0238a 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -273,6 +273,8 @@ struct bcache_device {
>
>         struct bio_set          bio_split;
>
> +       struct bio_set          bio_detach;
> +
>         unsigned int            data_csum:1;
>
>         int (*cache_miss)(struct btree *b, struct search *s,
> @@ -753,6 +755,13 @@ struct bbio {
>         struct bio              bio;
>  };
>
> +struct detached_dev_io_private {
> +       struct bcache_device    *d;
> +       unsigned long           start_time;
> +       struct bio              *orig_bio;
> +       struct bio              bio;
> +};
> +
>  #define BTREE_PRIO             USHRT_MAX
>  #define INITIAL_PRIO           32768U
>
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index 82fdea7dea7..e0b12cb622b 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -1077,68 +1077,58 @@ static CLOSURE_CALLBACK(cached_dev_nodata)
>         continue_at(cl, cached_dev_bio_complete, NULL);
>  }
>
> -struct detached_dev_io_private {
> -       struct bcache_device    *d;
> -       unsigned long           start_time;
> -       bio_end_io_t            *bi_end_io;
> -       void                    *bi_private;
> -       struct block_device     *orig_bdev;
> -};
> -
>  static void detached_dev_end_io(struct bio *bio)
>  {
> -       struct detached_dev_io_private *ddip;
> -
> -       ddip = bio->bi_private;
> -       bio->bi_end_io = ddip->bi_end_io;
> -       bio->bi_private = ddip->bi_private;
> +       struct detached_dev_io_private *ddip =
> +               container_of(bio, struct detached_dev_io_private, bio);
> +       struct bio *orig_bio = ddip->orig_bio;
>
>         /* Count on the bcache device */
> -       bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev);
> +       bio_end_io_acct(orig_bio, ddip->start_time);
>
>         if (bio->bi_status) {
> -               struct cached_dev *dc = container_of(ddip->d,
> -                                                    struct cached_dev, disk);
> +               struct cached_dev *dc = bio->bi_private;
> +
>                 /* should count I/O error for backing device here */
>                 bch_count_backing_io_errors(dc, bio);
> +               orig_bio->bi_status = bio->bi_status;
>         }
>
> -       kfree(ddip);
> -       bio_endio(bio);
> +       bio_put(bio);
> +       bio_endio(orig_bio);
>  }
>
> -static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
> -               struct block_device *orig_bdev, unsigned long start_time)
> +static void detached_dev_do_request(struct bcache_device *d,
> +               struct bio *orig_bio, unsigned long start_time)
>  {
>         struct detached_dev_io_private *ddip;
>         struct cached_dev *dc = container_of(d, struct cached_dev, disk);
> +       struct bio *clone_bio;
>
> -       /*
> -        * no need to call closure_get(&dc->disk.cl),
> -        * because upper layer had already opened bcache device,
> -        * which would call closure_get(&dc->disk.cl)
> -        */
> -       ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
> -       if (!ddip) {
> -               bio->bi_status = BLK_STS_RESOURCE;
> -               bio_endio(bio);
> +       if (bio_op(orig_bio) == REQ_OP_DISCARD &&
> +           !bdev_max_discard_sectors(dc->bdev)) {
> +               bio_endio(orig_bio);
>                 return;
>         }
>
> -       ddip->d = d;
> +       clone_bio = bio_alloc_clone(dc->bdev, orig_bio, GFP_NOIO,
> +                                   &d->bio_detach);
> +       if (!clone_bio) {
> +               orig_bio->bi_status = BLK_STS_RESOURCE;
> +               bio_endio(orig_bio);
> +               return;
> +       }
> +
> +       ddip = container_of(clone_bio, struct detached_dev_io_private, bio);
>         /* Count on the bcache device */
> -       ddip->orig_bdev = orig_bdev;
> +       ddip->d = d;
>         ddip->start_time = start_time;
> -       ddip->bi_end_io = bio->bi_end_io;
> -       ddip->bi_private = bio->bi_private;
> -       bio->bi_end_io = detached_dev_end_io;
> -       bio->bi_private = ddip;
> -
> -       if ((bio_op(bio) == REQ_OP_DISCARD) &&
> -           !bdev_max_discard_sectors(dc->bdev))
> -               detached_dev_end_io(bio);
> -       else
> -               submit_bio_noacct(bio);
> +       ddip->orig_bio = orig_bio;
> +
> +       clone_bio->bi_end_io = detached_dev_end_io;
> +       clone_bio->bi_private = dc;
> +
> +       submit_bio_noacct(clone_bio);
>  }
>
>  static void quit_max_writeback_rate(struct cache_set *c,
> @@ -1214,10 +1204,10 @@ void cached_dev_submit_bio(struct bio *bio)
>
>         start_time = bio_start_io_acct(bio);
>
> -       bio_set_dev(bio, dc->bdev);
>         bio->bi_iter.bi_sector += dc->sb.data_offset;
>
>         if (cached_dev_get(dc)) {
> +               bio_set_dev(bio, dc->bdev);
>                 s = search_alloc(bio, d, orig_bdev, start_time);
>                 trace_bcache_request_start(s->d, bio);
>
> @@ -1237,9 +1227,10 @@ void cached_dev_submit_bio(struct bio *bio)
>                         else
>                                 cached_dev_read(dc, s);
>                 }
> -       } else
> +       } else {
>                 /* I/O request sent to backing device */
> -               detached_dev_do_request(d, bio, orig_bdev, start_time);
> +               detached_dev_do_request(d, bio, start_time);
> +       }
>  }
>
>  static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode,
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index c17d4517af2..d4b798668c8 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -887,6 +887,7 @@ static void bcache_device_free(struct bcache_device *d)
>         }
>
>         bioset_exit(&d->bio_split);
> +       bioset_exit(&d->bio_detach);
>         kvfree(d->full_dirty_stripes);
>         kvfree(d->stripe_sectors_dirty);
>
> @@ -949,6 +950,11 @@ static int bcache_device_init(struct
> bcache_device *d, unsigned int block_size,
>                         BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
>                 goto out_ida_remove;
>
> +       if (bioset_init(&d->bio_detach, 4,
> +                       offsetof(struct detached_dev_io_private, bio),
> +                       BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
> +               goto out_bioset_split_exit;
> +
>         if (lim.logical_block_size > PAGE_SIZE && cached_bdev) {
>                 /*
>                  * This should only happen with BCACHE_SB_VERSION_BDEV.
> @@ -964,7 +970,7 @@ static int bcache_device_init(struct bcache_device
> *d, unsigned int block_size,
>
>         d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
>         if (IS_ERR(d->disk))
> -               goto out_bioset_exit;
> +               goto out_bioset_detach_exit;
>
>         set_capacity(d->disk, sectors);
>         snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
> @@ -976,7 +982,9 @@ static int bcache_device_init(struct bcache_device
> *d, unsigned int block_size,
>         d->disk->private_data   = d;
>         return 0;
>
> -out_bioset_exit:
> +out_bioset_detach_exit:
> +       bioset_exit(&d->bio_detach);
> +out_bioset_split_exit:
>         bioset_exit(&d->bio_split);
>  out_ida_remove:
>         ida_free(&bcache_device_idx, idx);
> --
> 2.34.1

I’ve tested this patch with the script below, and the results look good.
I couldn't find a standard test suite for the project, but I’d be
happy to integrate these tests into it if needed. Just let me know.

Thanks,
Shida
-----
#!/bin/bash
# cycle_test.sh - Automation for bcache detached bio-cloning patch

# --- CONFIGURATION ---
# Devices and limits used by the test. WARNING: every cycle runs
# "wipefs -a" on both BACKING_DEV and CACHE_DEV, destroying their contents.
BACKING_DEV='/dev/sdb1'         # passed to make-bcache -B
CACHE_DEV='/dev/nvme0n1p1'      # passed to make-bcache -C
BCACHE_DEV='/dev/bcache0'       # node expected to exist after make-bcache
ITERATIONS=3                    # number of cycles the main loop runs
FIO_RUNTIME=60                  # passed to fio --runtime

# Abort the script as soon as any command fails.
set -o errexit

# Print a message on stdout, preceded by an empty line and a [HH:MM:SS]
# timestamp. Backslash escapes in the message ($1) are expanded.
log() {
    printf '\n[%s] %b\n' "$(date +%T)" "$1"
}

# --- CLEANUP HANDLER ---
# Tear down whatever a test cycle may have left behind: running fio jobs,
# the bcache device, the backing device registration and all cache sets.
#
# Every step is best-effort. Note that "set +e" is NOT restored on return,
# so a caller that wants errexit back must re-enable it itself.
cleanup() {
    local bdev_name backing_name cset

    set +e
    log "CLEANING UP RESOURCES..."

    if pgrep fio > /dev/null; then
        sudo pkill -9 fio
    fi

    # 1. Stop the logical bcache device
    if [ -b "$BCACHE_DEV" ]; then
        bdev_name=$(basename "$BCACHE_DEV")
        echo 1 | sudo tee "/sys/block/$bdev_name/bcache/stop" > /dev/null 2>&1 || true
    fi

    # 2. Unregister the backing device specifically if it's still active
    # This is often the cause of "Device busy" during wipefs.
    # Look the device up through /sys/class/block: it has entries for
    # partitions (such as the default /dev/sdb1) as well as whole disks,
    # whereas /sys/block only lists whole disks.
    backing_name=$(basename "$BACKING_DEV")
    if [ -d "/sys/class/block/$backing_name/bcache" ]; then
        echo 1 | sudo tee "/sys/class/block/$backing_name/bcache/stop" > /dev/null 2>&1 || true
    fi

    # 3. Unregister all bcache cache sets (directories named by UUID)
    for cset in /sys/fs/bcache/*-*-*-*-*; do
        # The -d test also skips the literal pattern when nothing matches.
        if [ -d "$cset" ]; then
            echo 1 | sudo tee "$cset/unregister" > /dev/null 2>&1 || true
        fi
    done

    # 4. Wait for kernel/udev to catch up
    sudo udevadm settle
    sleep 2
    log "Cleanup complete."
}

# Run cleanup once, whenever the script exits for any reason.
# SIGINT/SIGTERM are converted into a regular exit (status 128 + signal
# number) so that an interrupted run stops instead of resuming after the
# handler returns; the EXIT trap then performs the cleanup.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Verify that the external commands the script depends on are available.
# On the first missing command, print "Error: <cmd> not found." to stderr
# and exit with status 1.
check_deps() {
    local cmd

    # sudo, tee and awk are used throughout the script, so check them too.
    for cmd in make-bcache fio iostat wipefs bc pgrep udevadm sudo tee awk; do
        if ! command -v "$cmd" > /dev/null 2>&1; then
            echo "Error: $cmd not found." >&2
            exit 1
        fi
    done
}

# Run one complete test cycle:
#   wipe both devices, create the bcache device, detach the backing device,
#   stress the detached device with fio, check that iostat reports the
#   device as idle afterwards, then tear everything down.
#
# $1 - cycle number, used only in log messages.
#
# Exits the script with status 1 if the bcache device does not appear, if
# %util cannot be read, or if %util is above 1.0 after the I/O has finished.
run_cycle() {
    local i=$1
    local bdev_name state stats_line util

    log ">>> STARTING CYCLE #$i"

    # 1. Clean and Initialize
    log "Wiping devices..."
    # If wipefs fails, we try a quick dd to clear the headers
    sudo wipefs -a "$BACKING_DEV" ||
        (sudo dd if=/dev/zero of="$BACKING_DEV" bs=1M count=10 &&
         sudo wipefs -a "$BACKING_DEV")
    sudo wipefs -a "$CACHE_DEV"

    log "Creating bcache..."
    sudo make-bcache -B "$BACKING_DEV" -C "$CACHE_DEV"
    sleep 3

    if [ ! -b "$BCACHE_DEV" ]; then
        echo "Error: $BCACHE_DEV did not initialize." >&2
        exit 1
    fi

    # 2. Detach
    log "Detaching backing device..."
    bdev_name=$(basename "$BCACHE_DEV")
    echo 1 | sudo tee "/sys/block/$bdev_name/bcache/detach" > /dev/null

    state=$(cat "/sys/block/$bdev_name/bcache/state")
    log "Device state: $state"

    # 3. Stress Test
    log "Running fio stress test (${FIO_RUNTIME}s)..."
    sudo fio --name=bcache_test --filename="$BCACHE_DEV" --rw=randrw --bs=4k \
         --direct=1 --ioengine=libaio --iodepth=64 --runtime="$FIO_RUNTIME" \
         --numjobs=4 --group_reporting --minimal > /dev/null

    # 4. Validation
    log "Validating I/O accounting..."
    sleep 5

    # Use the last matching line: the second iostat report covers the most
    # recent one-second interval, the first one covers the time since boot.
    stats_line=$(iostat -x 1 2 "$bdev_name" | grep -w "$bdev_name" | tail -n 1)
    # %util is the last column of the extended report.
    util=$(echo "$stats_line" | awk '{print $NF}')
    log "Final Stats -> %util: $util"

    # Without a reading the comparison below would silently pass.
    if [ -z "$util" ]; then
        echo "Error: could not read %util for $bdev_name from iostat." >&2
        exit 1
    fi

    # The device is idle now; a non-trivial %util means in-flight I/O was
    # started in the accounting but never completed.
    if (( $(echo "$util > 1.0" | bc -l) )); then
        echo "!!! FAILURE: Accounting leak detected!" >&2
        exit 1
    fi

    # 5. Cycle Teardown
    log "Cycle $i complete. Teardown..."
    # We call our cleanup function logic manually to ensure a clean
    # slate for the next iteration
    cleanup

    # Re-enable 'exit on error' for the next loop (cleanup turns it off)
    set -e
    log ">>> CYCLE #$i FINISHED"
}

# --- Main ---
# Check prerequisites and refresh the cached sudo credentials up front,
# then run ITERATIONS cycles, numbered from 1.
check_deps
sudo -v

c=1
while (( c <= ITERATIONS )); do
    run_cycle "$c"
    # Extra breather between cycles
    sleep 2
    c=$(( c + 1 ))
done

log "ALL CYCLES PASSED."
# Each cycle has already called cleanup itself, so drop the EXIT handler.
trap - EXIT
----