linux-kernel - Re: [RFC PATCH 1/1] virtio-blk: process block layer timedout request

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <f4c17344-33be-4832-9045-fb6b40a478b6@nvidia.com>
Date: Tue, 9 Jan 2024 03:33:28 +0000
From: Chaitanya Kulkarni <chaitanyak@...dia.com>
To: Stefan Hajnoczi <stefanha@...hat.com>, Chaitanya Kulkarni
	<chaitanyak@...dia.com>
CC: "virtualization@...ts.linux.dev" <virtualization@...ts.linux.dev>,
	"linux-block@...r.kernel.org" <linux-block@...r.kernel.org>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"mst@...hat.com" <mst@...hat.com>, "hch@....de" <hch@....de>,
	"jasowang@...hat.com" <jasowang@...hat.com>, "pbonzini@...hat.com"
	<pbonzini@...hat.com>, "xuanzhuo@...ux.alibaba.com"
	<xuanzhuo@...ux.alibaba.com>, "axboe@...nel.dk" <axboe@...nel.dk>, Ming Lei
	<ming.lei@...hat.com>
Subject: Re: [RFC PATCH 1/1] virtio-blk: process block layer timedout request

On 11/30/23 17:25, Stefan Hajnoczi wrote:
> On Wed, Nov 29, 2023 at 11:01:33PM -0800, Chaitanya Kulkarni wrote:
>> Improve block layer request handling by implementing a timeout handler.
>> Current implementation assums that request will never timeout and will
>> be completed by underlaying transport. However, this assumption can
>> cause issues under heavy load especially when dealing with different
>> subsystems and real hardware.
>>
>> To solve this, add a block layer request timeout handler that will
>> complete timed-out requests in the same context if the virtio device
>> has a VIRTIO_CONFIG_S_DRIVER_OK status. If the device has any other
>> status, we'll stop the block layer request queue and proceed with the
>> teardown sequence, allowing applications waiting for I/O to exit
>> gracefully with appropriate error.
>>
>> Also, add two new module parameters that allows user to specify the
>> I/O timeout for the tagset when allocating the disk and a teardown limit
>> for the timed out requeets before we initiate device teardown from the
>> timeout handler. These changes will improve the stability and
>> reliability of our system under request timeout scenario.
>>
>> Signed-off-by: Chaitanya Kulkarni <kch@...dia.com>
>> ---
>>   drivers/block/virtio_blk.c      | 122 ++++++++++++++++++++++++++++++++
>>   include/uapi/linux/virtio_blk.h |   1 +
>>   2 files changed, 123 insertions(+)
> Hi,
> This looks interesting. There was a discussion about implementing
> ->timeout() in September:
> https://lore.kernel.org/io-uring/20230926145526.GA387951@fedora/

Thanks for pointing that out ...

>> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
>> index 4689ac2e0c0e..da26c2bf933b 100644
>> --- a/drivers/block/virtio_blk.c
>> +++ b/drivers/block/virtio_blk.c
>> @@ -16,6 +16,7 @@
>>   #include <linux/blk-mq-virtio.h>
>>   #include <linux/numa.h>
>>   #include <linux/vmalloc.h>
>> +#include <linux/xarray.h>
>>   #include <uapi/linux/virtio_ring.h>
>>   
>>   #define PART_BITS 4
>> @@ -31,6 +32,15 @@
>>   #define VIRTIO_BLK_INLINE_SG_CNT	2
>>   #endif
>>   
>> +static unsigned int io_timeout = 20;
>> +module_param(io_timeout, uint, 0644);
>> +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O requests. Default:20");
>> +
>> +static unsigned int timeout_teardown_limit = 2;
>> +module_param(timeout_teardown_limit, uint, 0644);
>> +MODULE_PARM_DESC(timeout_teardown_limit,
>> +		"request timeout teardown limit for stable dev. Default:2");
>> +
>>   static unsigned int num_request_queues;
>>   module_param(num_request_queues, uint, 0644);
>>   MODULE_PARM_DESC(num_request_queues,
>> @@ -84,6 +94,20 @@ struct virtio_blk {
>>   
>>   	/* For zoned device */
>>   	unsigned int zone_sectors;
>> +
>> +	/*
>> +	 * Block layer Request timeout teardown limit when device is in the
>> +	 * stable state, i.e. it has VIRTIO_CONFIG_S_DRIVER_OK value for its
>> +	 * config status. Once this limit is reached issue
>> +	 * virtblk_teardown_work to teardown the device in the block lyaer
>> +	 * request timeout callback.
>> +	 */
>> +	atomic_t rq_timeout_count;
>> +	/* avoid tear down race between remove and teardown work */
>> +	struct mutex teardown_mutex;
>> +	/* tear down work to be scheduled from block layer request handler */
>> +	struct work_struct teardown_work;
>> +
>>   };
>>   
>>   struct virtblk_req {
>> @@ -117,6 +141,8 @@ static inline blk_status_t virtblk_result(u8 status)
>>   	case VIRTIO_BLK_S_OK:
>>   		return BLK_STS_OK;
>>   	case VIRTIO_BLK_S_UNSUPP:
>> +	case VIRTIO_BLK_S_TIMEOUT:
>> +		return BLK_STS_TIMEOUT;
>>   		return BLK_STS_NOTSUPP;
>>   	case VIRTIO_BLK_S_ZONE_OPEN_RESOURCE:
>>   		return BLK_STS_ZONE_OPEN_RESOURCE;
>> @@ -926,6 +952,7 @@ static void virtblk_free_disk(struct gendisk *disk)
>>   	struct virtio_blk *vblk = disk->private_data;
>>   
>>   	ida_free(&vd_index_ida, vblk->index);
>> +	mutex_destroy(&vblk->teardown_mutex);
>>   	mutex_destroy(&vblk->vdev_mutex);
>>   	kfree(vblk);
>>   }
>> @@ -1287,6 +1314,86 @@ static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
>>   	return found;
>>   }
>>   
>> +static bool virtblk_cancel_request(struct request *rq, void *data)
>> +{
>> +	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
>> +
>> +	vbr->in_hdr.status = VIRTIO_BLK_S_TIMEOUT;
>> +	if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq))
>> +		blk_mq_complete_request(rq);
>> +
>> +	return true;
>> +}
>> +
>> +static void virtblk_teardown_work(struct work_struct *w)
>> +{
>> +	struct virtio_blk *vblk =
>> +		container_of(w, struct virtio_blk, teardown_work);
>> +	struct request_queue *q = vblk->disk->queue;
>> +	struct virtio_device *vdev = vblk->vdev;
>> +	struct blk_mq_hw_ctx *hctx;
>> +	unsigned long idx;
>> +
>> +	mutex_lock(&vblk->teardown_mutex);
>> +	if (!vblk->vdev)
>> +		goto unlock;
>> +
>> +	blk_mq_quiesce_queue(q);
>> +
>> +	/* Process any outstanding request from device. */
>> +	xa_for_each(&q->hctx_table, idx, hctx)
>> +		virtblk_poll(hctx, NULL);
>> +
>> +	blk_sync_queue(q);
>> +	blk_mq_tagset_busy_iter(&vblk->tag_set, virtblk_cancel_request, vblk);
>> +	blk_mq_tagset_wait_completed_request(&vblk->tag_set);
>> +
>> +	/*
>> +	 * Unblock any pending dispatch I/Os before we destroy device. From
>> +	 * del_gendisk() -> __blk_mark_disk_dead(disk) will set GD_DEAD flag,
>> +	 * that will make sure any new I/O from bio_queue_enter() to fail.
>> +	 */
>> +	blk_mq_unquiesce_queue(q);
>> +	del_gendisk(vblk->disk);
>> +	blk_mq_free_tag_set(&vblk->tag_set);
>> +
>> +	mutex_lock(&vblk->vdev_mutex);
>> +	flush_work(&vblk->config_work);
>> +
>> +	virtio_reset_device(vdev);
>> +
>> +	vblk->vdev = NULL;
>> +
>> +	vdev->config->del_vqs(vdev);
>> +	kfree(vblk->vqs);
>> +
>> +	mutex_unlock(&vblk->vdev_mutex);
>> +
>> +	put_disk(vblk->disk);
>> +
>> +unlock:
>> +	mutex_unlock(&vblk->teardown_mutex);
>> +}
>> +
>> +static enum blk_eh_timer_return virtblk_timeout(struct request *req)
>> +{
>> +	struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
>> +	struct virtio_device *vdev = vblk->vdev;
>> +	bool ok = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK;
> Please check VIRTIO_CONFIG_S_NEEDS_RESET first. When
> VIRTIO_CONFIG_S_NEEDS_RESET is set the device is not ok, regardless of
> whether DRIVER_OK is set. See 2.1.1 Driver Requirements: Device Status
> Field in the VIRTIO specification.

okay will add that to next version ...

>
>> +
>> +	if ((atomic_dec_return(&vblk->rq_timeout_count) != 0) && ok) {
>> +		virtblk_cancel_request(req, NULL);
> The driver cannot abandon the request here because the device still owns
> the request:
>
> 1. I/O buffer memory is corrupted for READ requests and disk contents
>     are corrupted for WRITE requests when the device does finally process
>     the request.
>
> 2. After virtblk_timeout() -> virtblk_cancel_request() ->
>     blk_mq_complete_request(), a freed address is returned from
>     virtblk_done() -> virtqueue_get_buf() -> blk_mq_rq_from_pdu() when
>     the device finally completes the request.
>
> I suggest the following:
>
> (Optional) Add an ABORT/CANCEL request type to the VIRTIO specification
> and use it to safely cancel requests when the device supports it.

I really want to keep this simple without introducing a new command
so we don't have to modify the underlying transport to handle that.

> Otherwise reset the device so that all pending requests are canceled.

will issue reset here and see if that works ...

>> +		return BLK_EH_DONE;
>> +	}
>> +
>> +	dev_err(&vdev->dev, "%s:%s initiating teardown\n", __func__,
>> +		vblk->disk->disk_name);
>> +
>> +	queue_work(virtblk_wq, &vblk->teardown_work);
>> +
>> +	return BLK_EH_RESET_TIMER;
>> +}
>> +
>>   static const struct blk_mq_ops virtio_mq_ops = {
>>   	.queue_rq	= virtio_queue_rq,
>>   	.queue_rqs	= virtio_queue_rqs,
>> @@ -1294,6 +1401,7 @@ static const struct blk_mq_ops virtio_mq_ops = {
>>   	.complete	= virtblk_request_done,
>>   	.map_queues	= virtblk_map_queues,
>>   	.poll		= virtblk_poll,
>> +	.timeout	= virtblk_timeout,
>>   };
>>   
>>   static unsigned int virtblk_queue_depth;
>> @@ -1365,6 +1473,7 @@ static int virtblk_probe(struct virtio_device *vdev)
>>   	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
>>   	vblk->tag_set.ops = &virtio_mq_ops;
>>   	vblk->tag_set.queue_depth = queue_depth;
>> +	vblk->tag_set.timeout = io_timeout * HZ;
>>   	vblk->tag_set.numa_node = NUMA_NO_NODE;
>>   	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>>   	vblk->tag_set.cmd_size =
>> @@ -1387,6 +1496,10 @@ static int virtblk_probe(struct virtio_device *vdev)
>>   	}
>>   	q = vblk->disk->queue;
>>   
>> +	mutex_init(&vblk->teardown_mutex);
>> +	INIT_WORK(&vblk->teardown_work, virtblk_teardown_work);
>> +	atomic_set(&vblk->rq_timeout_count, timeout_teardown_limit);
>> +
>>   	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
>>   
>>   	vblk->disk->major = major;
>> @@ -1598,6 +1711,12 @@ static void virtblk_remove(struct virtio_device *vdev)
>>   {
>>   	struct virtio_blk *vblk = vdev->priv;
>>   
>> +	mutex_lock(&vblk->teardown_mutex);
>> +
>> +	/* we did the cleanup in the timeout handler */
>> +	if (!vblk->vdev)
>> +		goto unlock;
>> +
>>   	/* Make sure no work handler is accessing the device. */
>>   	flush_work(&vblk->config_work);
>>   
>> @@ -1618,6 +1737,9 @@ static void virtblk_remove(struct virtio_device *vdev)
>>   	mutex_unlock(&vblk->vdev_mutex);
>>   
>>   	put_disk(vblk->disk);
>> +
>> +unlock:
>> +	mutex_unlock(&vblk->teardown_mutex);
>>   }
>>   
>>   #ifdef CONFIG_PM_SLEEP
>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>> index 3744e4da1b2a..ed864195ab26 100644
>> --- a/include/uapi/linux/virtio_blk.h
>> +++ b/include/uapi/linux/virtio_blk.h
>> @@ -317,6 +317,7 @@ struct virtio_scsi_inhdr {
>>   #define VIRTIO_BLK_S_OK		0
>>   #define VIRTIO_BLK_S_IOERR	1
>>   #define VIRTIO_BLK_S_UNSUPP	2
>> +#define VIRTIO_BLK_S_TIMEOUT	3
> The structs and constants in this header file come from the VIRTIO
> specification. Anything changed in this file must first be accepted into
> the VIRTIO spec because this is the hardware interface definition.
>
> VIRTIO_BLK_S_TIMEOUT seems to be synthetic value that is purely used by
> software, not the device. Maybe there is no need to update the spec.
> Just avoid using in_hdr.status to signal timeouts and use a separate
> flag/field instead in a block layer or virtio_blk driver request struct.

It is a specific error hence I've added that on the similar lines,
do you have a specific field in mind that you would prefer ?

Thanks for the reply, just got back from time off, looking forward to
send V2 as soon as I get the reply.

-ck