linux-kernel - Re: [Xen-devel] [PATCH v1] xen-blkfront: dynamic configuration of per-vbd resources

Open Source and information security mailing list archives

Message-ID: <586dead3-7392-2873-e23a-1236ef14da3b@gmail.com>
Date:   Fri, 6 Apr 2018 13:13:32 +0300
From:   Oleksandr Andrushchenko <andr2000@...il.com>
To:     Roger Pau Monné <roger.pau@...rix.com>,
        Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
Cc:     Somasundaram Krishnasamy <somasundaram.krishnasamy@...cle.com>,
        Bob Liu <bob.liu@...cle.com>, linux-kernel@...r.kernel.org,
        xen-devel@...ts.xen.org
Subject: Re: [Xen-devel] [PATCH v1] xen-blkfront: dynamic configuration of
 per-vbd resources

On 04/03/2018 02:22 PM, Roger Pau Monné wrote:
> On Mon, Apr 02, 2018 at 01:42:32PM -0400, Konrad Rzeszutek Wilk wrote:
>> From: Bob Liu <bob.liu@...cle.com>
>>
>> The current VBD layer reserves buffer space for each attached device based on
>> three statically configured settings which are read at boot time.
>>   * max_indirect_segs: Maximum number of segments.
>>   * max_ring_page_order: Maximum order of pages to be used for the shared ring.
>>   * max_queues: Maximum number of queues (rings) to be used.
>>
>> But the storage backend, workload, and guest memory result in very different
>> tuning requirements. It's impossible to centrally predict application
>> characteristics, so it's best to allow the settings to be dynamically
>> adjusted based on the workload inside the Guest.
>>
>> Usage:
>> Show current values:
>> cat /sys/devices/vbd-xxx/max_indirect_segs
>> cat /sys/devices/vbd-xxx/max_ring_page_order
>> cat /sys/devices/vbd-xxx/max_queues
>>
>> Write new values:
>> echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs
>> echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order
>> echo <new value> > /sys/devices/vbd-xxx/max_queues
>>
>> Signed-off-by: Bob Liu <bob.liu@...cle.com>
>> Signed-off-by: Somasundaram Krishnasamy <somasundaram.krishnasamy@...cle.com>
>> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
>> ---
>>   drivers/block/xen-blkfront.c | 320 ++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 304 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
>> index 92ec1bbece51..4ebd368f4d1a 100644
>> --- a/drivers/block/xen-blkfront.c
>> +++ b/drivers/block/xen-blkfront.c
>> @@ -46,6 +46,7 @@
>>   #include <linux/scatterlist.h>
>>   #include <linux/bitmap.h>
>>   #include <linux/list.h>
>> +#include <linux/delay.h>
>>   
>>   #include <xen/xen.h>
>>   #include <xen/xenbus.h>
>> @@ -217,6 +218,11 @@ struct blkfront_info
>>   	/* Save uncomplete reqs and bios for migration. */
>>   	struct list_head requests;
>>   	struct bio_list bio_list;
>> +	/* For dynamic configuration. */
>> +	unsigned int reconfiguring:1;
> bool reconfiguring:1 maybe?
>
> And I would likely place it together with the feature_ fields, so that
> no more padding is added to the struct.
>
>> +	int new_max_indirect_segments;
>> +	int new_max_ring_page_order;
>> +	int new_max_queues;
> All the ints should be unsigned ints AFAICT.
>
>>   };
>>   
>>   static unsigned int nr_minors;
>> @@ -1355,6 +1361,31 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>>   	for (i = 0; i < info->nr_rings; i++)
>>   		blkif_free_ring(&info->rinfo[i]);
>>   
>> +	/* Remove old xenstore nodes. */
>> +	if (info->nr_ring_pages > 1)
>> +		xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order");
>> +
>> +	if (info->nr_rings == 1) {
>> +		if (info->nr_ring_pages == 1) {
>> +			xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref");
>> +		} else {
>> +			for (i = 0; i < info->nr_ring_pages; i++) {
>> +				char ring_ref_name[RINGREF_NAME_LEN];
>> +
>> +				snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
>> +				xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name);
>> +			}
>> +		}
>> +	} else {
>> +		xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues");
>> +
>> +		for (i = 0; i < info->nr_rings; i++) {
>> +			char queuename[QUEUE_NAME_LEN];
>> +
>> +			snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i);
>> +			xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename);
>> +		}
>> +	}
>>   	kfree(info->rinfo);
>>   	info->rinfo = NULL;
>>   	info->nr_rings = 0;
>> @@ -1778,10 +1809,18 @@ static int talk_to_blkback(struct xenbus_device *dev,
>>   	if (!info)
>>   		return -ENODEV;
>>   
>> -	max_page_order = xenbus_read_unsigned(info->xbdev->otherend,
>> -					      "max-ring-page-order", 0);
>> -	ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
>> -	info->nr_ring_pages = 1 << ring_page_order;
>> +	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
>> +			   "max-ring-page-order", "%u", &max_page_order);
>> +	if (err != 1)
>> +		info->nr_ring_pages = 1;
>> +	else {
>> +		ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
>> +		if (info->new_max_ring_page_order) {
>> +			BUG_ON(info->new_max_ring_page_order > max_page_order);
Do you really want to BUG_ON here? IMO, this is just a misconfiguration
which can happen, but you will bring the whole domain down with this...
>> +			ring_page_order = info->new_max_ring_page_order;
>> +		}
>> +		info->nr_ring_pages = 1 << ring_page_order;
>> +	}
> You could likely simplify this as:
>
> max_page_order = xenbus_read_unsigned(info->xbdev->otherend,
> 				      "max-ring-page-order", 0);
> if (info->new_max_ring_page_order) {
> 	BUG_ON(info->new_max_ring_page_order > max_page_order);
> 	info->nr_ring_pages = 1 << info->new_max_ring_page_order;
> } else
> 	info->nr_ring_pages = 1 << min(xen_blkif_max_ring_order, max_page_order);
>
> I'm not sure of the benefit of switching the xenbus_read_unsigned to a
> xenbus_scanf. IMO it seems to make the code more complex.
>
>>   
>>   	err = negotiate_mq(info);
>>   	if (err)
>> @@ -1903,6 +1942,10 @@ static int negotiate_mq(struct blkfront_info *info)
>>   	backend_max_queues = xenbus_read_unsigned(info->xbdev->otherend,
>>   						  "multi-queue-max-queues", 1);
>>   	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
>> +	if (info->new_max_queues) {
>> +		BUG_ON(info->new_max_queues > backend_max_queues);
Again, why BUG_ON?
>> +		info->nr_rings = info->new_max_queues;
>> +	}
>>   	/* We need at least one ring. */
>>   	if (!info->nr_rings)
>>   		info->nr_rings = 1;
>> @@ -2261,6 +2304,8 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
>>    */
>>   static void blkfront_gather_backend_features(struct blkfront_info *info)
>>   {
>> +	int err;
>> +	int persistent;
> unsigned int. You use the '%u' format specifier below.
>
>>   	unsigned int indirect_segments;
>>   
>>   	info->feature_flush = 0;
>> @@ -2291,19 +2336,241 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
>>   	if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
>>   		blkfront_setup_discard(info);
>>   
>> -	info->feature_persistent =
>> -		!!xenbus_read_unsigned(info->xbdev->otherend,
>> -				       "feature-persistent", 0);
>> +	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
>> +			"feature-persistent", "%u", &persistent,
>> +			NULL);
>> +
>> +	info->feature_persistent = err ? 0 : persistent;
>> +
>> +	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
>> +			    "feature-max-indirect-segments", "%u", &indirect_segments,
>> +			    NULL);
>> +	if (err)
>> +		info->max_indirect_segments = 0;
>> +	else {
>> +		info->max_indirect_segments = min(indirect_segments,
>> +						  xen_blkif_max_segments);
>> +		if (info->new_max_indirect_segments) {
>> +			BUG_ON(info->new_max_indirect_segments > indirect_segments);
And here
>> +			info->max_indirect_segments = info->new_max_indirect_segments;
>> +		}
>> +	}
> Again I think using xenbus_read_unsigned makes the code simpler, see
> the suggestion regarding new_max_ring_page_order.
>
>> +}
>> +
>> +static ssize_t max_ring_page_order_show(struct device *dev,
>> +					struct device_attribute *attr, char *page)
>> +{
>> +	struct blkfront_info *info = dev_get_drvdata(dev);
>> +
>> +	return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE));
get_order returns int? "%u" -> "%d"?
>> +}
>> +
>> +static ssize_t max_indirect_segs_show(struct device *dev,
>> +				      struct device_attribute *attr, char *page)
>> +{
>> +	struct blkfront_info *info = dev_get_drvdata(dev);
>> +
>> +	return sprintf(page, "%u\n", info->max_indirect_segments);
new_max_indirect_segments is currently defined as int
>> +}
>> +
>> +static ssize_t max_queues_show(struct device *dev,
>> +			       struct device_attribute *attr, char *page)
>> +{
>> +	struct blkfront_info *info = dev_get_drvdata(dev);
>> +
>> +	return sprintf(page, "%u\n", info->nr_rings);
>> +}
>> +
>> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> Not sure you need to pass 'count' here. dynamic_reconfig_device
> doesn't care about count at all. This function should just return < 0
> for error or 0 on success.
and also why ssize_t, not size_t?
>
>> +{
>> +	unsigned int i;
>> +	int err = -EBUSY;
>> +	unsigned int inflight;
>> +
>> +	/*
>> +	 * Make sure no migration in parallel, device lock is actually a
>> +	 * mutex.
>> +	 */
>> +	if (!device_trylock(&info->xbdev->dev)) {
>> +		pr_err("Fail to acquire dev:%s lock, may be in migration.\n",
>> +			dev_name(&info->xbdev->dev));
>> +		return err;
>> +	}
>> +
>> +	/*
>> +	 * Prevent new requests and guarantee no uncompleted reqs.
>> +	 */
>> +	blk_mq_freeze_queue(info->rq);
>> +	inflight = atomic_read(&info->gd->part0.in_flight[0]) +
>> +		   atomic_read(&info->gd->part0.in_flight[1]);
>> +	if (inflight)
>> +		goto out;
> Er, I'm not sure I like this approach. Why not just switch the state
> to closed, wait for the backend to also switch to closed, reconnect
> and then requeue any pending requests on the shadow copy of the ring?
>
> Basically like what is currently done for migration.
>
>> +
>> +	/*
>> +	 * Front 				Backend
>> +	 * Switch to XenbusStateClosed
>> +	 *					frontend_changed():
>> +	 *					 case XenbusStateClosed:
>> +	 *						xen_blkif_disconnect()
>> +	 *						Switch to XenbusStateClosed
>> +	 * blkfront_resume():
>> +	 *					frontend_changed():
>> +	 *						reconnect
>> +	 * Wait until XenbusStateConnected
>> +	 */
>> +	info->reconfiguring = true;
Not sure if this is directly applicable, but can we finally make
use of XenbusStateReconfiguring/XenbusStateReconfigured bus
states? We have it somewhat implemented in PV DRM [1]
>> +	xenbus_switch_state(info->xbdev, XenbusStateClosed);
>> +
>> +	/* Poll every 100ms, 1 minute timeout. */
>> +	for (i = 0; i < 600; i++) {
>> +		/*
>> +		 * Wait backend enter XenbusStateClosed, blkback_changed()
>> +		 * will clear reconfiguring.
>> +		 */
>> +		if (!info->reconfiguring)
>> +			goto resume;
>> +		schedule_timeout_interruptible(msecs_to_jiffies(100));
>> +	}
>> +	goto out;
> This shouldn't be done with a busy loop. Why not do this in
> blkback_changed instead?
>
>> +
>> +resume:
>> +	if (blkfront_resume(info->xbdev))
>> +		goto out;
>> +
>> +	/* Poll every 100ms, 1 minute timeout. */
>> +	for (i = 0; i < 600; i++) {
>> +		/* Wait blkfront enter StateConnected which is done by blkif_recover(). */
>> +		if (info->xbdev->state == XenbusStateConnected) {
>> +			err = count;
>> +			goto out;
>> +		}
>> +		schedule_timeout_interruptible(msecs_to_jiffies(100));
>> +	}
>> +
>> +out:
>> +	blk_mq_unfreeze_queue(info->rq);
>> +	device_unlock(&info->xbdev->dev);
>> +
>> +	return err;
>> +}
>> +
>> +static ssize_t max_indirect_segs_store(struct device *dev,
>> +		struct device_attribute *attr, const char *buf, size_t count)
>> +{
>> +	ssize_t ret;
>> +	unsigned int max_segs = 0, backend_max_segs = 0;
>> +	struct blkfront_info *info = dev_get_drvdata(dev);
>> +	int err;
>> +
>> +	ret = kstrtouint(buf, 10, &max_segs);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	if (max_segs == info->max_indirect_segments)
>> +		return count;
>> +
>> +	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
>> +			    "feature-max-indirect-segments", "%u", &backend_max_segs,
> Having to read all the backend features every time the user writes to
> the device nodes seems inefficient, although I assume this is not
> supposed to happen frequently...
>
>> +			    NULL);
>> +	if (err) {
>> +		pr_err("Backend %s doesn't support feature-indirect-segments.\n",
>> +			info->xbdev->otherend);
>> +		return -EOPNOTSUPP;
>> +	}
>> +
>> +	if (max_segs > backend_max_segs) {
>> +		pr_err("Invalid max indirect segment (%u), backend-max: %u.\n",
>> +			max_segs, backend_max_segs);
>> +		return -EINVAL;
>> +	}
>>   
>> -	indirect_segments = xenbus_read_unsigned(info->xbdev->otherend,
>> -					"feature-max-indirect-segments", 0);
>> -	if (indirect_segments > xen_blkif_max_segments)
>> -		indirect_segments = xen_blkif_max_segments;
>> -	if (indirect_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST)
>> -		indirect_segments = 0;
>> -	info->max_indirect_segments = indirect_segments;
>> +	info->new_max_indirect_segments = max_segs;
>> +
>> +	return dynamic_reconfig_device(info, count);
> No need to pass count, just use:
>
> return dynamic_reconfig_device(info) ?: count;
>
> (same for all the cases below).
>
>>   }
>>   
>> +static ssize_t max_ring_page_order_store(struct device *dev,
>> +				 struct device_attribute *attr,
>> +				 const char *buf, size_t count)
>> +{
>> +	ssize_t ret;
>> +	unsigned int max_order = 0, backend_max_order = 0;
>> +	struct blkfront_info *info = dev_get_drvdata(dev);
>> +	int err;
>> +
>> +	ret = kstrtouint(buf, 10, &max_order);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	if ((1 << max_order) == info->nr_ring_pages)
>> +		return count;
>> +
>> +	if (max_order > XENBUS_MAX_RING_GRANT_ORDER) {
>> +		pr_err("Invalid max_ring_page_order (%u), max: %u.\n",
>> +			max_order, XENBUS_MAX_RING_GRANT_ORDER);
>> +		return -EINVAL;
>> +	}
>> +
>> +	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
>> +			   "max-ring-page-order", "%u", &backend_max_order);
>> +	if (err != 1) {
>> +		pr_err("Backend %s doesn't support feature multi-page-ring.\n",
>> +			info->xbdev->otherend);
>> +		return -EOPNOTSUPP;
>> +	}
>> +	if (max_order > backend_max_order) {
>> +		pr_err("Invalid max_ring_page_order (%u), backend supports max: %u.\n",
>> +			max_order, backend_max_order);
>> +		return -EINVAL;
>> +	}
>> +	info->new_max_ring_page_order = max_order;
>> +
>> +	return dynamic_reconfig_device(info, count);
>> +}
>> +
>> +static ssize_t max_queues_store(struct device *dev,
>> +				struct device_attribute *attr,
>> +				const char *buf, size_t count)
>> +{
>> +	ssize_t ret;
>> +	unsigned int max_queues = 0, backend_max_queues = 0;
>> +	struct blkfront_info *info = dev_get_drvdata(dev);
>> +	int err;
>> +
>> +	ret = kstrtouint(buf, 10, &max_queues);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	if (max_queues == info->nr_rings)
>> +		return count;
>> +
>> +	if (max_queues > num_online_cpus()) {
>> +		pr_err("Invalid max_queues (%u), can't bigger than online cpus: %u.\n",
>> +			max_queues, num_online_cpus());
>> +		return -EINVAL;
>> +	}
>> +
>> +	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
>> +			   "multi-queue-max-queues", "%u", &backend_max_queues);
>> +	if (err != 1) {
>> +		pr_err("Backend %s doesn't support block multi queue.\n",
>> +			info->xbdev->otherend);
>> +		return -EOPNOTSUPP;
>> +	}
>> +	if (max_queues > backend_max_queues) {
>> +		pr_err("Invalid max_queues (%u), backend supports max: %u.\n",
>> +			max_queues, backend_max_queues);
>> +		return -EINVAL;
>> +	}
>> +	info->new_max_queues = max_queues;
>> +
>> +	return dynamic_reconfig_device(info, count);
>> +}
>> +
>> +static DEVICE_ATTR_RW(max_queues);
>> +static DEVICE_ATTR_RW(max_ring_page_order);
>> +static DEVICE_ATTR_RW(max_indirect_segs);
> Can't you just use the same attribute for all the nodes? Also this
> could be:
>
> const static DEVICE_ATTR_RW(node_attr);
>
> Thanks, Roger.
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@...ts.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
[1] 
https://cgit.freedesktop.org/drm-misc/commit/?id=c575b7eeb89f94356997abd62d6d5a0590e259b7

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives