lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20160727080729.7sgmqak42rw6vaok@mac>
Date:	Wed, 27 Jul 2016 10:07:29 +0200
From:	Roger Pau Monné <roger.pau@...rix.com>
To:	Bob Liu <bob.liu@...cle.com>
CC:	<linux-kernel@...r.kernel.org>, <xen-devel@...ts.xenproject.org>,
	<konrad.wilk@...cle.com>
Subject: Re: [PATCH v3] xen-blkfront: dynamic configuration of per-vbd
 resources

Hello,

Thanks for the fixes.

On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
> The current VBD layer reserves buffer space for each attached device based on
> three statically configured settings which are read at boot time.
>  * max_indirect_segs: Maximum amount of segments.
>  * max_ring_page_order: Maximum order of pages to be used for the shared ring.
>  * max_queues: Maximum of queues(rings) to be used.
> 
> But the storage backend, workload, and guest memory result in very different
> tuning requirements. It's impossible to centrally predict application
> characteristics so it's best to leave allow the settings can be dynamiclly
> adjusted based on workload inside the Guest.
> 
> Usage:
> Show current values:
> cat /sys/devices/vbd-xxx/max_indirect_segs
> cat /sys/devices/vbd-xxx/max_ring_page_order
> cat /sys/devices/vbd-xxx/max_queues
> 
> Write new values:
> echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs
> echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order
> echo <new value> > /sys/devices/vbd-xxx/max_queues
> 
> Signed-off-by: Bob Liu <bob.liu@...cle.com>
> --
> v3:
>  * Remove new_max_indirect_segments.
>  * Fix BUG_ON().
> v2:
>  * Rename to max_ring_page_order.
>  * Remove the waiting code suggested by Roger.
> ---
>  drivers/block/xen-blkfront.c |  277 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 269 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 1b4c380..57baa54 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -212,6 +212,10 @@ struct blkfront_info
>  	/* Save uncomplete reqs and bios for migration. */
>  	struct list_head requests;
>  	struct bio_list bio_list;
> +	/* For dynamic configuration. */
> +	unsigned int reconfiguring:1;
> +	unsigned int max_ring_page_order;
> +	unsigned int max_queues;

max_{ring_page_order/queues} should be after max_indirect_segments, and 
reconfigurng should be of type bool (moreover when below you assign 'true' 
or 'false' to it).

>  };
>  
>  static unsigned int nr_minors;
> @@ -1350,6 +1354,31 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>  	for (i = 0; i < info->nr_rings; i++)
>  		blkif_free_ring(&info->rinfo[i]);
>  
> +	/* Remove old xenstore nodes. */
> +	if (info->nr_ring_pages > 1)
> +		xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order");
> +
> +	if (info->nr_rings == 1) {
> +		if (info->nr_ring_pages == 1) {
> +			xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref");
> +		} else {
> +			for (i = 0; i < info->nr_ring_pages; i++) {
> +				char ring_ref_name[RINGREF_NAME_LEN];
> +
> +				snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
> +				xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name);
> +			}
> +		}
> +	} else {
> +		xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues");
> +
> +		for (i = 0; i < info->nr_rings; i++) {
> +			char queuename[QUEUE_NAME_LEN];
> +
> +			snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i);
> +			xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename);
> +		}
> +	}
>  	kfree(info->rinfo);
>  	info->rinfo = NULL;
>  	info->nr_rings = 0;
> @@ -1763,15 +1792,20 @@ static int talk_to_blkback(struct xenbus_device *dev,
>  	const char *message = NULL;
>  	struct xenbus_transaction xbt;
>  	int err;
> -	unsigned int i, max_page_order = 0;
> +	unsigned int i, backend_max_order = 0;
>  	unsigned int ring_page_order = 0;
>  
>  	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> -			   "max-ring-page-order", "%u", &max_page_order);
> +			   "max-ring-page-order", "%u", &backend_max_order);
>  	if (err != 1)
>  		info->nr_ring_pages = 1;
>  	else {
> -		ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
> +		if (info->max_ring_page_order)
> +			/* Dynamic configured through /sys. */
> +			ring_page_order = min(info->max_ring_page_order, backend_max_order);
> +		else
> +			/* Default. */
> +			ring_page_order = min(xen_blkif_max_ring_order, backend_max_order);
>  		info->nr_ring_pages = 1 << ring_page_order;

In order to avoid this 'if' here (and below), what do you think of assigning 
the global values to the blkfront_info structure in blkfront_probe? Then you 
would only have to do min(info->max_ring_page_order, backend_max_order) in 
order to get the min value.

>  	}
>  
> @@ -1894,7 +1928,13 @@ static int negotiate_mq(struct blkfront_info *info)
>  	if (err < 0)
>  		backend_max_queues = 1;
>  
> -	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
> +	if (info->max_queues)
> +		/* Dynamic configured through /sys */
> +		info->nr_rings = min(backend_max_queues, info->max_queues);
> +	else
> +		/* Default. */
> +		info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
> +
>  	/* We need at least one ring. */
>  	if (!info->nr_rings)
>  		info->nr_rings = 1;
> @@ -2352,11 +2392,198 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
>  			    NULL);
>  	if (err)
>  		info->max_indirect_segments = 0;
> -	else
> -		info->max_indirect_segments = min(indirect_segments,
> -						  xen_blkif_max_segments);
> +	else {
> +		if (info->max_indirect_segments)
> +			/* Dynamic configured through /sys */
> +			info->max_indirect_segments = min(indirect_segments,
> +							  info->max_indirect_segments);
> +		else
> +			info->max_indirect_segments = min(indirect_segments,
> +							  xen_blkif_max_segments);
> +	}
> +}
> +
> +static ssize_t max_ring_page_order_show(struct device *dev,
> +					struct device_attribute *attr, char *page)
> +{
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +
> +	return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE));
> +}
> +
> +static ssize_t max_indirect_segs_show(struct device *dev,
> +				      struct device_attribute *attr, char *page)
> +{
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +
> +	return sprintf(page, "%u\n", info->max_indirect_segments);
> +}
> +
> +static ssize_t max_queues_show(struct device *dev,
> +			       struct device_attribute *attr, char *page)
> +{
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +
> +	return sprintf(page, "%u\n", info->nr_rings);
> +}
> +
> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> +{
> +	/*
> +	 * Prevent new requests even to software request queue.
> +	 */
> +	blk_mq_freeze_queue(info->rq);
> +
> +	/*
> +	 * Guarantee no uncompleted reqs.
> +	 */
> +	if (part_in_flight(&info->gd->part0) || info->reconfiguring) {
> +		blk_mq_unfreeze_queue(info->rq);
> +		pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev));
> +		return -EBUSY;
> +	}
> +
> +	/*
> +	 * Frontend: 				Backend:
> +	 * Freeze_queue()
> +	 * Switch to XenbusStateClosed
> +	 *					frontend_changed(StateClosed)
> +	 *						> xen_blkif_disconnect()
> +	 *						> Switch to XenbusStateClosed
> +	 * blkback_changed(StateClosed)
> +	 *	> blkfront_resume()
> +	 *	> Switch to StateInitialised
> +	 *					frontend_changed(StateInitialised):
> +	 *						> reconnect
> +	 *						> Switch to StateConnected
> +	 * blkback_changed(StateConnected)
> +	 *	> blkif_recover()
> +	 *		> Also switch to StateConnected
> +	 *		> Unfreeze_queue()
> +	 */
> +	info->reconfiguring = true;

This is racy AFAICT. You should use an atomic test_and_set in the condition 
above or lock the blkfront_info mutex before and while setting 
reconfiguring. Without this two (or more) concurrent calls to 
dynamic_reconfig_device might succeed.

> +	xenbus_switch_state(info->xbdev, XenbusStateClosed);
> +
> +	return count;
> +}
> +
> +static ssize_t max_indirect_segs_store(struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	ssize_t ret;
> +	unsigned int max_segs = 0, backend_max_segs = 0;
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +	int err;
> +
> +	ret = kstrtouint(buf, 10, &max_segs);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (max_segs == info->max_indirect_segments)
> +		return count;
> +
> +	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
> +			    "feature-max-indirect-segments", "%u", &backend_max_segs,
> +			    NULL);
> +	if (err) {
> +		pr_err("Backend %s doesn't support feature-indirect-segments.\n",
> +			info->xbdev->otherend);
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (max_segs > backend_max_segs) {
> +		pr_err("Invalid max indirect segment (%u), backend-max: %u.\n",
> +			max_segs, backend_max_segs);
> +		return -EINVAL;
> +	}
> +
> +	info->max_indirect_segments = max_segs;
> +
> +	return dynamic_reconfig_device(info, count);
> +}
> +
> +static ssize_t max_ring_page_order_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count)
> +{
> +	ssize_t ret;
> +	unsigned int max_order = 0, backend_max_order = 0;
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +	int err;
> +
> +	ret = kstrtouint(buf, 10, &max_order);
> +	if (ret < 0)
> +		return ret;
> +
> +	if ((1 << max_order) == info->nr_ring_pages)
> +		return count;
> +
> +	if (max_order > XENBUS_MAX_RING_GRANT_ORDER) {
> +		pr_err("Invalid max_ring_page_order (%u), max: %u.\n",
> +				max_order, XENBUS_MAX_RING_GRANT_ORDER);
> +		return -EINVAL;
> +	}
> +
> +	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> +			   "max-ring-page-order", "%u", &backend_max_order);
> +	if (err != 1) {
> +		pr_err("Backend %s doesn't support feature multi-page-ring.\n",
> +			info->xbdev->otherend);
> +		return -EOPNOTSUPP;
> +	}
> +	if (max_order > backend_max_order) {
> +		pr_err("Invalid max_ring_page_order (%u), backend supports max: %u.\n",
> +			max_order, backend_max_order);
> +		return -EINVAL;
> +	}
> +	info->max_ring_page_order = max_order;
> +
> +	return dynamic_reconfig_device(info, count);
> +}
> +
> +static ssize_t max_queues_store(struct device *dev,
> +				struct device_attribute *attr,
> +				const char *buf, size_t count)
> +{
> +	ssize_t ret;
> +	unsigned int max_queues = 0, backend_max_queues = 0;
> +	struct blkfront_info *info = dev_get_drvdata(dev);
> +	int err;
> +
> +	ret = kstrtouint(buf, 10, &max_queues);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (max_queues == info->nr_rings)
> +		return count;
> +
> +	if (max_queues > num_online_cpus()) {
> +		pr_err("Invalid max_queues (%u), can't bigger than online cpus: %u.\n",
> +			max_queues, num_online_cpus());
> +		return -EINVAL;
> +	}
> +
> +	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> +			   "multi-queue-max-queues", "%u", &backend_max_queues);
> +	if (err != 1) {
> +		pr_err("Backend %s doesn't support block multi queue.\n",
> +			info->xbdev->otherend);
> +		return -EOPNOTSUPP;
> +	}
> +	if (max_queues > backend_max_queues) {
> +		pr_err("Invalid max_queues (%u), backend supports max: %u.\n",
> +			max_queues, backend_max_queues);
> +		return -EINVAL;
> +	}
> +	info->max_queues = max_queues;
> +
> +	return dynamic_reconfig_device(info, count);
>  }
>  
> +static DEVICE_ATTR_RW(max_queues);
> +static DEVICE_ATTR_RW(max_ring_page_order);
> +static DEVICE_ATTR_RW(max_indirect_segs);
> +
>  /*
>   * Invoked when the backend is finally 'ready' (and has told produced
>   * the details about the physical device - #sectors, size, etc).
> @@ -2393,6 +2620,10 @@ static void blkfront_connect(struct blkfront_info *info)
>  		 * supports indirect descriptors, and how many.
>  		 */
>  		blkif_recover(info);
> +		if (info->reconfiguring) {
> +			blk_mq_unfreeze_queue(info->rq);
> +			info->reconfiguring = false;
> +		}
>  		return;
>  
>  	default:
> @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info)
>  		return;
>  	}
>  
> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> +	if (err)
> +		goto fail;
> +
> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> +	if (err) {
> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> +		goto fail;
> +	}
> +
> +	err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues);
> +	if (err) {
> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> +		device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> +		goto fail;
> +	}
>  	xenbus_switch_state(info->xbdev, XenbusStateConnected);
>  
>  	/* Kick pending requests. */
> @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info)
>  	add_disk(info->gd);
>  
>  	info->is_ready = 1;
> +	return;
> +
> +fail:
> +	blkif_free(info, 0);
> +	xlvbd_release_gendisk(info);
> +	return;

Hm, I'm not sure whether this chunk should be in a separate patch, it seems 
like blkfront_connect doesn't properly cleanup on error (if 
xlvbd_alloc_gendisk fails blkif_free will not be called). Do you think you 
could send the addition of the 'fail' label as a separate patch and fix the 
error path of xlvbd_alloc_gendisk?

>  }
>  
>  /**
> @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev,
>  		break;
>  
>  	case XenbusStateClosed:
> -		if (dev->state == XenbusStateClosed)
> +		if (dev->state == XenbusStateClosed) {
> +			if (info->reconfiguring)
> +				if (blkfront_resume(info->xbdev)) {

Could you please join those two conditions:

if (info->reconfiguring && blkfront_resume(info->xbdev)) { ...

Also, I'm not sure this is correct, if blkfront sees the "Closing" state on 
blkback it will try to close the frontend and destroy the block device (see 
blkfront_closing), and this should be avoided. You should call 
blkfront_resume as soon as you see the backend move to the Closed or Closing 
states, without calling blkfront_closing.

> +					/* Resume failed. */
> +					info->reconfiguring = false;
> +					xenbus_switch_state(info->xbdev, XenbusStateClosed);
> +					pr_err("Resume from dynamic configuration failed\n");
> +				}
>  			break;
> +		}
>  		/* Missed the backend's Closing state -- fallthrough */
>  	case XenbusStateClosing:
>  		if (info)
> -- 
> 1.7.10.4

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ