[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1427804127-11372-2-git-send-email-bob.liu@oracle.com>
Date:	Tue, 31 Mar 2015 20:15:27 +0800
From:	Bob Liu <bob.liu@...cle.com>
To:	xen-devel@...ts.xenproject.org
Cc:	paul.durrant@...rix.com, david.vrabel@...rix.com,
	roger.pau@...rix.com, linux-kernel@...r.kernel.org,
	konrad.wilk@...cle.com, Bob Liu <bob.liu@...cle.com>
Subject: [PATCH v2 2/2] xen/block: add multi-page ring support
Extend xen/block to support multi-page ring, so that more requests can be issued
by using more than one pages as the request ring between blkfront and backend.
As a result, the performance can get improved significantly.
We got some impressive improvements on our highend iscsi storage cluster backend.
If using 64 pages as the ring, the IOPS increased about 15 times for the
throughput testing and above doubled for the latency testing.
The reason was the limit on outstanding requests is 32 if use only one-page
ring, but in our case the iscsi lun was spread across about 100 physical drives,
32 was really not enough to keep them busy.
Change in V2:
* Rebased to 4.0-rc6
* Add description on how this protocol works into io/blkif.h
Signed-off-by: Bob Liu <bob.liu@...cle.com>
---
 drivers/block/xen-blkback/blkback.c |   2 +-
 drivers/block/xen-blkback/common.h  |   3 +-
 drivers/block/xen-blkback/xenbus.c  |  83 ++++++++++++++++++++-------
 drivers/block/xen-blkfront.c        | 111 +++++++++++++++++++++++++-----------
 include/xen/interface/io/blkif.h    |  13 +++++
 5 files changed, 156 insertions(+), 56 deletions(-)
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 0c8da82..072d15b 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -628,7 +628,7 @@ purge_gnt_list:
 		}
 
 		/* Shrink if we have more than xen_blkif_max_buffer_pages */
-		shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
+		shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * blkif->nr_ring_pages);
 
 		if (log_stats && time_after(jiffies, blkif->st_print))
 			print_stats(blkif);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 375d288..320ca35 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -254,7 +254,7 @@ struct backend_info;
 #define PERSISTENT_GNT_WAS_ACTIVE	1
 
 /* Number of requests that we can fit in a ring */
-#define XEN_BLKIF_REQS			32
+#define XEN_BLKIF_REQS			(32 * XENBUS_MAX_RING_PAGES)
 
 struct persistent_gnt {
 	struct page *page;
@@ -326,6 +326,7 @@ struct xen_blkif {
 	struct work_struct	free_work;
 	/* Thread shutdown wait queue. */
 	wait_queue_head_t	shutdown_wq;
+	int nr_ring_pages;
 };
 
 struct seg_buf {
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index ff30259..b75be66 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -193,8 +193,8 @@ fail:
 	return ERR_PTR(-ENOMEM);
 }
 
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
-			 unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+			 unsigned int nr_grefs, unsigned int evtchn)
 {
 	int err;
 
@@ -202,7 +202,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
 	if (blkif->irq)
 		return 0;
 
-	err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
+	err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
 				     &blkif->blk_ring);
 	if (err < 0)
 		return err;
@@ -212,21 +212,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
 	{
 		struct blkif_sring *sring;
 		sring = (struct blkif_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_32:
 	{
 		struct blkif_x86_32_sring *sring_x86_32;
 		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_64:
 	{
 		struct blkif_x86_64_sring *sring_x86_64;
 		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
 		break;
 	}
 	default:
@@ -588,6 +588,10 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
 	if (err)
 		goto fail;
 
+	err = xenbus_printf(XBT_NIL, dev->nodename, "feature-multi-page-ring", "%u", 1);
+	if (err)
+		goto fail;
+
 	err = xenbus_switch_state(dev, XenbusStateInitWait);
 	if (err)
 		goto fail;
@@ -852,22 +856,61 @@ again:
 static int connect_ring(struct backend_info *be)
 {
 	struct xenbus_device *dev = be->dev;
-	unsigned long ring_ref;
-	unsigned int evtchn;
+	unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
+	unsigned int evtchn, nr_grefs;
 	unsigned int pers_grants;
 	char protocol[64] = "";
 	int err;
 
 	DPRINTK("%s", dev->otherend);
-
-	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
-			    &ring_ref, "event-channel", "%u", &evtchn, NULL);
-	if (err) {
-		xenbus_dev_fatal(dev, err,
-				 "reading %s/ring-ref and event-channel",
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+			  &evtchn);
+	if (err != 1) {
+		err = -EINVAL;
+		xenbus_dev_fatal(dev, err, "reading %s/event-channel",
 				 dev->otherend);
 		return err;
 	}
+	pr_info(DRV_PFX "event-channel %u\n", evtchn);
+
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-pages", "%u",
+			  &nr_grefs);
+	if (err != 1) {
+		err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
+				  "%u", &ring_ref[0]);
+		if (err != 1) {
+			err = -EINVAL;
+			xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+					 dev->otherend);
+			return err;
+		}
+		nr_grefs = 1;
+		pr_info(DRV_PFX "%s:using single page: ring-ref %d\n",
+				 dev->otherend, ring_ref[0]);
+	} else {
+		unsigned int i;
+
+		if (nr_grefs > XENBUS_MAX_RING_PAGES) {
+			err = -EINVAL;
+			xenbus_dev_fatal(dev, err, "%s/request %d ring pages exceed max",
+					 dev->otherend, nr_grefs);
+			return err;
+		}
+		for (i = 0; i < nr_grefs; i++) {
+			char ring_ref_name[15];
+
+			snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
+			err = xenbus_scanf(XBT_NIL, dev->otherend,
+					   ring_ref_name, "%d", &ring_ref[i]);
+			if (err != 1) {
+				err = -EINVAL;
+				xenbus_dev_fatal(dev, err, "reading %s/%s",
+						 dev->otherend, ring_ref_name);
+				return err;
+			}
+			pr_info(DRV_PFX "ring-ref%u: %d\n", i, ring_ref[i]);
+		}
+	}
 
 	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
 	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
@@ -892,16 +935,16 @@ static int connect_ring(struct backend_info *be)
 
 	be->blkif->vbd.feature_gnt_persistent = pers_grants;
 	be->blkif->vbd.overflow_max_grants = 0;
+	be->blkif->nr_ring_pages = nr_grefs;
 
-	pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
-		ring_ref, evtchn, be->blkif->blk_protocol, protocol,
-		pers_grants ? "persistent grants" : "");
+	pr_info(DRV_PFX "ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
+			nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
+			pers_grants ? "persistent grants" : "");
 
 	/* Map the shared frame, irq etc. */
-	err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+	err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
 	if (err) {
-		xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
-				 ring_ref, evtchn);
+		xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
 		return err;
 	}
 
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2c61cf8..528e4f6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -98,7 +98,12 @@ static unsigned int xen_blkif_max_segments = 32;
 module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
 MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
 
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+static unsigned int xen_blkif_max_ring_pages = 1;
+module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, 0644);
+MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
+
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
+#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -114,13 +119,14 @@ struct blkfront_info
 	int vdevice;
 	blkif_vdev_t handle;
 	enum blkif_state connected;
-	int ring_ref;
+	int ring_ref[XENBUS_MAX_RING_PAGES];
+	int nr_ring_pages;
 	struct blkif_front_ring ring;
 	unsigned int evtchn, irq;
 	struct request_queue *rq;
 	struct work_struct work;
 	struct gnttab_free_callback callback;
-	struct blk_shadow shadow[BLK_RING_SIZE];
+	struct blk_shadow shadow[BLK_MAX_RING_SIZE];
 	struct list_head grants;
 	struct list_head indirect_pages;
 	unsigned int persistent_gnts_c;
@@ -139,8 +145,6 @@ static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);
 
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
-	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
 #define GRANT_INVALID_REF	0
 
 #define PARTS_PER_DISK		16
@@ -1033,12 +1037,15 @@ free_shadow:
 	flush_work(&info->work);
 
 	/* Free resources associated with old device channel. */
-	if (info->ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(info->ring_ref, 0,
-					  (unsigned long)info->ring.sring);
-		info->ring_ref = GRANT_INVALID_REF;
-		info->ring.sring = NULL;
+	for (i = 0; i < info->nr_ring_pages; i++) {
+		if (info->ring_ref[i] != GRANT_INVALID_REF) {
+			gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+			info->ring_ref[i] = GRANT_INVALID_REF;
+		}
 	}
+	free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+	info->ring.sring = NULL;
+
 	if (info->irq)
 		unbind_from_irqhandler(info->irq, info);
 	info->evtchn = info->irq = 0;
@@ -1245,26 +1252,30 @@ static int setup_blkring(struct xenbus_device *dev,
 			 struct blkfront_info *info)
 {
 	struct blkif_sring *sring;
-	grant_ref_t gref;
-	int err;
+	int err, i;
+	unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
+	grant_ref_t gref[XENBUS_MAX_RING_PAGES];
 
-	info->ring_ref = GRANT_INVALID_REF;
+	for (i = 0; i < XENBUS_MAX_RING_PAGES; i++)
+		info->ring_ref[i] = GRANT_INVALID_REF;
 
-	sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+	sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+						       get_order(ring_size));
 	if (!sring) {
 		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
 		return -ENOMEM;
 	}
 	SHARED_RING_INIT(sring);
-	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+	FRONT_RING_INIT(&info->ring, sring, ring_size);
 
-	err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
+	err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
 	if (err < 0) {
 		free_page((unsigned long)sring);
 		info->ring.sring = NULL;
 		goto fail;
 	}
-	info->ring_ref = gref;
+	for (i = 0; i < info->nr_ring_pages; i++)
+		info->ring_ref[i] = gref[i];
 
 	err = xenbus_alloc_evtchn(dev, &info->evtchn);
 	if (err)
@@ -1292,7 +1303,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
 	const char *message = NULL;
 	struct xenbus_transaction xbt;
-	int err;
+	int err, i, multi_ring_pages = 0;
+
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "feature-multi-page-ring", "%u", &multi_ring_pages);
+	if (err != 1)
+		info->nr_ring_pages = 1;
+	else
+		info->nr_ring_pages = xen_blkif_max_ring_pages;
 
 	/* Create shared ring, alloc event channel. */
 	err = setup_blkring(dev, info);
@@ -1306,11 +1324,31 @@ again:
 		goto destroy_blkring;
 	}
 
-	err = xenbus_printf(xbt, dev->nodename,
-			    "ring-ref", "%u", info->ring_ref);
-	if (err) {
-		message = "writing ring-ref";
-		goto abort_transaction;
+	if (info->nr_ring_pages == 1) {
+		err = xenbus_printf(xbt, dev->nodename,
+				    "ring-ref", "%u", info->ring_ref[0]);
+		if (err) {
+			message = "writing ring-ref";
+			goto abort_transaction;
+		}
+	} else {
+		err = xenbus_printf(xbt, dev->nodename,
+				    "max-ring-pages", "%u", info->nr_ring_pages);
+		if (err) {
+			message = "writing max-ring-pages";
+			goto abort_transaction;
+		}
+
+		for (i = 0; i < info->nr_ring_pages; i++) {
+			char ring_ref_name[15];
+			sprintf(ring_ref_name, "ring-ref%d", i);
+			err = xenbus_printf(xbt, dev->nodename,
+					ring_ref_name, "%d", info->ring_ref[i]);
+			if (err) {
+				message = "writing ring-ref";
+				goto abort_transaction;
+			}
+		}
 	}
 	err = xenbus_printf(xbt, dev->nodename,
 			    "event-channel", "%u", info->evtchn);
@@ -1338,6 +1376,7 @@ again:
 		goto destroy_blkring;
 	}
 
+	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 	xenbus_switch_state(dev, XenbusStateInitialised);
 
 	return 0;
@@ -1422,21 +1461,13 @@ static int blkfront_probe(struct xenbus_device *dev,
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
 
-	for (i = 0; i < BLK_RING_SIZE; i++)
+	for (i = 0; i < BLK_MAX_RING_SIZE; i++)
 		info->shadow[i].req.u.rw.id = i+1;
-	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
 	/* Front end dir is a number, which is used as the id. */
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
 	dev_set_drvdata(&dev->dev, info);
 
-	err = talk_to_blkback(dev, info);
-	if (err) {
-		kfree(info);
-		dev_set_drvdata(&dev->dev, NULL);
-		return err;
-	}
-
 	return 0;
 }
 
@@ -1476,7 +1507,7 @@ static int blkif_recover(struct blkfront_info *info)
 
 	/* Stage 2: Set up free list. */
 	memset(&info->shadow, 0, sizeof(info->shadow));
-	for (i = 0; i < BLK_RING_SIZE; i++)
+	for (i = 0; i < BLK_MAX_RING_SIZE; i++)
 		info->shadow[i].req.u.rw.id = i+1;
 	info->shadow_free = info->ring.req_prod_pvt;
 	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
@@ -1906,8 +1937,17 @@ static void blkback_changed(struct xenbus_device *dev,
 	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
 
 	switch (backend_state) {
-	case XenbusStateInitialising:
 	case XenbusStateInitWait:
+		/*
+		 * Talk to blkback after backend enter InitWait so we can know
+		 * whether backend spports feature-multi-page-ring.
+		 */
+		if (talk_to_blkback(dev, info)) {
+			kfree(info);
+			dev_set_drvdata(&dev->dev, NULL);
+		}
+		break;
+	case XenbusStateInitialising:
 	case XenbusStateInitialised:
 	case XenbusStateReconfiguring:
 	case XenbusStateReconfigured:
@@ -2088,6 +2128,9 @@ static int __init xlblk_init(void)
 {
 	int ret;
 
+	if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES)
+		return -EINVAL;
+
 	if (!xen_domain())
 		return -ENODEV;
 
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index c33e1c4..9c2c9a7 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -127,6 +127,19 @@ typedef uint64_t blkif_sector_t;
 #define BLKIF_OP_INDIRECT          6
 
 /*
+ * "feature-multi-page-ring" is introduced to use more than one page as the
+ * request ring between blkfront and backend. As a result, more reqs can be
+ * issued from blkfront and the performance get improved accordingly.
+ *
+ * If supported, the backend will write the key 'feature-multi-page-ring'
+ * for that vbd.
+ * Frontends that are aware of this feature and wish to use it will write key
+ * "max-ring-pages" to set the number of pages that they wish to be used as the
+ * ring. The 'xen_blkif_max_ring_pages' parameter is introduced with default
+ * number still one page, the maximum is XENBUS_MAX_RING_PAGES.
+ */
+
+/*
  * Maximum scatter/gather segments per request.
  * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
  * NB. This could be 12 if the ring indexes weren't stored in the same page.
-- 
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Powered by blists - more mailing lists
 
