[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20150401145557.GC28938@l.oracle.com>
Date: Wed, 1 Apr 2015 10:55:57 -0400
From: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
To: Bob Liu <bob.liu@...cle.com>
Cc: xen-devel@...ts.xenproject.org, paul.durrant@...rix.com,
david.vrabel@...rix.com, roger.pau@...rix.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 2/2] xen/block: add multi-page ring support
On Wed, Apr 01, 2015 at 07:51:47PM +0800, Bob Liu wrote:
>
> On 03/31/2015 08:15 PM, Bob Liu wrote:
> > Extend xen/block to support a multi-page ring, so that more requests can be issued
> > by using more than one page as the request ring between blkfront and backend.
> > As a result, performance can be improved significantly.
> >
> > We got some impressive improvements on our high-end iSCSI storage cluster backend.
> > When using 64 pages as the ring, the IOPS increased about 15 times for the
> > throughput testing and more than doubled for the latency testing.
> >
> > The reason is that the limit on outstanding requests is 32 when using only a
> > one-page ring, but in our case the iSCSI LUN was spread across about 100 physical
> > drives, so 32 was really not enough to keep them busy.
> >
> > Change in V2:
> > * Rebased to 4.0-rc6
> > * Add description on how this protocol works into io/blkif.h
> >
>
> I found there were already some related descriptions in the file
> xen/include/public/io/blkif.h. I'll send a new patch following the
> protocol in that file, and leave the file include/xen/interface/io/blkif.h
> unchanged.
>
> 190 * max-ring-pages
> 191 * Values: <uint32_t>
> 192 * Default Value: 1
> 193 * Notes: DEPRECATED, 2, 3
Well, perhaps remove the 'DEPRECATED' part?
> 194 *
> 195 * The maximum supported size of the request ring buffer in units of
> 196 * machine pages. The value must be a power of 2.
>
> 302 * num-ring-pages
> 303 * Values: <uint32_t>
> 304 * Default Value: 1
> 305 * Maximum Value: MAX(max-ring-pages,(0x1 << max-ring-page-order))
> 306 * Notes: DEPRECATED, 2, 3
> 307 *
> 308 * The size of the frontend allocated request ring buffer in units of
> 309 * machine pages. The value must be a power of 2.
> 310 *
>
> Regards,
> -Bob
>
> > Signed-off-by: Bob Liu <bob.liu@...cle.com>
> > ---
> > drivers/block/xen-blkback/blkback.c | 2 +-
> > drivers/block/xen-blkback/common.h | 3 +-
> > drivers/block/xen-blkback/xenbus.c | 83 ++++++++++++++++++++-------
> > drivers/block/xen-blkfront.c | 111 +++++++++++++++++++++++++-----------
> > include/xen/interface/io/blkif.h | 13 +++++
> > 5 files changed, 156 insertions(+), 56 deletions(-)
> >
> > diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
> > index 0c8da82..072d15b 100644
> > --- a/drivers/block/xen-blkback/blkback.c
> > +++ b/drivers/block/xen-blkback/blkback.c
> > @@ -628,7 +628,7 @@ purge_gnt_list:
> > }
> >
> > /* Shrink if we have more than xen_blkif_max_buffer_pages */
> > - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
> > + shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * blkif->nr_ring_pages);
> >
> > if (log_stats && time_after(jiffies, blkif->st_print))
> > print_stats(blkif);
> > diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
> > index 375d288..320ca35 100644
> > --- a/drivers/block/xen-blkback/common.h
> > +++ b/drivers/block/xen-blkback/common.h
> > @@ -254,7 +254,7 @@ struct backend_info;
> > #define PERSISTENT_GNT_WAS_ACTIVE 1
> >
> > /* Number of requests that we can fit in a ring */
> > -#define XEN_BLKIF_REQS 32
> > +#define XEN_BLKIF_REQS (32 * XENBUS_MAX_RING_PAGES)
> >
> > struct persistent_gnt {
> > struct page *page;
> > @@ -326,6 +326,7 @@ struct xen_blkif {
> > struct work_struct free_work;
> > /* Thread shutdown wait queue. */
> > wait_queue_head_t shutdown_wq;
> > + int nr_ring_pages;
> > };
> >
> > struct seg_buf {
> > diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> > index ff30259..b75be66 100644
> > --- a/drivers/block/xen-blkback/xenbus.c
> > +++ b/drivers/block/xen-blkback/xenbus.c
> > @@ -193,8 +193,8 @@ fail:
> > return ERR_PTR(-ENOMEM);
> > }
> >
> > -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> > - unsigned int evtchn)
> > +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
> > + unsigned int nr_grefs, unsigned int evtchn)
> > {
> > int err;
> >
> > @@ -202,7 +202,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> > if (blkif->irq)
> > return 0;
> >
> > - err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
> > + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
> > &blkif->blk_ring);
> > if (err < 0)
> > return err;
> > @@ -212,21 +212,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> > {
> > struct blkif_sring *sring;
> > sring = (struct blkif_sring *)blkif->blk_ring;
> > - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
> > + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
> > break;
> > }
> > case BLKIF_PROTOCOL_X86_32:
> > {
> > struct blkif_x86_32_sring *sring_x86_32;
> > sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
> > - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
> > + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
> > break;
> > }
> > case BLKIF_PROTOCOL_X86_64:
> > {
> > struct blkif_x86_64_sring *sring_x86_64;
> > sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
> > - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
> > + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
> > break;
> > }
> > default:
> > @@ -588,6 +588,10 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
> > if (err)
> > goto fail;
> >
> > + err = xenbus_printf(XBT_NIL, dev->nodename, "feature-multi-page-ring", "%u", 1);
> > + if (err)
> > + goto fail;
> > +
> > err = xenbus_switch_state(dev, XenbusStateInitWait);
> > if (err)
> > goto fail;
> > @@ -852,22 +856,61 @@ again:
> > static int connect_ring(struct backend_info *be)
> > {
> > struct xenbus_device *dev = be->dev;
> > - unsigned long ring_ref;
> > - unsigned int evtchn;
> > + unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
> > + unsigned int evtchn, nr_grefs;
> > unsigned int pers_grants;
> > char protocol[64] = "";
> > int err;
> >
> > DPRINTK("%s", dev->otherend);
> > -
> > - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
> > - &ring_ref, "event-channel", "%u", &evtchn, NULL);
> > - if (err) {
> > - xenbus_dev_fatal(dev, err,
> > - "reading %s/ring-ref and event-channel",
> > + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
> > + &evtchn);
> > + if (err != 1) {
> > + err = -EINVAL;
> > + xenbus_dev_fatal(dev, err, "reading %s/event-channel",
> > dev->otherend);
> > return err;
> > }
> > + pr_info(DRV_PFX "event-channel %u\n", evtchn);
> > +
> > + err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-pages", "%u",
> > + &nr_grefs);
> > + if (err != 1) {
> > + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
> > + "%u", &ring_ref[0]);
> > + if (err != 1) {
> > + err = -EINVAL;
> > + xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
> > + dev->otherend);
> > + return err;
> > + }
> > + nr_grefs = 1;
> > + pr_info(DRV_PFX "%s:using single page: ring-ref %d\n",
> > + dev->otherend, ring_ref[0]);
> > + } else {
> > + unsigned int i;
> > +
> > + if (nr_grefs > XENBUS_MAX_RING_PAGES) {
> > + err = -EINVAL;
> > + xenbus_dev_fatal(dev, err, "%s/request %d ring pages exceed max",
> > + dev->otherend, nr_grefs);
> > + return err;
> > + }
> > + for (i = 0; i < nr_grefs; i++) {
> > + char ring_ref_name[15];
> > +
> > + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
> > + err = xenbus_scanf(XBT_NIL, dev->otherend,
> > + ring_ref_name, "%d", &ring_ref[i]);
> > + if (err != 1) {
> > + err = -EINVAL;
> > + xenbus_dev_fatal(dev, err, "reading %s/%s",
> > + dev->otherend, ring_ref_name);
> > + return err;
> > + }
> > + pr_info(DRV_PFX "ring-ref%u: %d\n", i, ring_ref[i]);
> > + }
> > + }
> >
> > be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
> > err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
> > @@ -892,16 +935,16 @@ static int connect_ring(struct backend_info *be)
> >
> > be->blkif->vbd.feature_gnt_persistent = pers_grants;
> > be->blkif->vbd.overflow_max_grants = 0;
> > + be->blkif->nr_ring_pages = nr_grefs;
> >
> > - pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
> > - ring_ref, evtchn, be->blkif->blk_protocol, protocol,
> > - pers_grants ? "persistent grants" : "");
> > + pr_info(DRV_PFX "ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
> > + nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
> > + pers_grants ? "persistent grants" : "");
> >
> > /* Map the shared frame, irq etc. */
> > - err = xen_blkif_map(be->blkif, ring_ref, evtchn);
> > + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
> > if (err) {
> > - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
> > - ring_ref, evtchn);
> > + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
> > return err;
> > }
> >
> > diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> > index 2c61cf8..528e4f6 100644
> > --- a/drivers/block/xen-blkfront.c
> > +++ b/drivers/block/xen-blkfront.c
> > @@ -98,7 +98,12 @@ static unsigned int xen_blkif_max_segments = 32;
> > module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
> > MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
> >
> > -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
> > +static unsigned int xen_blkif_max_ring_pages = 1;
> > +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, 0644);
> > +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring");
> > +
> > +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
> > +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
> >
> > /*
> > * We have one of these per vbd, whether ide, scsi or 'other'. They
> > @@ -114,13 +119,14 @@ struct blkfront_info
> > int vdevice;
> > blkif_vdev_t handle;
> > enum blkif_state connected;
> > - int ring_ref;
> > + int ring_ref[XENBUS_MAX_RING_PAGES];
> > + int nr_ring_pages;
> > struct blkif_front_ring ring;
> > unsigned int evtchn, irq;
> > struct request_queue *rq;
> > struct work_struct work;
> > struct gnttab_free_callback callback;
> > - struct blk_shadow shadow[BLK_RING_SIZE];
> > + struct blk_shadow shadow[BLK_MAX_RING_SIZE];
> > struct list_head grants;
> > struct list_head indirect_pages;
> > unsigned int persistent_gnts_c;
> > @@ -139,8 +145,6 @@ static unsigned int nr_minors;
> > static unsigned long *minors;
> > static DEFINE_SPINLOCK(minor_lock);
> >
> > -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> > - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
> > #define GRANT_INVALID_REF 0
> >
> > #define PARTS_PER_DISK 16
> > @@ -1033,12 +1037,15 @@ free_shadow:
> > flush_work(&info->work);
> >
> > /* Free resources associated with old device channel. */
> > - if (info->ring_ref != GRANT_INVALID_REF) {
> > - gnttab_end_foreign_access(info->ring_ref, 0,
> > - (unsigned long)info->ring.sring);
> > - info->ring_ref = GRANT_INVALID_REF;
> > - info->ring.sring = NULL;
> > + for (i = 0; i < info->nr_ring_pages; i++) {
> > + if (info->ring_ref[i] != GRANT_INVALID_REF) {
> > + gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
> > + info->ring_ref[i] = GRANT_INVALID_REF;
> > + }
> > }
> > + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
> > + info->ring.sring = NULL;
> > +
> > if (info->irq)
> > unbind_from_irqhandler(info->irq, info);
> > info->evtchn = info->irq = 0;
> > @@ -1245,26 +1252,30 @@ static int setup_blkring(struct xenbus_device *dev,
> > struct blkfront_info *info)
> > {
> > struct blkif_sring *sring;
> > - grant_ref_t gref;
> > - int err;
> > + int err, i;
> > + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
> > + grant_ref_t gref[XENBUS_MAX_RING_PAGES];
> >
> > - info->ring_ref = GRANT_INVALID_REF;
> > + for (i = 0; i < XENBUS_MAX_RING_PAGES; i++)
> > + info->ring_ref[i] = GRANT_INVALID_REF;
> >
> > - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
> > + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
> > + get_order(ring_size));
> > if (!sring) {
> > xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
> > return -ENOMEM;
> > }
> > SHARED_RING_INIT(sring);
> > - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
> > + FRONT_RING_INIT(&info->ring, sring, ring_size);
> >
> > - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
> > + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
> > if (err < 0) {
> > free_page((unsigned long)sring);
> > info->ring.sring = NULL;
> > goto fail;
> > }
> > - info->ring_ref = gref;
> > + for (i = 0; i < info->nr_ring_pages; i++)
> > + info->ring_ref[i] = gref[i];
> >
> > err = xenbus_alloc_evtchn(dev, &info->evtchn);
> > if (err)
> > @@ -1292,7 +1303,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
> > {
> > const char *message = NULL;
> > struct xenbus_transaction xbt;
> > - int err;
> > + int err, i, multi_ring_pages = 0;
> > +
> > + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> > + "feature-multi-page-ring", "%u", &multi_ring_pages);
> > + if (err != 1)
> > + info->nr_ring_pages = 1;
> > + else
> > + info->nr_ring_pages = xen_blkif_max_ring_pages;
> >
> > /* Create shared ring, alloc event channel. */
> > err = setup_blkring(dev, info);
> > @@ -1306,11 +1324,31 @@ again:
> > goto destroy_blkring;
> > }
> >
> > - err = xenbus_printf(xbt, dev->nodename,
> > - "ring-ref", "%u", info->ring_ref);
> > - if (err) {
> > - message = "writing ring-ref";
> > - goto abort_transaction;
> > + if (info->nr_ring_pages == 1) {
> > + err = xenbus_printf(xbt, dev->nodename,
> > + "ring-ref", "%u", info->ring_ref[0]);
> > + if (err) {
> > + message = "writing ring-ref";
> > + goto abort_transaction;
> > + }
> > + } else {
> > + err = xenbus_printf(xbt, dev->nodename,
> > + "max-ring-pages", "%u", info->nr_ring_pages);
> > + if (err) {
> > + message = "writing max-ring-pages";
> > + goto abort_transaction;
> > + }
> > +
> > + for (i = 0; i < info->nr_ring_pages; i++) {
> > + char ring_ref_name[15];
> > + sprintf(ring_ref_name, "ring-ref%d", i);
> > + err = xenbus_printf(xbt, dev->nodename,
> > + ring_ref_name, "%d", info->ring_ref[i]);
> > + if (err) {
> > + message = "writing ring-ref";
> > + goto abort_transaction;
> > + }
> > + }
> > }
> > err = xenbus_printf(xbt, dev->nodename,
> > "event-channel", "%u", info->evtchn);
> > @@ -1338,6 +1376,7 @@ again:
> > goto destroy_blkring;
> > }
> >
> > + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> > xenbus_switch_state(dev, XenbusStateInitialised);
> >
> > return 0;
> > @@ -1422,21 +1461,13 @@ static int blkfront_probe(struct xenbus_device *dev,
> > info->connected = BLKIF_STATE_DISCONNECTED;
> > INIT_WORK(&info->work, blkif_restart_queue);
> >
> > - for (i = 0; i < BLK_RING_SIZE; i++)
> > + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
> > info->shadow[i].req.u.rw.id = i+1;
> > - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> >
> > /* Front end dir is a number, which is used as the id. */
> > info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
> > dev_set_drvdata(&dev->dev, info);
> >
> > - err = talk_to_blkback(dev, info);
> > - if (err) {
> > - kfree(info);
> > - dev_set_drvdata(&dev->dev, NULL);
> > - return err;
> > - }
> > -
> > return 0;
> > }
> >
> > @@ -1476,7 +1507,7 @@ static int blkif_recover(struct blkfront_info *info)
> >
> > /* Stage 2: Set up free list. */
> > memset(&info->shadow, 0, sizeof(info->shadow));
> > - for (i = 0; i < BLK_RING_SIZE; i++)
> > + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
> > info->shadow[i].req.u.rw.id = i+1;
> > info->shadow_free = info->ring.req_prod_pvt;
> > info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> > @@ -1906,8 +1937,17 @@ static void blkback_changed(struct xenbus_device *dev,
> > dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
> >
> > switch (backend_state) {
> > - case XenbusStateInitialising:
> > case XenbusStateInitWait:
> > + /*
> > + * Talk to blkback after the backend enters InitWait so we can know
> > + * whether the backend supports feature-multi-page-ring.
> > + */
> > + if (talk_to_blkback(dev, info)) {
> > + kfree(info);
> > + dev_set_drvdata(&dev->dev, NULL);
> > + }
> > + break;
> > + case XenbusStateInitialising:
> > case XenbusStateInitialised:
> > case XenbusStateReconfiguring:
> > case XenbusStateReconfigured:
> > @@ -2088,6 +2128,9 @@ static int __init xlblk_init(void)
> > {
> > int ret;
> >
> > + if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES)
> > + return -EINVAL;
> > +
> > if (!xen_domain())
> > return -ENODEV;
> >
> > diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
> > index c33e1c4..9c2c9a7 100644
> > --- a/include/xen/interface/io/blkif.h
> > +++ b/include/xen/interface/io/blkif.h
> > @@ -127,6 +127,19 @@ typedef uint64_t blkif_sector_t;
> > #define BLKIF_OP_INDIRECT 6
> >
> > /*
> > + * "feature-multi-page-ring" is introduced to use more than one page as the
> > + * request ring between blkfront and backend. As a result, more requests can be
> > + * issued from blkfront and performance improves accordingly.
> > + *
> > + * If supported, the backend will write the key 'feature-multi-page-ring'
> > + * for that vbd.
> > + * Frontends that are aware of this feature and wish to use it will write key
> > + * "max-ring-pages" to set the number of pages that they wish to be used as the
> > + * ring. The 'xen_blkif_max_ring_pages' parameter is introduced with default
> > + * number still one page, the maximum is XENBUS_MAX_RING_PAGES.
> > + */
> > +
> > +/*
> > * Maximum scatter/gather segments per request.
> > * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
> > * NB. This could be 12 if the ring indexes weren't stored in the same page.
> >
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists