[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20130304204154.GL15386@phenom.dumpdata.com>
Date: Mon, 4 Mar 2013 15:41:54 -0500
From: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
To: Roger Pau Monne <roger.pau@...rix.com>
Cc: linux-kernel@...r.kernel.org, xen-devel@...ts.xen.org
Subject: Re: [PATCH RFC 12/12] xen-block: implement indirect descriptors
On Thu, Feb 28, 2013 at 11:28:55AM +0100, Roger Pau Monne wrote:
> Indirect descriptors introduce a new block operation
> (BLKIF_OP_INDIRECT) that passes grant references instead of segments
> in the request. These grant references are filled with arrays of
> blkif_request_segment_aligned, this way we can send more segments in a
> request.
>
> The proposed implementation sets the maximum number of indirect grefs
> (frames filled with blkif_request_segment_aligned) to 256 in the
> backend and 64 in the frontend. The value in the frontend has been
> chosen experimentally, and the backend value has been set to a sane
> value that allows expanding the maximum number of indirect descriptors
> in the frontend if needed.
So we are still using a similar format, of the form:
<gref, first_sect, last_sect, pad>, etc.
Why not utilize a layout that fits with the bio sg? That way
we might not even have to do the bio_alloc call and could instead
set up a bio (and bio-list) with the appropriate offsets/list.
Meaning that the format of the indirect descriptors is:
<gref, offset, next_index, pad>
We already know what the first_sect and last_sect are - they
are basically: sector_number + nr_segments * (whatever the sector size is) + offset
>
> The migration code has changed from the previous implementation, in
> which we simply remapped the segments on the shared ring. Now the
> maximum number of segments allowed in a request can change depending
> on the backend, so we have to requeue all the requests in the ring and
> in the queue and split the bios in them if they are bigger than the
> new maximum number of segments.
>
> Signed-off-by: Roger Pau Monné <roger.pau@...rix.com>
> Cc: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
> Cc: xen-devel@...ts.xen.org
> ---
> drivers/block/xen-blkback/blkback.c | 129 +++++++---
> drivers/block/xen-blkback/common.h | 80 ++++++-
> drivers/block/xen-blkback/xenbus.c | 8 +
> drivers/block/xen-blkfront.c | 498 +++++++++++++++++++++++++++++------
> include/xen/interface/io/blkif.h | 25 ++
> 5 files changed, 622 insertions(+), 118 deletions(-)
>
> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
> index 0fa30db..98eb16b 100644
> --- a/drivers/block/xen-blkback/blkback.c
> +++ b/drivers/block/xen-blkback/blkback.c
> @@ -70,7 +70,7 @@ MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate per backend");
> * algorithm.
> */
>
> -static int xen_blkif_max_pgrants = 352;
> +static int xen_blkif_max_pgrants = 1024;
> module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
> MODULE_PARM_DESC(max_persistent_grants,
> "Maximum number of grants to map persistently");
> @@ -578,10 +578,6 @@ purge_gnt_list:
> return 0;
> }
>
> -struct seg_buf {
> - unsigned long buf;
> - unsigned int nsec;
> -};
> /*
> * Unmap the grant references, and also remove the M2P over-rides
> * used in the 'pending_req'.
> @@ -761,32 +757,79 @@ out_of_memory:
> return -ENOMEM;
> }
>
> -static int xen_blkbk_map_seg(struct blkif_request *req,
> - struct pending_req *pending_req,
> +static int xen_blkbk_map_seg(struct pending_req *pending_req,
> struct seg_buf seg[],
> struct page *pages[])
> {
> int i, rc;
> - grant_ref_t grefs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>
> - for (i = 0; i < req->u.rw.nr_segments; i++)
> - grefs[i] = req->u.rw.seg[i].gref;
> -
> - rc = xen_blkbk_map(pending_req->blkif, grefs,
> + rc = xen_blkbk_map(pending_req->blkif, pending_req->grefs,
> pending_req->persistent_gnts,
> pending_req->grant_handles, pending_req->pages,
> - req->u.rw.nr_segments,
> + pending_req->nr_pages,
> (pending_req->operation != BLKIF_OP_READ));
> if (rc)
> return rc;
>
> - for (i = 0; i < req->u.rw.nr_segments; i++)
> - seg[i].buf = pfn_to_mfn(page_to_pfn(pending_req->pages[i]))
> - << PAGE_SHIFT | (req->u.rw.seg[i].first_sect << 9);
> + for (i = 0; i < pending_req->nr_pages; i++)
> + seg[i].buf |= pfn_to_mfn(page_to_pfn(pending_req->pages[i]))
> + << PAGE_SHIFT;
>
> return 0;
> }
>
> +static int xen_blkbk_parse_indirect(struct blkif_request *req,
> + struct pending_req *pending_req,
> + struct seg_buf seg[],
> + struct phys_req *preq)
> +{
> + struct persistent_gnt **persistent =
> + pending_req->indirect_persistent_gnts;
> + struct page **pages = pending_req->indirect_pages;
> + struct xen_blkif *blkif = pending_req->blkif;
> + int indirect_grefs, rc, n, nseg, i;
> + struct blkif_request_segment_aligned *segments = NULL;
> +
> + nseg = pending_req->nr_pages;
> + indirect_grefs = (nseg + SEGS_PER_INDIRECT_FRAME - 1) /
> + SEGS_PER_INDIRECT_FRAME;
> +
> + rc = xen_blkbk_map(blkif, req->u.indirect.indirect_grefs,
> + persistent, pending_req->indirect_handles,
> + pages, indirect_grefs, true);
> + if (rc)
> + goto unmap;
> +
> + for (n = 0, i = 0; n < nseg; n++) {
> + if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
> + /* Map indirect segments */
> + if (segments)
> + kunmap_atomic(segments);
> + segments =
> + kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]);
> + }
> + i = n % SEGS_PER_INDIRECT_FRAME;
> + pending_req->grefs[n] = segments[i].gref;
> + seg[n].nsec = segments[i].last_sect -
> + segments[i].first_sect + 1;
> + seg[n].buf = segments[i].first_sect << 9;
> + if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
> + (segments[i].last_sect <
> + segments[i].first_sect)) {
> + rc = -EINVAL;
> + goto unmap;
> + }
> + preq->nr_sects += seg[n].nsec;
> + }
> +
> +unmap:
> + if (segments)
> + kunmap_atomic(segments);
> + xen_blkbk_unmap(blkif, pending_req->indirect_handles,
> + pages, persistent, indirect_grefs);
> + return rc;
> +}
> +
> static int dispatch_discard_io(struct xen_blkif *blkif,
> struct blkif_request *req)
> {
> @@ -980,17 +1023,21 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
> struct pending_req *pending_req)
> {
> struct phys_req preq;
> - struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct seg_buf *seg = pending_req->seg;
> unsigned int nseg;
> struct bio *bio = NULL;
> - struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct bio **biolist = pending_req->biolist;
> int i, nbio = 0;
> int operation;
> struct blk_plug plug;
> bool drain = false;
> struct page **pages = pending_req->pages;
> + unsigned short req_operation;
> +
> + req_operation = req->operation == BLKIF_OP_INDIRECT ?
> + req->u.indirect.indirect_op : req->operation;
>
> - switch (req->operation) {
> + switch (req_operation) {
> case BLKIF_OP_READ:
> blkif->st_rd_req++;
> operation = READ;
> @@ -1012,33 +1059,49 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
> }
>
> /* Check that the number of segments is sane. */
> - nseg = req->u.rw.nr_segments;
> + nseg = req->operation == BLKIF_OP_INDIRECT ?
> + req->u.indirect.nr_segments : req->u.rw.nr_segments;
>
> if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
> - unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
> + unlikely((req->operation != BLKIF_OP_INDIRECT) &&
> + (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
> + unlikely((req->operation == BLKIF_OP_INDIRECT) &&
> + (nseg > MAX_INDIRECT_SEGMENTS))) {
> pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
> nseg);
> /* Haven't submitted any bio's yet. */
> goto fail_response;
> }
>
> - preq.sector_number = req->u.rw.sector_number;
> preq.nr_sects = 0;
>
> pending_req->blkif = blkif;
> - pending_req->id = req->u.rw.id;
> - pending_req->operation = req->operation;
> pending_req->status = BLKIF_RSP_OKAY;
> pending_req->nr_pages = nseg;
>
> - for (i = 0; i < nseg; i++) {
> - seg[i].nsec = req->u.rw.seg[i].last_sect -
> - req->u.rw.seg[i].first_sect + 1;
> - if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
> - (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
> + if (req->operation != BLKIF_OP_INDIRECT) {
> + preq.dev = req->u.rw.handle;
> + preq.sector_number = req->u.rw.sector_number;
> + pending_req->id = req->u.rw.id;
> + pending_req->operation = req->operation;
> + for (i = 0; i < nseg; i++) {
> + pending_req->grefs[i] = req->u.rw.seg[i].gref;
> + seg[i].nsec = req->u.rw.seg[i].last_sect -
> + req->u.rw.seg[i].first_sect + 1;
> + seg[i].buf = req->u.rw.seg[i].first_sect << 9;
> + if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
> + (req->u.rw.seg[i].last_sect <
> + req->u.rw.seg[i].first_sect))
> + goto fail_response;
> + preq.nr_sects += seg[i].nsec;
> + }
> + } else {
> + preq.dev = req->u.indirect.handle;
> + preq.sector_number = req->u.indirect.sector_number;
> + pending_req->id = req->u.indirect.id;
> + pending_req->operation = req->u.indirect.indirect_op;
> + if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
> goto fail_response;
> - preq.nr_sects += seg[i].nsec;
> -
> }
>
> if (xen_vbd_translate(&preq, blkif, operation) != 0) {
> @@ -1074,7 +1137,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
> * the hypercall to unmap the grants - that is all done in
> * xen_blkbk_unmap.
> */
> - if (xen_blkbk_map_seg(req, pending_req, seg, pages))
> + if (xen_blkbk_map_seg(pending_req, seg, pages))
> goto fail_flush;
>
> /*
> @@ -1146,7 +1209,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
> pending_req->nr_pages);
> fail_response:
> /* Haven't submitted any bio's yet. */
> - make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
> + make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
> free_req(blkif, pending_req);
> msleep(1); /* back off a bit */
> return -EIO;
> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
> index 0b0ad3f..d3656d2 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -50,6 +50,17 @@
> __func__, __LINE__, ##args)
>
>
> +/*
> + * This is the maximum number of segments that would be allowed in indirect
> + * requests. This value will also be passed to the frontend.
> + */
> +#define MAX_INDIRECT_SEGMENTS 256
> +
> +#define SEGS_PER_INDIRECT_FRAME \
> +(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
> +#define MAX_INDIRECT_GREFS \
> +((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
> +
> /* Not a real protocol. Used to generate ring structs which contain
> * the elements common to all protocols only. This way we get a
> * compiler-checkable way to use common struct elements, so we can
> @@ -77,11 +88,21 @@ struct blkif_x86_32_request_discard {
> uint64_t nr_sectors;
> } __attribute__((__packed__));
>
> +struct blkif_x86_32_request_indirect {
> + uint8_t indirect_op;
> + uint16_t nr_segments;
> + uint64_t id;
> + blkif_vdev_t handle;
> + blkif_sector_t sector_number;
> + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST];
> +} __attribute__((__packed__));
> +
> struct blkif_x86_32_request {
> uint8_t operation; /* BLKIF_OP_??? */
> union {
> struct blkif_x86_32_request_rw rw;
> struct blkif_x86_32_request_discard discard;
> + struct blkif_x86_32_request_indirect indirect;
> } u;
> } __attribute__((__packed__));
>
> @@ -113,11 +134,22 @@ struct blkif_x86_64_request_discard {
> uint64_t nr_sectors;
> } __attribute__((__packed__));
>
> +struct blkif_x86_64_request_indirect {
> + uint8_t indirect_op;
> + uint16_t nr_segments;
> + uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
> + uint64_t id;
> + blkif_vdev_t handle;
> + blkif_sector_t sector_number;
> + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST];
> +} __attribute__((__packed__));
> +
> struct blkif_x86_64_request {
> uint8_t operation; /* BLKIF_OP_??? */
> union {
> struct blkif_x86_64_request_rw rw;
> struct blkif_x86_64_request_discard discard;
> + struct blkif_x86_64_request_indirect indirect;
> } u;
> } __attribute__((__packed__));
>
> @@ -235,6 +267,11 @@ struct xen_blkif {
> wait_queue_head_t waiting_to_free;
> };
>
> +struct seg_buf {
> + unsigned long buf;
> + unsigned int nsec;
> +};
> +
> /*
> * Each outstanding request that we've passed to the lower device layers has a
> * 'pending_req' allocated to it. Each buffer_head that completes decrements
> @@ -249,9 +286,16 @@ struct pending_req {
> unsigned short operation;
> int status;
> struct list_head free_list;
> - struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> - struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> - grant_handle_t grant_handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct persistent_gnt *persistent_gnts[MAX_INDIRECT_SEGMENTS];
> + struct page *pages[MAX_INDIRECT_SEGMENTS];
> + grant_handle_t grant_handles[MAX_INDIRECT_SEGMENTS];
> + grant_ref_t grefs[MAX_INDIRECT_SEGMENTS];
> + /* Indirect descriptors */
> + struct persistent_gnt *indirect_persistent_gnts[MAX_INDIRECT_GREFS];
> + struct page *indirect_pages[MAX_INDIRECT_GREFS];
> + grant_handle_t indirect_handles[MAX_INDIRECT_GREFS];
> + struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
> + struct bio *biolist[MAX_INDIRECT_SEGMENTS];
> };
>
>
> @@ -289,7 +333,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
> static inline void blkif_get_x86_32_req(struct blkif_request *dst,
> struct blkif_x86_32_request *src)
> {
> - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
> + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j = MAX_INDIRECT_GREFS;
> dst->operation = src->operation;
> switch (src->operation) {
> case BLKIF_OP_READ:
> @@ -312,6 +356,19 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
> dst->u.discard.sector_number = src->u.discard.sector_number;
> dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
> break;
> + case BLKIF_OP_INDIRECT:
> + dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
> + dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
> + dst->u.indirect.handle = src->u.indirect.handle;
> + dst->u.indirect.id = src->u.indirect.id;
> + dst->u.indirect.sector_number = src->u.indirect.sector_number;
> + barrier();
> + if (j > dst->u.indirect.nr_segments)
> + j = dst->u.indirect.nr_segments;
> + for (i = 0; i < j; i++)
> + dst->u.indirect.indirect_grefs[i] =
> + src->u.indirect.indirect_grefs[i];
> + break;
> default:
> break;
> }
> @@ -320,7 +377,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
> static inline void blkif_get_x86_64_req(struct blkif_request *dst,
> struct blkif_x86_64_request *src)
> {
> - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
> + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j = MAX_INDIRECT_GREFS;
> dst->operation = src->operation;
> switch (src->operation) {
> case BLKIF_OP_READ:
> @@ -343,6 +400,19 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
> dst->u.discard.sector_number = src->u.discard.sector_number;
> dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
> break;
> + case BLKIF_OP_INDIRECT:
> + dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
> + dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
> + dst->u.indirect.handle = src->u.indirect.handle;
> + dst->u.indirect.id = src->u.indirect.id;
> + dst->u.indirect.sector_number = src->u.indirect.sector_number;
> + barrier();
> + if (j > dst->u.indirect.nr_segments)
> + j = dst->u.indirect.nr_segments;
> + for (i = 0; i < j; i++)
> + dst->u.indirect.indirect_grefs[i] =
> + src->u.indirect.indirect_grefs[i];
> + break;
> default:
> break;
> }
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> index 8f929cb..9e16abb 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -700,6 +700,14 @@ again:
> goto abort;
> }
>
> + err = xenbus_printf(xbt, dev->nodename, "max-indirect-segments", "%u",
> + MAX_INDIRECT_SEGMENTS);
> + if (err) {
> + xenbus_dev_fatal(dev, err, "writing %s/max-indirect-segments",
> + dev->nodename);
> + goto abort;
> + }
> +
> err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
> (unsigned long long)vbd_sz(&be->blkif->vbd));
> if (err) {
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 4d81fcc..074d302 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -74,12 +74,30 @@ struct grant {
> struct blk_shadow {
> struct blkif_request req;
> struct request *request;
> - struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct grant **grants_used;
> + struct grant **indirect_grants;
> +};
> +
> +struct split_bio {
> + struct bio *bio;
> + atomic_t pending;
> + int err;
> };
>
> static DEFINE_MUTEX(blkfront_mutex);
> static const struct block_device_operations xlvbd_block_fops;
>
> +/*
> + * Maximum number of segments in indirect requests, the actual value used by
> + * the frontend driver is the minimum of this value and the value provided
> + * by the backend driver.
> + */
> +
> +static int xen_blkif_max_segments = 64;
> +module_param_named(max_segments, xen_blkif_max_segments, int, 0);
> +MODULE_PARM_DESC(max_segments,
> +"Maximum number of segments in indirect requests");
> +
> #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
>
> /*
> @@ -98,7 +116,7 @@ struct blkfront_info
> enum blkif_state connected;
> int ring_ref;
> struct blkif_front_ring ring;
> - struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> + struct scatterlist *sg;
> unsigned int evtchn, irq;
> struct request_queue *rq;
> struct work_struct work;
> @@ -114,6 +132,8 @@ struct blkfront_info
> unsigned int discard_granularity;
> unsigned int discard_alignment;
> unsigned int feature_persistent:1;
> + unsigned int max_indirect_segments;
> + unsigned int sector_size;
> int is_ready;
> };
>
> @@ -142,6 +162,14 @@ static DEFINE_SPINLOCK(minor_lock);
>
> #define DEV_NAME "xvd" /* name in /dev */
>
> +#define SEGS_PER_INDIRECT_FRAME \
> + (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
> +#define INDIRECT_GREFS(_segs) \
> + ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
> +#define MIN(_a, _b) ((_a) < (_b) ? (_a) : (_b))
> +
> +static int blkfront_setup_indirect(struct blkfront_info *info);
> +
> static int get_id_from_freelist(struct blkfront_info *info)
> {
> unsigned long free = info->shadow_free;
> @@ -358,7 +386,8 @@ static int blkif_queue_request(struct request *req)
> struct blkif_request *ring_req;
> unsigned long id;
> unsigned int fsect, lsect;
> - int i, ref;
> + int i, ref, n;
> + struct blkif_request_segment_aligned *segments = NULL;
>
> /*
> * Used to store if we are able to queue the request by just using
> @@ -369,21 +398,27 @@ static int blkif_queue_request(struct request *req)
> grant_ref_t gref_head;
> struct grant *gnt_list_entry = NULL;
> struct scatterlist *sg;
> + int nseg, max_grefs;
>
> if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
> return 1;
>
> - /* Check if we have enought grants to allocate a requests */
> - if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
> + max_grefs = info->max_indirect_segments ?
> + info->max_indirect_segments +
> + INDIRECT_GREFS(info->max_indirect_segments) :
> + BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +
> + /* Check if we have enough grants to allocate a requests */
> + if (info->persistent_gnts_c < max_grefs) {
> new_persistent_gnts = 1;
> if (gnttab_alloc_grant_references(
> - BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
> + max_grefs - info->persistent_gnts_c,
> &gref_head) < 0) {
> gnttab_request_free_callback(
> &info->callback,
> blkif_restart_queue_callback,
> info,
> - BLKIF_MAX_SEGMENTS_PER_REQUEST);
> + max_grefs);
> return 1;
> }
> } else
> @@ -394,42 +429,82 @@ static int blkif_queue_request(struct request *req)
> id = get_id_from_freelist(info);
> info->shadow[id].request = req;
>
> - ring_req->u.rw.id = id;
> - ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
> - ring_req->u.rw.handle = info->handle;
> -
> - ring_req->operation = rq_data_dir(req) ?
> - BLKIF_OP_WRITE : BLKIF_OP_READ;
> -
> - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
> - /*
> - * Ideally we can do an unordered flush-to-disk. In case the
> - * backend onlysupports barriers, use that. A barrier request
> - * a superset of FUA, so we can implement it the same
> - * way. (It's also a FLUSH+FUA, since it is
> - * guaranteed ordered WRT previous writes.)
> - */
> - ring_req->operation = info->flush_op;
> - }
> -
> if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
> /* id, sector_number and handle are set above. */
> ring_req->operation = BLKIF_OP_DISCARD;
> ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
> + ring_req->u.discard.id = id;
> + ring_req->u.discard.sector_number =
> + (blkif_sector_t)blk_rq_pos(req);
> if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
> ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
> else
> ring_req->u.discard.flag = 0;
> } else {
> - ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
> - info->sg);
> - BUG_ON(ring_req->u.rw.nr_segments >
> - BLKIF_MAX_SEGMENTS_PER_REQUEST);
> -
> - for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
> + BUG_ON(info->max_indirect_segments == 0 &&
> + req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
> + BUG_ON(info->max_indirect_segments &&
> + req->nr_phys_segments > info->max_indirect_segments);
> + nseg = blk_rq_map_sg(req->q, req, info->sg);
> + if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
> + /* Indirect OP */
> + ring_req->operation = BLKIF_OP_INDIRECT;
> + ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
> + BLKIF_OP_WRITE : BLKIF_OP_READ;
> + ring_req->u.indirect.id = id;
> + ring_req->u.indirect.sector_number =
> + (blkif_sector_t)blk_rq_pos(req);
> + ring_req->u.indirect.handle = info->handle;
> + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
> + /*
> + * Ideally we can do an unordered flush-to-disk. In case the
> + * backend onlysupports barriers, use that. A barrier request
> + * a superset of FUA, so we can implement it the same
> + * way. (It's also a FLUSH+FUA, since it is
> + * guaranteed ordered WRT previous writes.)
> + */
> + ring_req->u.indirect.indirect_op =
> + info->flush_op;
> + }
> + ring_req->u.indirect.nr_segments = nseg;
> + } else {
> + ring_req->u.rw.id = id;
> + ring_req->u.rw.sector_number =
> + (blkif_sector_t)blk_rq_pos(req);
> + ring_req->u.rw.handle = info->handle;
> + ring_req->operation = rq_data_dir(req) ?
> + BLKIF_OP_WRITE : BLKIF_OP_READ;
> + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
> + /*
> + * Ideally we can do an unordered flush-to-disk. In case the
> + * backend onlysupports barriers, use that. A barrier request
> + * a superset of FUA, so we can implement it the same
> + * way. (It's also a FLUSH+FUA, since it is
> + * guaranteed ordered WRT previous writes.)
> + */
> + ring_req->operation = info->flush_op;
> + }
> + ring_req->u.rw.nr_segments = nseg;
> + }
> + for_each_sg(info->sg, sg, nseg, i) {
> fsect = sg->offset >> 9;
> lsect = fsect + (sg->length >> 9) - 1;
>
> + if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
> + (i % SEGS_PER_INDIRECT_FRAME == 0)) {
> + if (segments)
> + kunmap_atomic(segments);
> +
> + n = i / SEGS_PER_INDIRECT_FRAME;
> + gnt_list_entry = get_grant(&gref_head, info);
> + info->shadow[id].indirect_grants[n] =
> + gnt_list_entry;
> + segments = kmap_atomic(
> + pfn_to_page(gnt_list_entry->pfn));
> + ring_req->u.indirect.indirect_grefs[n] =
> + gnt_list_entry->gref;
> + }
> +
> gnt_list_entry = get_grant(&gref_head, info);
> ref = gnt_list_entry->gref;
>
> @@ -461,13 +536,23 @@ static int blkif_queue_request(struct request *req)
> kunmap_atomic(bvec_data);
> kunmap_atomic(shared_data);
> }
> -
> - ring_req->u.rw.seg[i] =
> - (struct blkif_request_segment) {
> - .gref = ref,
> - .first_sect = fsect,
> - .last_sect = lsect };
> + if (ring_req->operation != BLKIF_OP_INDIRECT) {
> + ring_req->u.rw.seg[i] =
> + (struct blkif_request_segment) {
> + .gref = ref,
> + .first_sect = fsect,
> + .last_sect = lsect };
> + } else {
> + n = i % SEGS_PER_INDIRECT_FRAME;
> + segments[n] =
> + (struct blkif_request_segment_aligned) {
> + .gref = ref,
> + .first_sect = fsect,
> + .last_sect = lsect };
> + }
> }
> + if (segments)
> + kunmap_atomic(segments);
> }
>
> info->ring.req_prod_pvt++;
> @@ -542,7 +627,8 @@ wait:
> flush_requests(info);
> }
>
> -static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
> +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
> + unsigned int segments)
> {
> struct request_queue *rq;
> struct blkfront_info *info = gd->private_data;
> @@ -571,7 +657,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
> blk_queue_max_segment_size(rq, PAGE_SIZE);
>
> /* Ensure a merged request will fit in a single I/O ring slot. */
> - blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
> + blk_queue_max_segments(rq, segments);
>
> /* Make sure buffer addresses are sector-aligned. */
> blk_queue_dma_alignment(rq, 511);
> @@ -588,13 +674,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
> static void xlvbd_flush(struct blkfront_info *info)
> {
> blk_queue_flush(info->rq, info->feature_flush);
> - printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
> + printk(KERN_INFO "blkfront: %s: %s: %s %s %s\n",
> info->gd->disk_name,
> info->flush_op == BLKIF_OP_WRITE_BARRIER ?
> "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
> "flush diskcache" : "barrier or flush"),
> info->feature_flush ? "enabled" : "disabled",
> - info->feature_persistent ? "using persistent grants" : "");
> + info->feature_persistent ? "using persistent grants" : "",
> + info->max_indirect_segments ? "using indirect descriptors" : "");
> }
>
> static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
> @@ -734,7 +821,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
> gd->driverfs_dev = &(info->xbdev->dev);
> set_capacity(gd, capacity);
>
> - if (xlvbd_init_blk_queue(gd, sector_size)) {
> + if (xlvbd_init_blk_queue(gd, sector_size,
> + info->max_indirect_segments ? :
> + BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
> del_gendisk(gd);
> goto release;
> }
> @@ -818,6 +907,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
> {
> struct grant *persistent_gnt;
> struct grant *n;
> + int i, j, segs;
>
> /* Prevent new requests being issued until we fix things up. */
> spin_lock_irq(&info->io_lock);
> @@ -843,6 +933,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
> }
> BUG_ON(info->persistent_gnts_c != 0);
>
> + kfree(info->sg);
> + info->sg = NULL;
> + for (i = 0; i < BLK_RING_SIZE; i++) {
> + /*
> + * Clear persistent grants present in requests already
> + * on the shared ring
> + */
> + if (!info->shadow[i].request)
> + goto free_shadow;
> +
> + segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
> + info->shadow[i].req.u.indirect.nr_segments :
> + info->shadow[i].req.u.rw.nr_segments;
> + for (j = 0; j < segs; j++) {
> + persistent_gnt = info->shadow[i].grants_used[j];
> + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
> + __free_page(pfn_to_page(persistent_gnt->pfn));
> + kfree(persistent_gnt);
> + }
> +
> + if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
> + /*
> + * If this is not an indirect operation don't try to
> + * free indirect segments
> + */
> + goto free_shadow;
> +
> + for (j = 0; j < INDIRECT_GREFS(segs); j++) {
> + persistent_gnt = info->shadow[i].indirect_grants[j];
> + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
> + __free_page(pfn_to_page(persistent_gnt->pfn));
> + kfree(persistent_gnt);
> + }
> +
> +free_shadow:
> + kfree(info->shadow[i].grants_used);
> + info->shadow[i].grants_used = NULL;
> + kfree(info->shadow[i].indirect_grants);
> + info->shadow[i].indirect_grants = NULL;
> + }
> +
> /* No more gnttab callback work. */
> gnttab_cancel_free_callback(&info->callback);
> spin_unlock_irq(&info->io_lock);
> @@ -873,6 +1004,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
> char *bvec_data;
> void *shared_data;
> unsigned int offset = 0;
> + int nseg;
> +
> + nseg = s->req.operation == BLKIF_OP_INDIRECT ?
> + s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
>
> if (bret->operation == BLKIF_OP_READ) {
> /*
> @@ -885,7 +1020,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
> BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
> if (bvec->bv_offset < offset)
> i++;
> - BUG_ON(i >= s->req.u.rw.nr_segments);
> + BUG_ON(i >= nseg);
> shared_data = kmap_atomic(
> pfn_to_page(s->grants_used[i]->pfn));
> bvec_data = bvec_kmap_irq(bvec, &flags);
> @@ -897,10 +1032,17 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
> }
> }
> /* Add the persistent grant into the list of free grants */
> - for (i = 0; i < s->req.u.rw.nr_segments; i++) {
> + for (i = 0; i < nseg; i++) {
> list_add(&s->grants_used[i]->node, &info->persistent_gnts);
> info->persistent_gnts_c++;
> }
> + if (s->req.operation == BLKIF_OP_INDIRECT) {
> + for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
> + list_add(&s->indirect_grants[i]->node,
> + &info->persistent_gnts);
> + info->persistent_gnts_c++;
> + }
> + }
> }
>
> static irqreturn_t blkif_interrupt(int irq, void *dev_id)
> @@ -1034,8 +1176,6 @@ static int setup_blkring(struct xenbus_device *dev,
> SHARED_RING_INIT(sring);
> FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
>
> - sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
> -
> err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
> if (err < 0) {
> free_page((unsigned long)sring);
> @@ -1116,12 +1256,6 @@ again:
> goto destroy_blkring;
> }
>
> - /* Allocate memory for grants */
> - err = fill_grant_buffer(info, BLK_RING_SIZE *
> - BLKIF_MAX_SEGMENTS_PER_REQUEST);
> - if (err)
> - goto out;
> -
> xenbus_switch_state(dev, XenbusStateInitialised);
>
> return 0;
> @@ -1223,13 +1357,84 @@ static int blkfront_probe(struct xenbus_device *dev,
> return 0;
> }
>
> +/*
> + * This is a clone of md_trim_bio, used to split a bio into smaller ones
> + */
> +static void trim_bio(struct bio *bio, int offset, int size)
> +{
> + /* 'bio' is a cloned bio which we need to trim to match
> + * the given offset and size.
> + * This requires adjusting bi_sector, bi_size, and bi_io_vec
> + */
> + int i;
> + struct bio_vec *bvec;
> + int sofar = 0;
> +
> + size <<= 9;
> + if (offset == 0 && size == bio->bi_size)
> + return;
> +
> + bio->bi_sector += offset;
> + bio->bi_size = size;
> + offset <<= 9;
> + clear_bit(BIO_SEG_VALID, &bio->bi_flags);
> +
> + while (bio->bi_idx < bio->bi_vcnt &&
> + bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
> + /* remove this whole bio_vec */
> + offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
> + bio->bi_idx++;
> + }
> + if (bio->bi_idx < bio->bi_vcnt) {
> + bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
> + bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
> + }
> + /* avoid any complications with bi_idx being non-zero*/
> + if (bio->bi_idx) {
> + memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
> + (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
> + bio->bi_vcnt -= bio->bi_idx;
> + bio->bi_idx = 0;
> + }
> + /* Make sure vcnt and last bv are not too big */
> + bio_for_each_segment(bvec, bio, i) {
> + if (sofar + bvec->bv_len > size)
> + bvec->bv_len = size - sofar;
> + if (bvec->bv_len == 0) {
> + bio->bi_vcnt = i;
> + break;
> + }
> + sofar += bvec->bv_len;
> + }
> +}
> +
> +static void split_bio_end(struct bio *bio, int error)
> +{
> + struct split_bio *split_bio = bio->bi_private;
> +
> + if (error)
> + split_bio->err = error;
> +
> + if (atomic_dec_and_test(&split_bio->pending)) {
> + split_bio->bio->bi_phys_segments = 0;
> + bio_endio(split_bio->bio, split_bio->err);
> + kfree(split_bio);
> + }
> + bio_put(bio);
> +}
>
> static int blkif_recover(struct blkfront_info *info)
> {
> int i;
> - struct blkif_request *req;
> + struct request *req, *n;
> struct blk_shadow *copy;
> - int j;
> + int rc;
> + struct bio *bio, *cloned_bio;
> + struct bio_list bio_list, merge_bio;
> + unsigned int segs;
> + int pending, offset, size;
> + struct split_bio *split_bio;
> + struct list_head requests;
>
> /* Stage 1: Make a safe copy of the shadow state. */
> copy = kmalloc(sizeof(info->shadow),
> @@ -1245,36 +1450,64 @@ static int blkif_recover(struct blkfront_info *info)
> info->shadow_free = info->ring.req_prod_pvt;
> info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>
> - /* Stage 3: Find pending requests and requeue them. */
> + rc = blkfront_setup_indirect(info);
> + if (rc) {
> + kfree(copy);
> + return rc;
> + }
> +
> + segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
> + blk_queue_max_segments(info->rq, segs);
> + bio_list_init(&bio_list);
> + INIT_LIST_HEAD(&requests);
> for (i = 0; i < BLK_RING_SIZE; i++) {
> /* Not in use? */
> if (!copy[i].request)
> continue;
>
> - /* Grab a request slot and copy shadow state into it. */
> - req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
> - *req = copy[i].req;
> -
> - /* We get a new request id, and must reset the shadow state. */
> - req->u.rw.id = get_id_from_freelist(info);
> - memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
> -
> - if (req->operation != BLKIF_OP_DISCARD) {
> - /* Rewrite any grant references invalidated by susp/resume. */
> - for (j = 0; j < req->u.rw.nr_segments; j++)
> - gnttab_grant_foreign_access_ref(
> - req->u.rw.seg[j].gref,
> - info->xbdev->otherend_id,
> - pfn_to_mfn(copy[i].grants_used[j]->pfn),
> - 0);
> + /*
> + * Get the bios in the request so we can re-queue them.
> + */
> + if (copy[i].request->cmd_flags &
> + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
> + /*
> + * Flush operations don't contain bios, so
> + * we need to requeue the whole request
> + */
> + list_add(&copy[i].request->queuelist, &requests);
> + continue;
> }
> - info->shadow[req->u.rw.id].req = *req;
> -
> - info->ring.req_prod_pvt++;
> + merge_bio.head = copy[i].request->bio;
> + merge_bio.tail = copy[i].request->biotail;
> + bio_list_merge(&bio_list, &merge_bio);
> + copy[i].request->bio = NULL;
> + blk_put_request(copy[i].request);
> }
>
> kfree(copy);
>
> + /*
> + * Empty the queue, this is important because we might have
> + * requests in the queue with more segments than what we
> + * can handle now.
> + */
> + spin_lock_irq(&info->io_lock);
> + while ((req = blk_fetch_request(info->rq)) != NULL) {
> + if (req->cmd_flags &
> + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
> + list_add(&req->queuelist, &requests);
> + continue;
> + }
> + merge_bio.head = req->bio;
> + merge_bio.tail = req->biotail;
> + bio_list_merge(&bio_list, &merge_bio);
> + req->bio = NULL;
> + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
> + pr_alert("diskcache flush request found!\n");
> + __blk_put_request(info->rq, req);
> + }
> + spin_unlock_irq(&info->io_lock);
> +
> xenbus_switch_state(info->xbdev, XenbusStateConnected);
>
> spin_lock_irq(&info->io_lock);
> @@ -1282,14 +1515,50 @@ static int blkif_recover(struct blkfront_info *info)
> /* Now safe for us to use the shared ring */
> info->connected = BLKIF_STATE_CONNECTED;
>
> - /* Send off requeued requests */
> - flush_requests(info);
> -
> /* Kick any other new requests queued since we resumed */
> kick_pending_request_queues(info);
>
> + list_for_each_entry_safe(req, n, &requests, queuelist) {
> + /* Requeue pending requests (flush or discard) */
> + list_del_init(&req->queuelist);
> + BUG_ON(req->nr_phys_segments > segs);
> + blk_requeue_request(info->rq, req);
> + }
> spin_unlock_irq(&info->io_lock);
>
> + while ((bio = bio_list_pop(&bio_list)) != NULL) {
> + /* Traverse the list of pending bios and re-queue them */
> + if (bio_segments(bio) > segs) {
> + /*
> + * This bio has more segments than what we can
> + * handle, we have to split it.
> + */
> + pending = (bio_segments(bio) + segs - 1) / segs;
> + split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
> + BUG_ON(split_bio == NULL);
> + atomic_set(&split_bio->pending, pending);
> + split_bio->bio = bio;
> + for (i = 0; i < pending; i++) {
> + offset = (i * segs * PAGE_SIZE) >> 9;
> + size = MIN((segs * PAGE_SIZE) >> 9,
> + (bio->bi_size >> 9) - offset);
> + cloned_bio = bio_clone(bio, GFP_NOIO);
> + BUG_ON(cloned_bio == NULL);
> + trim_bio(cloned_bio, offset, size);
> + cloned_bio->bi_private = split_bio;
> + cloned_bio->bi_end_io = split_bio_end;
> + submit_bio(cloned_bio->bi_rw, cloned_bio);
> + }
> + /*
> + * Now we have to wait for all those smaller bios to
> + * end, so we can also end the "parent" bio.
> + */
> + continue;
> + }
> + /* We don't need to split this bio */
> + submit_bio(bio->bi_rw, bio);
> + }
> +
> return 0;
> }
>
> @@ -1309,8 +1578,12 @@ static int blkfront_resume(struct xenbus_device *dev)
> blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
>
> err = talk_to_blkback(dev, info);
> - if (info->connected == BLKIF_STATE_SUSPENDED && !err)
> - err = blkif_recover(info);
> +
> + /*
> + * We have to wait for the backend to switch to
> + * connected state, since we want to read which
> + * features it supports.
> + */
>
> return err;
> }
> @@ -1388,6 +1661,62 @@ static void blkfront_setup_discard(struct blkfront_info *info)
> kfree(type);
> }
>
> +static int blkfront_setup_indirect(struct blkfront_info *info)
> +{
> + unsigned int indirect_segments, segs;
> + int err, i;
> +
> + err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
> + "max-indirect-segments", "%u", &indirect_segments,
> + NULL);
> + if (err) {
> + info->max_indirect_segments = 0;
> + segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
> + } else {
> + info->max_indirect_segments = MIN(indirect_segments,
> + xen_blkif_max_segments);
> + segs = info->max_indirect_segments;
> + }
> + info->sg = kzalloc(sizeof(info->sg[0]) * segs, GFP_KERNEL);
> + if (info->sg == NULL)
> + goto out_of_memory;
> + sg_init_table(info->sg, segs);
> +
> + err = fill_grant_buffer(info,
> + (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
> + if (err)
> + goto out_of_memory;
> +
> + for (i = 0; i < BLK_RING_SIZE; i++) {
> + info->shadow[i].grants_used = kzalloc(
> + sizeof(info->shadow[i].grants_used[0]) * segs,
> + GFP_NOIO);
> + if (info->max_indirect_segments)
> + info->shadow[i].indirect_grants = kzalloc(
> + sizeof(info->shadow[i].indirect_grants[0]) *
> + INDIRECT_GREFS(segs),
> + GFP_NOIO);
> + if ((info->shadow[i].grants_used == NULL) ||
> + (info->max_indirect_segments &&
> + (info->shadow[i].indirect_grants == NULL)))
> + goto out_of_memory;
> + }
> +
> +
> + return 0;
> +
> +out_of_memory:
> + kfree(info->sg);
> + info->sg = NULL;
> + for (i = 0; i < BLK_RING_SIZE; i++) {
> + kfree(info->shadow[i].grants_used);
> + info->shadow[i].grants_used = NULL;
> + kfree(info->shadow[i].indirect_grants);
> + info->shadow[i].indirect_grants = NULL;
> + }
> + return -ENOMEM;
> +}
> +
> /*
> * Invoked when the backend is finally 'ready' (and has told produced
> * the details about the physical device - #sectors, size, etc).
> @@ -1415,8 +1744,9 @@ static void blkfront_connect(struct blkfront_info *info)
> set_capacity(info->gd, sectors);
> revalidate_disk(info->gd);
>
> - /* fall through */
> + return;
> case BLKIF_STATE_SUSPENDED:
> + blkif_recover(info);
> return;
>
> default:
> @@ -1437,6 +1767,7 @@ static void blkfront_connect(struct blkfront_info *info)
> info->xbdev->otherend);
> return;
> }
> + info->sector_size = sector_size;
>
> info->feature_flush = 0;
> info->flush_op = 0;
> @@ -1484,6 +1815,13 @@ static void blkfront_connect(struct blkfront_info *info)
> else
> info->feature_persistent = persistent;
>
> + err = blkfront_setup_indirect(info);
> + if (err) {
> + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
> + info->xbdev->otherend);
> + return;
> + }
> +
> err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
> if (err) {
> xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
> diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
> index 01c3d62..6d99849 100644
> --- a/include/xen/interface/io/blkif.h
> +++ b/include/xen/interface/io/blkif.h
> @@ -102,6 +102,8 @@ typedef uint64_t blkif_sector_t;
> */
> #define BLKIF_OP_DISCARD 5
>
> +#define BLKIF_OP_INDIRECT 6
> +
> /*
> * Maximum scatter/gather segments per request.
> * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
> @@ -109,6 +111,16 @@ typedef uint64_t blkif_sector_t;
> */
> #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
>
> +#define BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST 8
> +
> +struct blkif_request_segment_aligned {
> + grant_ref_t gref; /* reference to I/O buffer frame */
> + /* @first_sect: first sector in frame to transfer (inclusive). */
> + /* @last_sect: last sector in frame to transfer (inclusive). */
> + uint8_t first_sect, last_sect;
> + uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
> +} __attribute__((__packed__));
> +
> struct blkif_request_rw {
> uint8_t nr_segments; /* number of segments */
> blkif_vdev_t handle; /* only for read/write requests */
> @@ -138,11 +150,24 @@ struct blkif_request_discard {
> uint8_t _pad3;
> } __attribute__((__packed__));
>
> +struct blkif_request_indirect {
> + uint8_t indirect_op;
> + uint16_t nr_segments;
> +#ifdef CONFIG_X86_64
> + uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */
> +#endif
> + uint64_t id;
> + blkif_vdev_t handle;
> + blkif_sector_t sector_number;
> + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST];
> +} __attribute__((__packed__));
> +
> struct blkif_request {
> uint8_t operation; /* BLKIF_OP_??? */
> union {
> struct blkif_request_rw rw;
> struct blkif_request_discard discard;
> + struct blkif_request_indirect indirect;
> } u;
> } __attribute__((__packed__));
>
> --
> 1.7.7.5 (Apple Git-26)
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists