Date:	Tue, 18 May 2010 17:44:37 -0400
From:	Vivek Goyal <vgoyal@...hat.com>
To:	Jeff Moyer <jmoyer@...hat.com>
Cc:	linux-kernel@...r.kernel.org, linux-ext4@...r.kernel.org,
	jens.axboe@...cle.com
Subject: Re: [PATCH 2/4] block: Implement a blk_yield function to
	voluntarily give up the I/O scheduler.

On Tue, May 18, 2010 at 02:20:18PM -0400, Jeff Moyer wrote:
> This patch implements a blk_yield to allow a process to voluntarily
> give up its I/O scheduler time slice.  This is desirable for those processes
> which know that they will be blocked on I/O from another process, such as
> the file system journal thread.  Following patches will put calls to blk_yield
> into jbd and jbd2.
> 
> Signed-off-by: Jeff Moyer <jmoyer@...hat.com>
> ---
>  block/blk-core.c         |   13 +++++
>  block/blk-settings.c     |    6 +++
>  block/cfq-iosched.c      |  112 +++++++++++++++++++++++++++++++++++++++++++++-
>  block/elevator.c         |    8 +++
>  include/linux/blkdev.h   |    4 ++
>  include/linux/elevator.h |    3 +
>  6 files changed, 144 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 9fe174d..b8be6c8 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -323,6 +323,18 @@ void blk_unplug(struct request_queue *q)
>  }
>  EXPORT_SYMBOL(blk_unplug);
>  
> +void generic_yield_iosched(struct request_queue *q, struct task_struct *tsk)
> +{
> +	elv_yield(q, tsk);
> +}
> +
> +void blk_yield(struct request_queue *q, struct task_struct *tsk)
> +{
> +	if (q->yield_fn)
> +		q->yield_fn(q, tsk);
> +}
> +EXPORT_SYMBOL(blk_yield);
> +
>  /**
>   * blk_start_queue - restart a previously stopped queue
>   * @q:    The &struct request_queue in question
> @@ -580,6 +592,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
>  	q->request_fn		= rfn;
>  	q->prep_rq_fn		= NULL;
>  	q->unplug_fn		= generic_unplug_device;
> +	q->yield_fn		= generic_yield_iosched;
>  	q->queue_flags		= QUEUE_FLAG_DEFAULT;
>  	q->queue_lock		= lock;
>  
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index f5ed5a1..fe548c9 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -171,6 +171,12 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
>  }
>  EXPORT_SYMBOL(blk_queue_make_request);
>  
> +void blk_queue_yield(struct request_queue *q, yield_fn *yield)
> +{
> +	q->yield_fn = yield;
> +}
> +EXPORT_SYMBOL_GPL(blk_queue_yield);
> +
>  /**
>   * blk_queue_bounce_limit - set bounce buffer limit for queue
>   * @q: the request queue for the device
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 46a7fe5..9aab701 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -148,6 +148,7 @@ struct cfq_queue {
>  	struct cfq_queue *new_cfqq;
>  	struct cfq_group *cfqg;
>  	struct cfq_group *orig_cfqg;
> +	struct cfq_io_context *yield_to;
>  	/* Sectors dispatched in current dispatch round */
>  	unsigned long nr_sectors;
>  };
> @@ -320,6 +321,7 @@ enum cfqq_state_flags {
>  	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
>  	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
>  	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
> +	CFQ_CFQQ_FLAG_yield,		/* Allow another cfqq to run */
>  };
>  
>  #define CFQ_CFQQ_FNS(name)						\
> @@ -349,6 +351,7 @@ CFQ_CFQQ_FNS(coop);
>  CFQ_CFQQ_FNS(split_coop);
>  CFQ_CFQQ_FNS(deep);
>  CFQ_CFQQ_FNS(wait_busy);
> +CFQ_CFQQ_FNS(yield);
>  #undef CFQ_CFQQ_FNS
>  
>  #ifdef CONFIG_DEBUG_CFQ_IOSCHED
> @@ -1566,6 +1569,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>  
>  	cfq_clear_cfqq_wait_request(cfqq);
>  	cfq_clear_cfqq_wait_busy(cfqq);
> +	cfq_clear_cfqq_yield(cfqq);
>  
>  	/*
>  	 * If this cfqq is shared between multiple processes, check to
> @@ -2068,7 +2072,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  		slice = max(slice, 2 * cfqd->cfq_slice_idle);
>  
>  	slice = max_t(unsigned, slice, CFQ_MIN_TT);
> -	cfq_log(cfqd, "workload slice:%d", slice);
> +	cfq_log(cfqd, "workload:%d slice:%d", cfqd->serving_type, slice);
>  	cfqd->workload_expires = jiffies + slice;
>  	cfqd->noidle_tree_requires_idle = false;
>  }
> @@ -2138,7 +2142,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>  		 * ok to wait for this request to complete.
>  		 */
>  		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
> -		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
> +		    && cfqq->dispatched && !cfq_cfqq_yield(cfqq) &&
> +		    cfq_should_idle(cfqd, cfqq)) {
>  			cfqq = NULL;
>  			goto keep_queue;
>  		} else
> @@ -2165,6 +2170,9 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
>  		goto expire;
>  	}
>  
> +	if (cfq_cfqq_yield(cfqq))
> +		goto expire;
> +
>  	/*
>  	 * No requests pending. If the active queue still has requests in
>  	 * flight or is idling for a new request, allow either of these
> @@ -2191,6 +2199,98 @@ keep_queue:
>  	return cfqq;
>  }
>  
> +static void cfq_yield(struct request_queue *q, struct task_struct *tsk)
> +{
> +	struct cfq_data *cfqd = q->elevator->elevator_data;
> +	struct cfq_io_context *cic, *new_cic;
> +	struct cfq_queue *cfqq, *sync_cfqq, *async_cfqq, *new_cfqq;
> +
> +	cic = cfq_cic_lookup(cfqd, current->io_context);
> +	if (!cic)
> +		return;
> +
> +	task_lock(tsk);
> +	new_cic = cfq_cic_lookup(cfqd, tsk->io_context);
> +	atomic_long_inc(&tsk->io_context->refcount);
> +	task_unlock(tsk);
> +	if (!new_cic)
> +		goto out_dec;
> +
> +	spin_lock_irq(q->queue_lock);
> +
> +	/*
> +	 * This is primarily called to ensure that the long synchronous
> +	 * time slice does not prevent other I/O happening (like journal
> +	 * commits) while we idle waiting for it.  Thus, check to see if the
> +	 * current cfqq is the sync cfqq for this process.
> +	 */
> +	cfqq = cic_to_cfqq(cic, 1);
> +	if (!cfqq)
> +		goto out_unlock;
> +
> +	if (cfqd->active_queue != cfqq)
> +		goto out_unlock;

Why does the yielding queue have to be the active queue? A queue could
submit a bunch of IO and then yield while some other queue is active.
Even in that case, wouldn't we want to mark the cfqq as yielding and
expire it once its pending requests have been dispatched?
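Just to illustrate, something like this (completely untested, reusing the
helpers from your patch):

	cfqq = cic_to_cfqq(cic, 1);
	if (!cfqq)
		goto out_unlock;

	/*
	 * Mark the queue as yielding even when it is not the active queue,
	 * so that it gets expired once its pending requests are dispatched.
	 */
	cfq_mark_cfqq_yield(cfqq);
	cfqq->yield_to = new_cic;

	if (cfqd->active_queue != cfqq)
		goto out_unlock;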

> +
> +	sync_cfqq = cic_to_cfqq(new_cic, 1);
> +	async_cfqq = cic_to_cfqq(new_cic, 0);
> +	if (!sync_cfqq && !async_cfqq)
> +		goto out_unlock;
> +	if (!RB_EMPTY_ROOT(&sync_cfqq->sort_list))
> +		new_cfqq = sync_cfqq;
> +	else
> +		new_cfqq = async_cfqq;
> +

Why is it mandatory that the target context already has a queue set up? If
there is no queue yet, can't we just yield the slice and let somebody
else run?

Oh, I guess you want to keep the slice and idle until the target queue
dispatches some requests and sets up its context and queue? But even that
will not help, as by then you have already missed the yield event.
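If the latter, maybe it is simpler to just give up the slice when the
target has no queue yet, e.g. (untested sketch):

	sync_cfqq = cic_to_cfqq(new_cic, 1);
	async_cfqq = cic_to_cfqq(new_cic, 0);
	if (!sync_cfqq && !async_cfqq) {
		/* Target has no queue set up yet; just give up the slice. */
		__cfq_slice_expired(cfqd, cfqq, 1);
		goto out_unlock;
	}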

> +	/*
> +	 * If we are currently servicing the SYNC_NOIDLE_WORKLOAD, and we
> +	 * are idling on the last queue in that workload, *and* the average
> +	 * think time is larger than the remaining slice time, go ahead
> +	 * and yield the queue.  Otherwise, don't yield so that fsync-heavy
> +	 * workloads don't starve out the sync-noidle workload.
> +	 */
> +	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
> +	    (!sample_valid(cfqq->service_tree->ttime_samples) ||
> +	     cfqq->slice_end - jiffies > cfqq->service_tree->ttime_mean)) {
> +		/*

This is confusing. The yielding queue's stats are already part of the
service tree's think time. So if the yielding queue has done a bunch of IO,
the service tree's mean think time will be low and we will not yield? I
guess that's why you are trying to check the average number of queues in
the following code.

> +		 * If there's only a single sync-noidle queue on average,
> +		 * there's no point in idling in order to not starve the
> +		 * non-existent other queues.
> +		 */
> +		if (cfq_group_busy_queues_wl(SYNC_NOIDLE_WORKLOAD, cfqd,
> +					     cfqq->cfqg) > 1)

Does cfq_group_busy_queues_wl() give the average number of queues? It looks
like a plain busy-queue count to me. Were you looking for
cfqg->busy_queues_avg?
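IOW something along these lines, assuming I am reading the
busy_queues_avg[] indexing right (untested):

		if (cfqq->cfqg->busy_queues_avg[cfq_class_rt(cfqq)] > 1)
			goto out_unlock;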

> +			goto out_unlock;
> +	}
> +
> +	cfq_log_cfqq(cfqd, cfqq, "yielding queue");
> +
> +	/*
> +	 * If there are other requests pending, just mark the queue as
> +	 * yielding and give up our slice after the last request is
> +	 * dispatched.  Or, if there are no requests currently pending
> +	 * on the selected queue, keep our time slice until such a time
> +	 * as the new queue has something to do.  It will then preempt
> +	 * this queue (see cfq_should_preempt).
> +	 */
> +	if (!RB_EMPTY_ROOT(&cfqq->sort_list) ||
> +	    RB_EMPTY_ROOT(&new_cfqq->sort_list)) {
> +		cfqq->yield_to = new_cic;

Where are you clearing the yield_to pointer? I can't seem to find it.

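I would have expected it to be reset next to the yield flag, e.g. in
__cfq_slice_expired() (untested):

	cfq_clear_cfqq_yield(cfqq);
	cfqq->yield_to = NULL;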

> +		cfq_mark_cfqq_yield(cfqq);
> +		goto out_unlock;
> +	}
> +
> +	/*
> +	 * At this point, we know that the target queue has requests pending.
> +	 * Yield the io scheduler.
> +	 */
> +	__cfq_slice_expired(cfqd, cfqq, 1);
> +	__cfq_set_active_queue(cfqd, new_cfqq);

What happens to cfq_group considerations? As written, we can yield slices
across groups, which does not seem right. If the existing group has
requests on other service trees, those should be processed first.

Can we handle all of this logic in select_queue() instead? I think it might
turn out to be simpler: in this function just mark the existing queue as
yielding, record who to yield it to, and let select_queue() decide whether
to idle or to pick a new queue. It should be possible, and the code might
turn out to be easier to read; a rough sketch follows.
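Roughly something like this in cfq_select_queue(), just to show the shape
of it (untested):

	/* Active queue has yielded; switch only when the target has work. */
	if (cfq_cfqq_yield(cfqq) && cfqq->yield_to) {
		struct cfq_queue *target = cic_to_cfqq(cfqq->yield_to, 1);

		if (target && !RB_EMPTY_ROOT(&target->sort_list)) {
			new_cfqq = target;
			goto expire;
		}
	}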

Vivek

> +	__blk_run_queue(cfqd->queue);
> +
> +out_unlock:
> +	spin_unlock_irq(q->queue_lock);
> +out_dec:
> +	put_io_context(tsk->io_context);
> +}
> +
>  static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
>  {
>  	int dispatched = 0;
> @@ -3094,6 +3194,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
>  	if (!cfqq)
>  		return false;
>  
> +	/*
> +	 * If the active queue yielded its timeslice to this queue, let
> +	 * it preempt.
> +	 */
> +	if (cfq_cfqq_yield(cfqq) && RQ_CIC(rq) == cfqq->yield_to)
> +		return true;
> +
>  	if (cfq_class_idle(new_cfqq))
>  		return false;
>  
> @@ -3910,6 +4017,7 @@ static struct elevator_type iosched_cfq = {
>  		.elevator_deactivate_req_fn =	cfq_deactivate_request,
>  		.elevator_queue_empty_fn =	cfq_queue_empty,
>  		.elevator_completed_req_fn =	cfq_completed_request,
> +		.elevator_yield_fn =		cfq_yield,
>  		.elevator_former_req_fn =	elv_rb_former_request,
>  		.elevator_latter_req_fn =	elv_rb_latter_request,
>  		.elevator_set_req_fn =		cfq_set_request,
> diff --git a/block/elevator.c b/block/elevator.c
> index 76e3702..3af7796 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -855,6 +855,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
>  	}
>  }
>  
> +void elv_yield(struct request_queue *q, struct task_struct *tsk)
> +{
> +	struct elevator_queue *e = q->elevator;
> +
> +	if (e && e->ops->elevator_yield_fn)
> +		e->ops->elevator_yield_fn(q, tsk);
> +}
> +
>  #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
>  
>  static ssize_t
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 6690e8b..1099d29 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -259,6 +259,7 @@ struct request_pm_state
>  
>  typedef void (request_fn_proc) (struct request_queue *q);
>  typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
> +typedef void (yield_fn) (struct request_queue *q, struct task_struct *tsk);
>  typedef int (prep_rq_fn) (struct request_queue *, struct request *);
>  typedef void (unplug_fn) (struct request_queue *);
>  
> @@ -341,6 +342,7 @@ struct request_queue
>  
>  	request_fn_proc		*request_fn;
>  	make_request_fn		*make_request_fn;
> +	yield_fn		*yield_fn;
>  	prep_rq_fn		*prep_rq_fn;
>  	unplug_fn		*unplug_fn;
>  	merge_bvec_fn		*merge_bvec_fn;
> @@ -833,6 +835,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *,
>  extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
>  				  struct request *, int, rq_end_io_fn *);
>  extern void blk_unplug(struct request_queue *q);
> +extern void blk_yield(struct request_queue *q, struct task_struct *tsk);
>  
>  static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
>  {
> @@ -920,6 +923,7 @@ extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
>  extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
>  extern void blk_cleanup_queue(struct request_queue *);
>  extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
> +extern void blk_queue_yield(struct request_queue *, yield_fn *);
>  extern void blk_queue_bounce_limit(struct request_queue *, u64);
>  extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
>  extern void blk_queue_max_segments(struct request_queue *, unsigned short);
> diff --git a/include/linux/elevator.h b/include/linux/elevator.h
> index 1cb3372..a5aaee9 100644
> --- a/include/linux/elevator.h
> +++ b/include/linux/elevator.h
> @@ -20,6 +20,7 @@ typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
>  typedef int (elevator_queue_empty_fn) (struct request_queue *);
>  typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
>  typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
> +typedef void (elevator_yield_fn) (struct request_queue *, struct task_struct *tsk);
>  typedef int (elevator_may_queue_fn) (struct request_queue *, int);
>  
>  typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
> @@ -44,6 +45,7 @@ struct elevator_ops
>  
>  	elevator_queue_empty_fn *elevator_queue_empty_fn;
>  	elevator_completed_req_fn *elevator_completed_req_fn;
> +	elevator_yield_fn *elevator_yield_fn;
>  
>  	elevator_request_list_fn *elevator_former_req_fn;
>  	elevator_request_list_fn *elevator_latter_req_fn;
> @@ -105,6 +107,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
>  extern void elv_merged_request(struct request_queue *, struct request *, int);
>  extern void elv_requeue_request(struct request_queue *, struct request *);
>  extern int elv_queue_empty(struct request_queue *);
> +extern void elv_yield(struct request_queue *, struct task_struct *);
>  extern struct request *elv_former_request(struct request_queue *, struct request *);
>  extern struct request *elv_latter_request(struct request_queue *, struct request *);
>  extern int elv_register_queue(struct request_queue *q);
> -- 
> 1.6.5.2
