linux-kernel - Re: [PATCH] io-controller: implement per group request allocation limitation

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20090713160352.GA3714@redhat.com>
Date:	Mon, 13 Jul 2009 12:03:52 -0400
From:	Vivek Goyal <vgoyal@...hat.com>
To:	Gui Jianfeng <guijianfeng@...fujitsu.com>
Cc:	linux-kernel@...r.kernel.org,
	containers@...ts.linux-foundation.org, dm-devel@...hat.com,
	jens.axboe@...cle.com, nauman@...gle.com, dpshah@...gle.com,
	lizf@...fujitsu.com, mikew@...gle.com, fchecconi@...il.com,
	paolo.valente@...more.it, ryov@...inux.co.jp,
	fernando@....ntt.co.jp, s-uchida@...jp.nec.com, taka@...inux.co.jp,
	jmoyer@...hat.com, dhaval@...ux.vnet.ibm.com,
	balbir@...ux.vnet.ibm.com, righi.andrea@...il.com,
	m-ikeda@...jp.nec.com, jbaron@...hat.com, agk@...hat.com,
	snitzer@...hat.com, akpm@...ux-foundation.org, peterz@...radead.org
Subject: Re: [PATCH] io-controller: implement per group request allocation
	limitation

On Fri, Jul 10, 2009 at 09:56:21AM +0800, Gui Jianfeng wrote:
> Hi Vivek,
> 
> This patch exports a cgroup based per group request limits interface.
> and removes the global one. Now we can use this interface to perform
> different request allocation limitation for different groups. 
> 

Thanks Gui. Few points come to mind.

- You seem to be making this as per cgroup limit on all devices. I guess
  that different devices in the system can have different settings of
  q->nr_requests and hence will probably want different per group limit.
  So we might have to make it per cgroup per device limit.

- There does not seem to be any checks for making sure that children
  cgroups don't have more request descriptors allocated than parent group.

- I am re-thinking that what's the advantage of configuring request
  descriptors also through cgroups. It does bring in additional complexity
  with it and it should justfiy the advantages. Can you think of some?

  Until and unless we can come up with some significant advantages, I will
  prefer to continue to use per group limit through q->nr_group_requests
  interface instead of cgroup. Once things stablize, we can revisit it and
  see how this interface can be improved.

Thanks
Vivek

> Signed-off-by: Gui Jianfeng <guijianfeng@...fujitsu.com>
> ---
>  block/blk-core.c     |   23 ++++++++++--
>  block/blk-settings.c |    1 -
>  block/blk-sysfs.c    |   43 -----------------------
>  block/elevator-fq.c  |   94 ++++++++++++++++++++++++++++++++++++++++++++++---
>  block/elevator-fq.h  |    4 ++
>  5 files changed, 111 insertions(+), 54 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 79fe6a9..7010b76 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -722,13 +722,20 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
>  static void __freed_request(struct request_queue *q, int sync,
>  					struct request_list *rl)
>  {
> +	struct io_group *iog;
> +	unsigned long nr_group_requests;
> +
>  	if (q->rq_data.count[sync] < queue_congestion_off_threshold(q))
>  		blk_clear_queue_congested(q, sync);
>  
>  	if (q->rq_data.count[sync] + 1 <= q->nr_requests)
>  		blk_clear_queue_full(q, sync);
>  
> -	if (rl->count[sync] + 1 <= q->nr_group_requests) {
> +	iog = rl_iog(rl);
> +
> +	nr_group_requests = get_group_requests(q, iog);
> +
> +	if (nr_group_requests && rl->count[sync] + 1 <= nr_group_requests) {
>  		if (waitqueue_active(&rl->wait[sync]))
>  			wake_up(&rl->wait[sync]);
>  	}
> @@ -828,6 +835,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
>  	const bool is_sync = rw_is_sync(rw_flags) != 0;
>  	int may_queue, priv;
>  	int sleep_on_global = 0;
> +	struct io_group *iog;
> +	unsigned long nr_group_requests;
>  
>  	may_queue = elv_may_queue(q, rw_flags);
>  	if (may_queue == ELV_MQUEUE_NO)
> @@ -843,7 +852,12 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
>  	if (q->rq_data.count[is_sync]+1 >= q->nr_requests)
>  		blk_set_queue_full(q, is_sync);
>  
> -	if (rl->count[is_sync]+1 >= q->nr_group_requests) {
> +	iog = rl_iog(rl);
> +
> +	nr_group_requests = get_group_requests(q, iog);
> +
> +	if (nr_group_requests &&
> +	    rl->count[is_sync]+1 >= nr_group_requests) {
>  		ioc = current_io_context(GFP_ATOMIC, q->node);
>  		/*
>  		 * The queue request descriptor group will fill after this
> @@ -852,7 +866,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
>  		 * This process will be allowed to complete a batch of
>  		 * requests, others will be blocked.
>  		 */
> -		if (rl->count[is_sync] <= q->nr_group_requests)
> +		if (rl->count[is_sync] <= nr_group_requests)
>  			ioc_set_batching(q, ioc);
>  		else {
>  			if (may_queue != ELV_MQUEUE_MUST
> @@ -898,7 +912,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
>  	 * from per group request list
>  	 */
>  
> -	if (rl->count[is_sync] >= (3 * q->nr_group_requests / 2))
> +	if (nr_group_requests &&
> +	    rl->count[is_sync] >= (3 * nr_group_requests / 2))
>  		goto out;
>  
>  	rl->starved[is_sync] = 0;
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 78b8aec..bd582a7 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -148,7 +148,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
>  	 * set defaults
>  	 */
>  	q->nr_requests = BLKDEV_MAX_RQ;
> -	q->nr_group_requests = BLKDEV_MAX_GROUP_RQ;
>  
>  	q->make_request_fn = mfn;
>  	blk_queue_dma_alignment(q, 511);
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 92b9f25..706d852 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -78,40 +78,8 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
>  	return ret;
>  }
>  #ifdef CONFIG_GROUP_IOSCHED
> -static ssize_t queue_group_requests_show(struct request_queue *q, char *page)
> -{
> -	return queue_var_show(q->nr_group_requests, (page));
> -}
> -
>  extern void elv_io_group_congestion_threshold(struct request_queue *q,
>  					      struct io_group *iog);
> -
> -static ssize_t
> -queue_group_requests_store(struct request_queue *q, const char *page,
> -					size_t count)
> -{
> -	struct hlist_node *n;
> -	struct io_group *iog;
> -	struct elv_fq_data *efqd;
> -	unsigned long nr;
> -	int ret = queue_var_store(&nr, page, count);
> -
> -	if (nr < BLKDEV_MIN_RQ)
> -		nr = BLKDEV_MIN_RQ;
> -
> -	spin_lock_irq(q->queue_lock);
> -
> -	q->nr_group_requests = nr;
> -
> -	efqd = &q->elevator->efqd;
> -
> -	hlist_for_each_entry(iog, n, &efqd->group_list, elv_data_node) {
> -		elv_io_group_congestion_threshold(q, iog);
> -	}
> -
> -	spin_unlock_irq(q->queue_lock);
> -	return ret;
> -}
>  #endif
>  
>  static ssize_t queue_ra_show(struct request_queue *q, char *page)
> @@ -278,14 +246,6 @@ static struct queue_sysfs_entry queue_requests_entry = {
>  	.store = queue_requests_store,
>  };
>  
> -#ifdef CONFIG_GROUP_IOSCHED
> -static struct queue_sysfs_entry queue_group_requests_entry = {
> -	.attr = {.name = "nr_group_requests", .mode = S_IRUGO | S_IWUSR },
> -	.show = queue_group_requests_show,
> -	.store = queue_group_requests_store,
> -};
> -#endif
> -
>  static struct queue_sysfs_entry queue_ra_entry = {
>  	.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
>  	.show = queue_ra_show,
> @@ -360,9 +320,6 @@ static struct queue_sysfs_entry queue_iostats_entry = {
>  
>  static struct attribute *default_attrs[] = {
>  	&queue_requests_entry.attr,
> -#ifdef CONFIG_GROUP_IOSCHED
> -	&queue_group_requests_entry.attr,
> -#endif
>  	&queue_ra_entry.attr,
>  	&queue_max_hw_sectors_entry.attr,
>  	&queue_max_sectors_entry.attr,
> diff --git a/block/elevator-fq.c b/block/elevator-fq.c
> index 29392e7..bfb0210 100644
> --- a/block/elevator-fq.c
> +++ b/block/elevator-fq.c
> @@ -59,6 +59,35 @@ elv_release_ioq(struct elevator_queue *eq, struct io_queue **ioq_ptr);
>  #define for_each_entity_safe(entity, parent) \
>  	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
>  
> +unsigned short get_group_requests(struct request_queue *q,
> +				  struct io_group *iog)
> +{
> +	struct cgroup_subsys_state *css;
> +	struct io_cgroup *iocg;
> +	unsigned long nr_group_requests;
> +
> +	if (!iog)
> +		return q->nr_requests;
> +
> +	rcu_read_lock();
> +
> +	if (!iog->iocg_id) {
> +		nr_group_requests = 0;
> +		goto out;
> +	}
> +
> +	css = css_lookup(&io_subsys, iog->iocg_id);
> +	if (!css) {
> +		nr_group_requests = 0;
> +		goto out;
> +	}
> +
> +	iocg = container_of(css, struct io_cgroup, css);
> +	nr_group_requests = iocg->nr_group_requests;
> +out:
> +	rcu_read_unlock();
> +	return nr_group_requests;
> +}
>  
>  static struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
>  						 int extract);
> @@ -1257,14 +1286,17 @@ void elv_io_group_congestion_threshold(struct request_queue *q,
>  						struct io_group *iog)
>  {
>  	int nr;
> +	unsigned long nr_group_requests;
>  
> -	nr = q->nr_group_requests - (q->nr_group_requests / 8) + 1;
> -	if (nr > q->nr_group_requests)
> -		nr = q->nr_group_requests;
> +	nr_group_requests = get_group_requests(q, iog);
> +
> +	nr = nr_group_requests - (nr_group_requests / 8) + 1;
> +	if (nr > nr_group_requests)
> +		nr = nr_group_requests;
>  	iog->nr_congestion_on = nr;
>  
> -	nr = q->nr_group_requests - (q->nr_group_requests / 8)
> -			- (q->nr_group_requests / 16) - 1;
> +	nr = nr_group_requests - (nr_group_requests / 8)
> +			- (nr_group_requests / 16) - 1;
>  	if (nr < 1)
>  		nr = 1;
>  	iog->nr_congestion_off = nr;
> @@ -1283,6 +1315,7 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync)
>  {
>  	struct io_group *iog;
>  	int ret = 0;
> +	unsigned long nr_group_requests;
>  
>  	rcu_read_lock();
>  
> @@ -1300,10 +1333,11 @@ int elv_io_group_congested(struct request_queue *q, struct page *page, int sync)
>  	}
>  
>  	ret = elv_is_iog_congested(q, iog, sync);
> +	nr_group_requests = get_group_requests(q, iog);
>  	if (ret)
>  		elv_log_iog(&q->elevator->efqd, iog, "iog congested=%d sync=%d"
>  			" rl.count[sync]=%d nr_group_requests=%d",
> -			ret, sync, iog->rl.count[sync], q->nr_group_requests);
> +			ret, sync, iog->rl.count[sync], nr_group_requests);
>  	rcu_read_unlock();
>  	return ret;
>  }
> @@ -1549,6 +1583,48 @@ free_buf:
>  	return ret;
>  }
>  
> +static u64 io_cgroup_nr_requests_read(struct cgroup *cgroup,
> +				       struct cftype *cftype)
> +{
> +	struct io_cgroup *iocg;
> +	u64 ret;
> +
> +	if (!cgroup_lock_live_group(cgroup))
> +		return -ENODEV;
> +
> +	iocg = cgroup_to_io_cgroup(cgroup);
> +	spin_lock_irq(&iocg->lock);
> +	ret = iocg->nr_group_requests;
> +	spin_unlock_irq(&iocg->lock);
> +
> +	cgroup_unlock();
> +
> +	return ret;
> +}
> +
> +static int io_cgroup_nr_requests_write(struct cgroup *cgroup,
> +					struct cftype *cftype,
> +					u64 val)
> +{
> +	struct io_cgroup *iocg;
> +
> +	if (val < BLKDEV_MIN_RQ)
> +		val = BLKDEV_MIN_RQ;
> +
> +	if (!cgroup_lock_live_group(cgroup))
> +		return -ENODEV;
> +
> +	iocg = cgroup_to_io_cgroup(cgroup);
> +
> +	spin_lock_irq(&iocg->lock);
> +	iocg->nr_group_requests = (unsigned long)val;
> +	spin_unlock_irq(&iocg->lock);
> +
> +	cgroup_unlock();
> +
> +	return 0;
> +}
> +
>  #define SHOW_FUNCTION(__VAR)						\
>  static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup,		\
>  				       struct cftype *cftype)		\
> @@ -1735,6 +1811,11 @@ static int io_cgroup_disk_dequeue_read(struct cgroup *cgroup,
>  
>  struct cftype bfqio_files[] = {
>  	{
> +		.name = "nr_group_requests",
> +		.read_u64 = io_cgroup_nr_requests_read,
> +		.write_u64 = io_cgroup_nr_requests_write,
> +	},
> +	{
>  		.name = "policy",
>  		.read_seq_string = io_cgroup_policy_read,
>  		.write_string = io_cgroup_policy_write,
> @@ -1790,6 +1871,7 @@ static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys,
>  
>  	spin_lock_init(&iocg->lock);
>  	INIT_HLIST_HEAD(&iocg->group_data);
> +	iocg->nr_group_requests = BLKDEV_MAX_GROUP_RQ;
>  	iocg->weight = IO_DEFAULT_GRP_WEIGHT;
>  	iocg->ioprio_class = IO_DEFAULT_GRP_CLASS;
>  	INIT_LIST_HEAD(&iocg->policy_list);
> diff --git a/block/elevator-fq.h b/block/elevator-fq.h
> index f089a55..df077d0 100644
> --- a/block/elevator-fq.h
> +++ b/block/elevator-fq.h
> @@ -308,6 +308,7 @@ struct io_cgroup {
>  	unsigned int weight;
>  	unsigned short ioprio_class;
>  
> +	unsigned long nr_group_requests;
>  	/* list of io_policy_node */
>  	struct list_head policy_list;
>  
> @@ -386,6 +387,9 @@ struct elv_fq_data {
>  	unsigned int fairness;
>  };
>  
> +extern unsigned short get_group_requests(struct request_queue *q,
> +					 struct io_group *iog);
> +
>  /* Logging facilities. */
>  #ifdef CONFIG_DEBUG_GROUP_IOSCHED
>  #define elv_log_ioq(efqd, ioq, fmt, args...) \
> -- 
> 1.5.4.rc3 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/