linux-kernel - Re: [RFC] [PATCH 8/8] cfq-iosched: Introduce hierarchical scheduling with CFQ queue and group at the same level

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20101129143118.GC4534@redhat.com>
Date:	Mon, 29 Nov 2010 09:31:18 -0500
From:	Vivek Goyal <vgoyal@...hat.com>
To:	Gui Jianfeng <guijianfeng@...fujitsu.com>
Cc:	Jens Axboe <axboe@...nel.dk>, Corrado Zoccolo <czoccolo@...il.com>,
	Chad Talbott <ctalbott@...gle.com>,
	Nauman Rafique <nauman@...gle.com>,
	Divyesh Shah <dpshah@...gle.com>,
	linux kernel mailing list <linux-kernel@...r.kernel.org>
Subject: Re: [RFC] [PATCH 8/8] cfq-iosched: Introduce hierarchical scheduling
 with CFQ queue and group at the same level

On Mon, Nov 29, 2010 at 10:42:15AM +0800, Gui Jianfeng wrote:
> Vivek Goyal wrote:
> > On Sun, Nov 14, 2010 at 04:25:49PM +0800, Gui Jianfeng wrote:
> >> This patch makes CFQ queues and CFQ groups schedule at the same level.
> >> Consider the following hierarchy:
> >>
> >>                     Root
> >>                    / | \
> >>                  q1 q2 G1
> >>                       / \
> >>                     q3  G2 
> >>
> >> q1, q2 and q3 are CFQ queues; G1 and G2 are CFQ groups. Currently, q1, q2
> >> and G1 are scheduled on the same service tree in the Root CFQ group. q3 and G2
> >> are scheduled under G1. Note, for the time being, a CFQ group is treated
> >> as a "BE and SYNC" workload, and is put on the "BE and SYNC" service tree. That
> >> means service differentiation only happens in the "BE and SYNC" service tree.
> >> Later, we may introduce an "IO Class" for CFQ groups.
> >>
> > 
> > Have you got rid of flat mode (the existing default mode)? IOW, I don't see
> > the introduction of "use_hierarchy", which would determine whether to
> > treat a hierarchy as flat or not.
> 
> Vivek,
> 
> As I said in [PATCH 0/8], yes, for the time being, I got rid of flat mode.
> Hierarchical cgroup creation has just been merged into the block tree, not
> into mainline yet, so I think it's OK. I'd like to post the "use_hierarchy"
> patchset separately once this patchset gets merged. What do you say, Vivek?

But even a single level of group creation can be either flat or
hierarchical. Think of two cgroups, test1 and test2, created under root. In
the flat and hierarchical schemes they will look different.

So I don't think we can drop flat mode for the time being. We need to include
use_hierarchy support along with this patch series.

Thanks
Vivek

> 
> Gui
> 
> > 
> > Vivek
> > 
> >> Signed-off-by: Gui Jianfeng <guijianfeng@...fujitsu.com>
> >> ---
> >>  block/cfq-iosched.c |  483 ++++++++++++++++++++++++++++++++++-----------------
> >>  1 files changed, 324 insertions(+), 159 deletions(-)
> >>
> >> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> >> index 1df0928..9def3a2 100644
> >> --- a/block/cfq-iosched.c
> >> +++ b/block/cfq-iosched.c
> >> @@ -105,6 +105,9 @@ struct io_sched_entity {
> >>  	u64 vdisktime;
> >>  	bool is_group_entity;
> >>  	unsigned int weight;
> >> +	struct io_sched_entity *parent;
> >> +	/* Reposition time */
> >> +	unsigned long reposition_time;
> >>  };
> >>  
> >>  /*
> >> @@ -113,8 +116,6 @@ struct io_sched_entity {
> >>  struct cfq_queue {
> >>  	/* The schedule entity */
> >>  	struct io_sched_entity queue_entity;
> >> -	/* Reposition time */
> >> -	unsigned long reposition_time;
> >>  	/* reference count */
> >>  	atomic_t ref;
> >>  	/* various state flags, see below */
> >> @@ -193,6 +194,9 @@ struct cfq_group {
> >>  	/* number of cfqq currently on this group */
> >>  	int nr_cfqq;
> >>  
> >> +	/* number of sub cfq groups */
> >> +	int nr_subgp;
> >> +
> >>  	/* Per group busy queus average. Useful for workload slice calc. */
> >>  	unsigned int busy_queues_avg[2];
> >>  	/*
> >> @@ -219,8 +223,6 @@ struct cfq_group {
> >>   */
> >>  struct cfq_data {
> >>  	struct request_queue *queue;
> >> -	/* Root service tree for cfq_groups */
> >> -	struct cfq_rb_root grp_service_tree;
> >>  	struct cfq_group root_group;
> >>  
> >>  	/*
> >> @@ -337,8 +339,6 @@ cfqg_of_entity(struct io_sched_entity *io_entity)
> >>  	return NULL;
> >>  }
> >>  
> >> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
> >> -
> >>  static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
> >>  					    enum wl_prio_t prio,
> >>  					    enum wl_type_t type)
> >> @@ -629,10 +629,15 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
> >>  static inline unsigned
> >>  cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >>  	struct io_sched_entity *group_entity = &cfqg->group_entity;
> >> +	struct cfq_rb_root *st = group_entity->service_tree;
> >>  
> >> -	return cfq_target_latency * group_entity->weight / st->total_weight;
> >> +	if (st)
> >> +		return cfq_target_latency * group_entity->weight
> >> +			/ st->total_weight;
> >> +	else
> >> +		/* If this is the root group, give it a full slice. */
> >> +		return cfq_target_latency;
> >>  }
> >>  
> >>  static inline void
> >> @@ -795,17 +800,6 @@ static struct io_sched_entity *cfq_rb_first(struct cfq_rb_root *root)
> >>  	return NULL;
> >>  }
> >>  
> >> -static struct io_sched_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> >> -{
> >> -	if (!root->left)
> >> -		root->left = rb_first(&root->rb);
> >> -
> >> -	if (root->left)
> >> -		return rb_entry_entity(root->left);
> >> -
> >> -	return NULL;
> >> -}
> >> -
> >>  static void rb_erase_init(struct rb_node *n, struct rb_root *root)
> >>  {
> >>  	rb_erase(n, root);
> >> @@ -887,6 +881,7 @@ io_entity_service_tree_add(struct cfq_rb_root *st,
> >>  			   struct io_sched_entity *io_entity)
> >>  {
> >>  	__io_entity_service_tree_add(st, io_entity);
> >> +	io_entity->reposition_time = jiffies;
> >>  	st->count++;
> >>  	st->total_weight += io_entity->weight;
> >>  }
> >> @@ -894,29 +889,49 @@ io_entity_service_tree_add(struct cfq_rb_root *st,
> >>  static void
> >>  cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >>  	struct rb_node *n;
> >>  	struct io_sched_entity *group_entity = &cfqg->group_entity;
> >> -	struct io_sched_entity *__group_entity;
> >> +	struct io_sched_entity *entity;
> >> +	struct cfq_rb_root *st;
> >> +	struct cfq_group *__cfqg;
> >>  
> >>  	cfqg->nr_cfqq++;
> >> +
> >> +	/*
> >> +	 * Root group doesn't belongs to any service
> >> +	 */
> >> +	if (cfqg == &cfqd->root_group)
> >> +		return;
> >> +
> >>  	if (!RB_EMPTY_NODE(&group_entity->rb_node))
> >>  		return;
> >>  
> >> -	/*
> >> -	 * Currently put the group at the end. Later implement something
> >> -	 * so that groups get lesser vtime based on their weights, so that
> >> -	 * if group does not loose all if it was not continously backlogged.
> >> +	/* 
> >> +	 * Enqueue this group and its ancestors onto their service tree.
> >>  	 */
> >> -	n = rb_last(&st->rb);
> >> -	if (n) {
> >> -		__group_entity = rb_entry_entity(n);
> >> -		group_entity->vdisktime = __group_entity->vdisktime +
> >> -					  CFQ_IDLE_DELAY;
> >> -	} else
> >> -		group_entity->vdisktime = st->min_vdisktime;
> >> +	while (group_entity && group_entity->parent) {
> >> +		if (!RB_EMPTY_NODE(&group_entity->rb_node))
> >> +			return;
> >> +		/*
> >> +		 * Currently put the group at the end. Later implement
> >> +		 * something so that groups get lesser vtime based on their
> >> +		 * weights, so that if group does not loose all if it was not
> >> +		 * continously backlogged.
> >> +		 */
> >> +		st = group_entity->service_tree;
> >> +		n = rb_last(&st->rb);
> >> +		if (n) {
> >> +			entity = rb_entry_entity(n);
> >> +			group_entity->vdisktime = entity->vdisktime +
> >> +						  CFQ_IDLE_DELAY;
> >> +		} else
> >> +			group_entity->vdisktime = st->min_vdisktime;
> >>  
> >> -	io_entity_service_tree_add(st, group_entity);
> >> +		io_entity_service_tree_add(st, group_entity);
> >> +		group_entity = group_entity->parent;
> >> +		__cfqg = cfqg_of_entity(group_entity);
> >> +		__cfqg->nr_subgp++;
> >> +	}
> >>  }
> >>  
> >>  static void
> >> @@ -933,27 +948,47 @@ io_entity_service_tree_del(struct cfq_rb_root *st,
> >>  	if (!RB_EMPTY_NODE(&io_entity->rb_node)) {
> >>  		__io_entity_service_tree_del(st, io_entity);
> >>  		st->total_weight -= io_entity->weight;
> >> -		io_entity->service_tree = NULL;
> >>  	}
> >>  }
> >>  
> >>  static void
> >>  cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >>  	struct io_sched_entity *group_entity = &cfqg->group_entity;
> >> +	struct cfq_group *__cfqg, *p_cfqg;
> >>  
> >>  	BUG_ON(cfqg->nr_cfqq < 1);
> >>  	cfqg->nr_cfqq--;
> >>  
> >> +	/*
> >> +	 * Root group doesn't belongs to any service
> >> +	 */
> >> +	if (cfqg == &cfqd->root_group)
> >> +		return;
> >> +
> >>  	/* If there are other cfq queues under this group, don't delete it */
> >>  	if (cfqg->nr_cfqq)
> >>  		return;
> >> -
> >> -	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> >> -	io_entity_service_tree_del(st, group_entity);
> >> -	cfqg->saved_workload_slice = 0;
> >> -	cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> >> +	/* If child group exists, don't dequeue it */
> >> +	if (cfqg->nr_subgp)
> >> +		return;
> >> +	
> >> +	/*
> >> +         * Dequeue this group and its ancestors from their service tree.
> >> +         */
> >> +	while (group_entity && group_entity->parent) {
> >> +		__cfqg = cfqg_of_entity(group_entity);
> >> +		p_cfqg = cfqg_of_entity(group_entity->parent);
> >> +		io_entity_service_tree_del(group_entity->service_tree,
> >> +					   group_entity);
> >> +		cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
> >> +		cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
> >> +		__cfqg->saved_workload_slice = 0;
> >> +		group_entity = group_entity->parent;
> >> +		p_cfqg->nr_subgp--;
> >> +		if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
> >> +			return;
> >> +	}
> >>  }
> >>  
> >>  static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> >> @@ -985,7 +1020,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> >>  static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> >>  				struct cfq_queue *cfqq)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >>  	unsigned int used_sl, charge;
> >>  	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
> >>  			- cfqg->service_tree_idle.count;
> >> @@ -999,10 +1033,21 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> >>  	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
> >>  		charge = cfqq->allocated_slice;
> >>  
> >> -	/* Can't update vdisktime while group is on service tree */
> >> -	__io_entity_service_tree_del(st, group_entity);
> >> -	group_entity->vdisktime += cfq_scale_slice(charge, group_entity);
> >> -	__io_entity_service_tree_add(st, group_entity);
> >> +	/*
> >> +	 * Update the vdisktime on the whole chain.
> >> +	 */
> >> +	while (group_entity && group_entity->parent) {
> >> +		struct cfq_rb_root *st = group_entity->service_tree;
> >> +
> >> +		/* Can't update vdisktime while group is on service tree */
> >> +		__io_entity_service_tree_del(st, group_entity);
> >> +		group_entity->vdisktime += cfq_scale_slice(charge,
> >> +							   group_entity);
> >> +		__io_entity_service_tree_add(st, group_entity);
> >> +		st->count++;
> >> +		group_entity->reposition_time = jiffies;
> >> +		group_entity = group_entity->parent;
> >> +	}
> >>  
> >>  	/* This group is being expired. Save the context */
> >>  	if (time_after(cfqd->workload_expires, jiffies)) {
> >> @@ -1014,7 +1059,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> >>  		cfqg->saved_workload_slice = 0;
> >>  
> >>  	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
> >> -		     group_entity->vdisktime, st->min_vdisktime);
> >> +		     cfqg->group_entity.vdisktime,
> >> +		     cfqg->group_entity.service_tree->min_vdisktime);
> >>  	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
> >>  			" sect=%u", used_sl, cfqq->slice_dispatch, charge,
> >>  			iops_mode(cfqd), cfqq->nr_sectors);
> >> @@ -1036,35 +1082,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
> >>  	cfqg_of_blkg(blkg)->group_entity.weight = weight;
> >>  }
> >>  
> >> -static struct cfq_group *
> >> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >> +static void init_group_entity(struct blkio_cgroup *blkcg,
> >> +				    struct cfq_group *cfqg)
> >> +{
> >> +	struct io_sched_entity *group_entity = &cfqg->group_entity;
> >> +
> >> +	group_entity->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> >> +	RB_CLEAR_NODE(&group_entity->rb_node);
> >> +	group_entity->is_group_entity = true;
> >> +	group_entity->parent = NULL;
> >> +}
> >> +
> >> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
> >> +		      struct cfq_group *cfqg)
> >>  {
> >> -	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> -	struct cfq_group *cfqg = NULL;
> >> -	void *key = cfqd;
> >>  	int i, j;
> >>  	struct cfq_rb_root *st;
> >> -	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>  	unsigned int major, minor;
> >> -
> >> -	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> -	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> -		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> -		cfqg->blkg.dev = MKDEV(major, minor);
> >> -		goto done;
> >> -	}
> >> -	if (cfqg || !create)
> >> -		goto done;
> >> -
> >> -	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> >> -	if (!cfqg)
> >> -		goto done;
> >> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >>  
> >>  	for_each_cfqg_st(cfqg, i, j, st)
> >>  		*st = CFQ_RB_ROOT;
> >> -	RB_CLEAR_NODE(&cfqg->group_entity.rb_node);
> >> -
> >> -	cfqg->group_entity.is_group_entity = true;
> >>  
> >>  	/*
> >>  	 * Take the initial reference that will be released on destroy
> >> @@ -1074,24 +1112,119 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >>  	 */
> >>  	atomic_set(&cfqg->ref, 1);
> >>  
> >> +	/* Add group onto cgroup list */
> >> +	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> +	cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> >> +				    MKDEV(major, minor));
> >> +	/* Initiate group entity */
> >> +	init_group_entity(blkcg, cfqg);
> >> +	/* Add group on cfqd list */
> >> +	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> >> +}
> >> +
> >> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
> >> +
> >> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> +{
> >> +	if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
> >> +		cfq_destroy_cfqg(cfqd, cfqg);
> >> +}
> >> +
> >> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
> >> +			    struct cfq_group *p_cfqg)
> >> +{
> >> +	struct io_sched_entity *group_entity, *p_group_entity;
> >> +
> >> +	group_entity = &cfqg->group_entity;
> >> +
> >> +	p_group_entity = &p_cfqg->group_entity;
> >> +
> >> +	group_entity->parent = p_group_entity;
> >> +
> >>  	/*
> >> -	 * Add group onto cgroup list. It might happen that bdi->dev is
> >> -	 * not initiliazed yet. Initialize this new group without major
> >> -	 * and minor info and this info will be filled in once a new thread
> >> -	 * comes for IO. See code above.
> >> +	 * Currently, just put cfq group entity on "BE:SYNC" workload
> >> +	 * service tree.
> >>  	 */
> >> -	if (bdi->dev) {
> >> -		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> -		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> >> -					MKDEV(major, minor));
> >> -	} else
> >> -		cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> >> -					0);
> >> +	group_entity->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
> >> +						      SYNC_WORKLOAD);
> >> +	/* child reference */
> >> +	atomic_inc(&p_cfqg->ref);
> >> +}
> >>  
> >> -	cfqg->group_entity.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> >> +int cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
> >> +{
> >> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> +	struct blkio_cgroup *p_blkcg;
> >> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >> +	unsigned int major, minor;
> >> +	struct cfq_group *cfqg, *p_cfqg;
> >> +	void *key = cfqd;
> >> +	int ret;
> >>  
> >> -	/* Add group on cfqd list */
> >> -	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> >> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> +	if (cfqg) {
> >> +		if (!cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> +			sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> +			cfqg->blkg.dev = MKDEV(major, minor);
> >> +		}
> >> +		/* chain has already been built */
> >> +		return 0;
> >> +	}
> >> +
> >> +	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> >> +	if (!cfqg)
> >> +		return -1;
> >> +
> >> +	init_cfqg(cfqd, blkcg, cfqg);
> >> +
> >> +	/* Already to the top group */
> >> +	if (!cgroup->parent)
> >> +		return 0;
> >> +
> >> +	/* Allocate CFQ groups on the chain */
> >> +	ret = cfqg_chain_alloc(cfqd, cgroup->parent);
> >> +	if (ret == -1) {
> >> +		uninit_cfqg(cfqd, cfqg);
> >> +		return -1;
> >> +	}
> >> +
> >> +	p_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
> >> +	p_cfqg = cfqg_of_blkg(blkiocg_lookup_group(p_blkcg, key));
> >> +	BUG_ON(p_cfqg == NULL);
> >> +
> >> +	cfqg_set_parent(cfqd, cfqg, p_cfqg);
> >> +	return 0;
> >> +}
> >> +
> >> +static struct cfq_group *
> >> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> >> +{
> >> +	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> >> +	struct cfq_group *cfqg = NULL;
> >> +	void *key = cfqd;
> >> +	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> >> +	unsigned int major, minor;
> >> +	int ret;
> >> +
> >> +	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> +	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> >> +		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> >> +		cfqg->blkg.dev = MKDEV(major, minor);
> >> +		goto done;
> >> +	}
> >> +	if (cfqg || !create)
> >> +		goto done;
> >> +
> >> +	/*
> >> +	 * For hierarchical cfq group scheduling, we need to allocate
> >> +	 * the whole cfq group chain.
> >> +	 */
> >> +	ret = cfqg_chain_alloc(cfqd, cgroup);
> >> +	if (!ret) {
> >> +		cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> >> +		BUG_ON(cfqg == NULL);
> >> +		goto done;
> >> +	}
> >>  
> >>  done:
> >>  	return cfqg;
> >> @@ -1136,12 +1269,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> >>  {
> >>  	struct cfq_rb_root *st;
> >>  	int i, j;
> >> +	struct io_sched_entity *group_entity;
> >> +	struct cfq_group *p_cfqg;
> >>  
> >>  	BUG_ON(atomic_read(&cfqg->ref) <= 0);
> >>  	if (!atomic_dec_and_test(&cfqg->ref))
> >>  		return;
> >>  	for_each_cfqg_st(cfqg, i, j, st)
> >>  		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
> >> +
> >> +	group_entity = &cfqg->group_entity;
> >> +	if (group_entity->parent) {
> >> +		p_cfqg = cfqg_of_entity(group_entity->parent);
> >> +		/* Drop the reference taken by children */
> >> +		atomic_dec(&p_cfqg->ref);
> >> +	}
> >> +
> >>  	kfree(cfqg);
> >>  }
> >>  
> >> @@ -1336,7 +1479,6 @@ insert:
> >>  	io_entity_service_tree_add(service_tree, queue_entity);
> >>  
> >>  	update_min_vdisktime(service_tree);
> >> -	cfqq->reposition_time = jiffies;
> >>  	if ((add_front || !new_cfqq) && !group_changed)
> >>  		return;
> >>  	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> >> @@ -1779,28 +1921,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
> >>  	return cfqq_of_entity(cfq_rb_first(service_tree));
> >>  }
> >>  
> >> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> >> +static struct io_sched_entity *
> >> +cfq_get_next_entity_forced(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_group *cfqg;
> >> -	struct io_sched_entity *queue_entity;
> >> +	struct io_sched_entity *entity;
> >>  	int i, j;
> >>  	struct cfq_rb_root *st;
> >>  
> >>  	if (!cfqd->rq_queued)
> >>  		return NULL;
> >>  
> >> -	cfqg = cfq_get_next_cfqg(cfqd);
> >> -	if (!cfqg)
> >> -		return NULL;
> >> -
> >>  	for_each_cfqg_st(cfqg, i, j, st) {
> >> -		queue_entity = cfq_rb_first(st);
> >> -		if (queue_entity != NULL)
> >> -			return cfqq_of_entity(queue_entity);
> >> +		entity = cfq_rb_first(st);
> >> +
> >> +		if (entity && !entity->is_group_entity)
> >> +			return entity;
> >> +		else if (entity && entity->is_group_entity) {
> >> +			cfqg = cfqg_of_entity(entity);
> >> +			return cfq_get_next_entity_forced(cfqd, cfqg);
> >> +		}
> >>  	}
> >>  	return NULL;
> >>  }
> >>  
> >> +
> >>  /*
> >>   * Get and set a new active queue for service.
> >>   */
> >> @@ -2155,8 +2299,7 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
> >>  static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >>  				struct cfq_group *cfqg, enum wl_prio_t prio)
> >>  {
> >> -	struct io_sched_entity *queue_entity;
> >> -	struct cfq_queue *cfqq;
> >> +	struct io_sched_entity *entity;
> >>  	unsigned long lowest_start_time;
> >>  	int i;
> >>  	bool time_valid = false;
> >> @@ -2167,12 +2310,11 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >>  	 * type. But for the time being just make use of reposition_time only.
> >>  	 */
> >>  	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
> >> -		queue_entity = cfq_rb_first(service_tree_for(cfqg, prio, i));
> >> -		cfqq = cfqq_of_entity(queue_entity);
> >> -		if (queue_entity &&
> >> +		entity = cfq_rb_first(service_tree_for(cfqg, prio, i));
> >> +		if (entity &&
> >>  		    (!time_valid ||
> >> -		     cfqq->reposition_time < lowest_start_time)) {
> >> -			lowest_start_time = cfqq->reposition_time;
> >> +		     entity->reposition_time < lowest_start_time)) {
> >> +			lowest_start_time = entity->reposition_time;
> >>  			cur_best = i;
> >>  			time_valid = true;
> >>  		}
> >> @@ -2181,47 +2323,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
> >>  	return cur_best;
> >>  }
> >>  
> >> -static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >> +static void set_workload_expire(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >>  	unsigned slice;
> >>  	unsigned count;
> >>  	struct cfq_rb_root *st;
> >>  	unsigned group_slice;
> >>  
> >> -	if (!cfqg) {
> >> -		cfqd->serving_prio = IDLE_WORKLOAD;
> >> -		cfqd->workload_expires = jiffies + 1;
> >> -		return;
> >> -	}
> >> -
> >> -	/* Choose next priority. RT > BE > IDLE */
> >> -	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
> >> -		cfqd->serving_prio = RT_WORKLOAD;
> >> -	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
> >> -		cfqd->serving_prio = BE_WORKLOAD;
> >> -	else {
> >> -		cfqd->serving_prio = IDLE_WORKLOAD;
> >> -		cfqd->workload_expires = jiffies + 1;
> >> -		return;
> >> -	}
> >> -
> >> -	/*
> >> -	 * For RT and BE, we have to choose also the type
> >> -	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
> >> -	 * expiration time
> >> -	 */
> >> -	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> >> -	count = st->count;
> >> -
> >> -	/*
> >> -	 * check workload expiration, and that we still have other queues ready
> >> -	 */
> >> -	if (count && !time_after(jiffies, cfqd->workload_expires))
> >> -		return;
> >> -
> >> -	/* otherwise select new workload type */
> >> -	cfqd->serving_type =
> >> -		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
> >>  	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> >>  	count = st->count;
> >>  
> >> @@ -2262,26 +2370,51 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  	cfqd->workload_expires = jiffies + slice;
> >>  }
> >>  
> >> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
> >> +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_rb_root *st = &cfqd->grp_service_tree;
> >> -	struct cfq_group *cfqg;
> >> -	struct io_sched_entity *group_entity;
> >> +	struct cfq_rb_root *st;
> >> +	unsigned count;
> >>  
> >> -	if (RB_EMPTY_ROOT(&st->rb))
> >> -		return NULL;
> >> -	group_entity = cfq_rb_first_entity(st);
> >> -	cfqg = cfqg_of_entity(group_entity);
> >> -	BUG_ON(!cfqg);
> >> -	update_min_vdisktime(st);
> >> -	return cfqg;
> >> +	if (!cfqg) {
> >> +		cfqd->serving_prio = IDLE_WORKLOAD;
> >> +		cfqd->workload_expires = jiffies + 1;
> >> +		return;
> >> +	}
> >> +
> >> +	/* Choose next priority. RT > BE > IDLE */
> >> +	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
> >> +		cfqd->serving_prio = RT_WORKLOAD;
> >> +	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
> >> +		cfqd->serving_prio = BE_WORKLOAD;
> >> +	else {
> >> +		cfqd->serving_prio = IDLE_WORKLOAD;
> >> +		cfqd->workload_expires = jiffies + 1;
> >> +		return;
> >> +	}
> >> +
> >> +	/*
> >> +	 * For RT and BE, we have to choose also the type
> >> +	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
> >> +	 * expiration time
> >> +	 */
> >> +	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
> >> +	count = st->count;
> >> +
> >> +	/*
> >> +	 * check workload expiration, and that we still have other queues ready
> >> +	 */
> >> +	if (count && !time_after(jiffies, cfqd->workload_expires))
> >> +		return;
> >> +
> >> +	/* otherwise select new workload type */
> >> +	cfqd->serving_type =
> >> +		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
> >>  }
> >>  
> >> -static void cfq_choose_cfqg(struct cfq_data *cfqd)
> >> +struct io_sched_entity *choose_serving_entity(struct cfq_data *cfqd,
> >> +					      struct cfq_group *cfqg)
> >>  {
> >> -	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
> >> -
> >> -	cfqd->serving_group = cfqg;
> >> +	struct cfq_rb_root *service_tree;
> >>  
> >>  	/* Restore the workload type data */
> >>  	if (cfqg->saved_workload_slice) {
> >> @@ -2292,8 +2425,21 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
> >>  		cfqd->workload_expires = jiffies - 1;
> >>  
> >>  	choose_service_tree(cfqd, cfqg);
> >> -}
> >>  
> >> +	service_tree = service_tree_for(cfqg, cfqd->serving_prio,
> >> +					cfqd->serving_type);
> >> +
> >> +	if (!cfqd->rq_queued)
> >> +		return NULL;
> >> +
> >> +	/* There is nothing to dispatch */
> >> +	if (!service_tree)
> >> +		return NULL;
> >> +	if (RB_EMPTY_ROOT(&service_tree->rb))
> >> +		return NULL;
> >> +
> >> +	return cfq_rb_first(service_tree);
> >> +}
> >>  /*
> >>   * Select a queue for service. If we have a current active queue,
> >>   * check whether to continue servicing it, or retrieve and set a new one.
> >> @@ -2301,6 +2447,8 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd)
> >>  static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
> >>  {
> >>  	struct cfq_queue *cfqq, *new_cfqq = NULL;
> >> +	struct cfq_group *cfqg;
> >> +	struct io_sched_entity *entity;
> >>  
> >>  	cfqq = cfqd->active_queue;
> >>  	if (!cfqq)
> >> @@ -2389,8 +2537,23 @@ new_queue:
> >>  	 * Current queue expired. Check if we have to switch to a new
> >>  	 * service tree
> >>  	 */
> >> -	if (!new_cfqq)
> >> -		cfq_choose_cfqg(cfqd);
> >> +	cfqg = &cfqd->root_group;
> >> +
> >> +	if (!new_cfqq) {
> >> +		do {
> >> +			entity = choose_serving_entity(cfqd, cfqg);
> >> +			if (entity && !entity->is_group_entity) {
> >> +				/* This is the CFQ queue that should run */
> >> +				new_cfqq = cfqq_of_entity(entity);
> >> +				cfqd->serving_group = cfqg;
> >> +				set_workload_expire(cfqd, cfqg);
> >> +				break;
> >> +			} else if (entity && entity->is_group_entity) {
> >> +				/* Continue to lookup in this CFQ group */
> >> +				cfqg = cfqg_of_entity(entity);
> >> +			}
> >> +		} while (entity && entity->is_group_entity);
> >> +	}
> >>  
> >>  	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
> >>  keep_queue:
> >> @@ -2421,10 +2584,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
> >>  {
> >>  	struct cfq_queue *cfqq;
> >>  	int dispatched = 0;
> >> +	struct io_sched_entity *entity;
> >> +	struct cfq_group *root = &cfqd->root_group;
> >>  
> >>  	/* Expire the timeslice of the current active queue first */
> >>  	cfq_slice_expired(cfqd, 0);
> >> -	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
> >> +	while ((entity = cfq_get_next_entity_forced(cfqd, root)) != NULL) {
> >> +		BUG_ON(entity->is_group_entity);
> >> +		cfqq = cfqq_of_entity(entity);
> >>  		__cfq_set_active_queue(cfqd, cfqq);
> >>  		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
> >>  	}
> >> @@ -3954,9 +4121,6 @@ static void *cfq_init_queue(struct request_queue *q)
> >>  
> >>  	cfqd->cic_index = i;
> >>  
> >> -	/* Init root service tree */
> >> -	cfqd->grp_service_tree = CFQ_RB_ROOT;
> >> -
> >>  	/* Init root group */
> >>  	cfqg = &cfqd->root_group;
> >>  	for_each_cfqg_st(cfqg, i, j, st)
> >> @@ -3966,6 +4130,7 @@ static void *cfq_init_queue(struct request_queue *q)
> >>  	/* Give preference to root group over other groups */
> >>  	cfqg->group_entity.weight = 2*BLKIO_WEIGHT_DEFAULT;
> >>  	cfqg->group_entity.is_group_entity = true;
> >> +	cfqg->group_entity.parent = NULL;
> >>  
> >>  #ifdef CONFIG_CFQ_GROUP_IOSCHED
> >>  	/*
> >> -- 
> >> 1.6.5.2
> >>
> >>
> > 
> 
> -- 
> Regards
> Gui Jianfeng
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/