[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1246564917-19603-9-git-send-email-vgoyal@redhat.com>
Date: Thu, 2 Jul 2009 16:01:40 -0400
From: Vivek Goyal <vgoyal@...hat.com>
To: linux-kernel@...r.kernel.org,
containers@...ts.linux-foundation.org, dm-devel@...hat.com,
jens.axboe@...cle.com, nauman@...gle.com, dpshah@...gle.com,
lizf@...fujitsu.com, mikew@...gle.com, fchecconi@...il.com,
paolo.valente@...more.it, ryov@...inux.co.jp,
fernando@....ntt.co.jp, s-uchida@...jp.nec.com, taka@...inux.co.jp,
guijianfeng@...fujitsu.com, jmoyer@...hat.com,
dhaval@...ux.vnet.ibm.com, balbir@...ux.vnet.ibm.com,
righi.andrea@...il.com, m-ikeda@...jp.nec.com, jbaron@...hat.com
Cc: agk@...hat.com, snitzer@...hat.com, vgoyal@...hat.com,
akpm@...ux-foundation.org, peterz@...radead.org
Subject: [PATCH 08/25] io-controller: cgroup related changes for hierarchical group support
o This patch introduces some of the cgroup related code for io controller.
Signed-off-by: Fabio Checconi <fabio@...dalf.sssup.it>
Signed-off-by: Paolo Valente <paolo.valente@...more.it>
Signed-off-by: Nauman Rafique <nauman@...gle.com>
Signed-off-by: Gui Jianfeng <guijianfeng@...fujitsu.com>
Signed-off-by: Vivek Goyal <vgoyal@...hat.com>
---
block/blk-ioc.c | 3 +
block/elevator-fq.c | 174 +++++++++++++++++++++++++++++++++++++++++
block/elevator-fq.h | 43 ++++++++++-
include/linux/cgroup_subsys.h | 6 ++
include/linux/iocontext.h | 5 +
5 files changed, 230 insertions(+), 1 deletions(-)
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index d4ed600..0d56336 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -95,6 +95,9 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
spin_lock_init(&ret->lock);
ret->ioprio_changed = 0;
ret->ioprio = 0;
+#ifdef CONFIG_GROUP_IOSCHED
+ ret->cgroup_changed = 0;
+#endif
ret->last_waited = jiffies; /* doesn't matter... */
ret->nr_batch_requests = 0; /* because this is 0 */
ret->aic = NULL;
diff --git a/block/elevator-fq.c b/block/elevator-fq.c
index 0acfa2c..84276d5 100644
--- a/block/elevator-fq.c
+++ b/block/elevator-fq.c
@@ -29,6 +29,9 @@ static int elv_rate_sampling_window = HZ / 10;
#define ELV_SLICE_SCALE (5)
#define ELV_HW_QUEUE_MIN (5)
+#define IO_DEFAULT_GRP_WEIGHT 500
+#define IO_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
+
#define IO_SERVICE_TREE_INIT ((struct io_service_tree) \
{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
@@ -899,6 +902,177 @@ static void io_flush_idle_tree(struct io_service_tree *st)
__bfq_deactivate_entity(entity, 0);
}
+/* Mainly hierarchical grouping code */
+#ifdef CONFIG_GROUP_IOSCHED
+
+struct io_cgroup io_root_cgroup = {
+ .weight = IO_DEFAULT_GRP_WEIGHT,
+ .ioprio_class = IO_DEFAULT_GRP_CLASS,
+};
+
+static struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup)
+{
+ return container_of(cgroup_subsys_state(cgroup, io_subsys_id),
+ struct io_cgroup, css);
+}
+
+#define SHOW_FUNCTION(__VAR) \
+static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \
+ struct cftype *cftype) \
+{ \
+ struct io_cgroup *iocg; \
+ u64 ret; \
+ \
+ if (!cgroup_lock_live_group(cgroup)) \
+ return -ENODEV; \
+ \
+ iocg = cgroup_to_io_cgroup(cgroup); \
+ spin_lock_irq(&iocg->lock); \
+ ret = iocg->__VAR; \
+ spin_unlock_irq(&iocg->lock); \
+ \
+ cgroup_unlock(); \
+ \
+ return ret; \
+}
+
+SHOW_FUNCTION(weight);
+SHOW_FUNCTION(ioprio_class);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
+static int io_cgroup_##__VAR##_write(struct cgroup *cgroup, \
+ struct cftype *cftype, \
+ u64 val) \
+{ \
+ struct io_cgroup *iocg; \
+ struct io_group *iog; \
+ struct hlist_node *n; \
+ \
+ if (val < (__MIN) || val > (__MAX)) \
+ return -EINVAL; \
+ \
+ if (!cgroup_lock_live_group(cgroup)) \
+ return -ENODEV; \
+ \
+ iocg = cgroup_to_io_cgroup(cgroup); \
+ \
+ spin_lock_irq(&iocg->lock); \
+ iocg->__VAR = (unsigned long)val; \
+ hlist_for_each_entry(iog, n, &iocg->group_data, group_node) { \
+ iog->entity.new_##__VAR = (unsigned long)val; \
+ smp_wmb(); \
+ iog->entity.ioprio_changed = 1; \
+ } \
+ spin_unlock_irq(&iocg->lock); \
+ \
+ cgroup_unlock(); \
+ \
+ return 0; \
+}
+
+STORE_FUNCTION(weight, 1, WEIGHT_MAX);
+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
+#undef STORE_FUNCTION
+
+struct cftype bfqio_files[] = {
+ {
+ .name = "weight",
+ .read_u64 = io_cgroup_weight_read,
+ .write_u64 = io_cgroup_weight_write,
+ },
+ {
+ .name = "ioprio_class",
+ .read_u64 = io_cgroup_ioprio_class_read,
+ .write_u64 = io_cgroup_ioprio_class_write,
+ },
+};
+
+static int iocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+ return cgroup_add_files(cgroup, subsys, bfqio_files,
+ ARRAY_SIZE(bfqio_files));
+}
+
+static struct cgroup_subsys_state *iocg_create(struct cgroup_subsys *subsys,
+ struct cgroup *cgroup)
+{
+ struct io_cgroup *iocg;
+
+ if (cgroup->parent != NULL) {
+ iocg = kzalloc(sizeof(*iocg), GFP_KERNEL);
+ if (iocg == NULL)
+ return ERR_PTR(-ENOMEM);
+ } else
+ iocg = &io_root_cgroup;
+
+ spin_lock_init(&iocg->lock);
+ INIT_HLIST_HEAD(&iocg->group_data);
+ iocg->weight = IO_DEFAULT_GRP_WEIGHT;
+ iocg->ioprio_class = IO_DEFAULT_GRP_CLASS;
+
+ return &iocg->css;
+}
+
+/*
+ * We cannot support shared io contexts, as we have no mean to support
+ * two tasks with the same ioc in two different groups without major rework
+ * of the main cic/bfqq data structures. By now we allow a task to change
+ * its cgroup only if it's the only owner of its ioc; the drawback of this
+ * behavior is that a group containing a task that forked using CLONE_IO
+ * will not be destroyed until the tasks sharing the ioc die.
+ */
+static int iocg_can_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+ struct task_struct *tsk)
+{
+ struct io_context *ioc;
+ int ret = 0;
+
+ /* task_lock() is needed to avoid races with exit_io_context() */
+ task_lock(tsk);
+ ioc = tsk->io_context;
+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
+ /*
+ * ioc == NULL means that the task is either too young or
+ * exiting: if it has still no ioc the ioc can't be shared,
+ * if the task is exiting the attach will fail anyway, no
+ * matter what we return here.
+ */
+ ret = -EINVAL;
+ task_unlock(tsk);
+
+ return ret;
+}
+
+static void iocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+ struct cgroup *prev, struct task_struct *tsk)
+{
+ struct io_context *ioc;
+
+ task_lock(tsk);
+ ioc = tsk->io_context;
+ if (ioc != NULL)
+ ioc->cgroup_changed = 1;
+ task_unlock(tsk);
+}
+
+static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+
+ /* Implemented in later patch */
+}
+
+struct cgroup_subsys io_subsys = {
+ .name = "io",
+ .create = iocg_create,
+ .can_attach = iocg_can_attach,
+ .attach = iocg_attach,
+ .destroy = iocg_destroy,
+ .populate = iocg_populate,
+ .subsys_id = io_subsys_id,
+ .use_id = 1,
+};
+#endif /* GROUP_IOSCHED */
/* Elevator fair queuing function */
static inline struct io_queue *elv_active_ioq(struct elevator_queue *e)
{
diff --git a/block/elevator-fq.h b/block/elevator-fq.h
index 57207c4..d9acb75 100644
--- a/block/elevator-fq.h
+++ b/block/elevator-fq.h
@@ -13,11 +13,13 @@
*/
#include <linux/blkdev.h>
+#include <linux/cgroup.h>
#ifndef _BFQ_SCHED_H
#define _BFQ_SCHED_H
#define IO_IOPRIO_CLASSES 3
+#define WEIGHT_MAX 1000
struct io_entity;
struct io_queue;
@@ -88,7 +90,7 @@ struct io_sched_data {
* this entity; used for O(log N) lookups into active trees.
* @service: service received during the last round of service.
* @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
- * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
+ * @weight: the weight in use.
* @new_weight: when a weight change is requested, the new weight value
* @parent: parent entity, for hierarchical scheduling.
* @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
@@ -181,8 +183,10 @@ struct io_queue {
void *sched_queue;
};
+#ifdef CONFIG_GROUP_IOSCHED
struct io_group {
struct io_entity entity;
+ struct hlist_node group_node;
struct io_sched_data sched_data;
struct io_entity *my_entity;
@@ -199,8 +203,45 @@ struct io_group {
* non-RT cfqq in service when this value is non-zero.
*/
unsigned int busy_rt_queues;
+ unsigned short iocg_id;
};
+/**
+ * struct io_cgroup - io cgroup data structure.
+ * @css: subsystem state for io in the containing cgroup.
+ * @weight: cgroup weight.
+ * @ioprio_class: cgroup ioprio_class.
+ * @lock: spinlock that protects @weight, @ioprio_class and @group_data.
+ * @group_data: list containing the io_group belonging to this cgroup.
+ *
+ * @group_data is accessed using RCU, with @lock protecting the updates,
+ * @weight and @ioprio_class are protected by @lock.
+ */
+struct io_cgroup {
+ struct cgroup_subsys_state css;
+
+ unsigned int weight;
+ unsigned short ioprio_class;
+
+ spinlock_t lock;
+ struct hlist_head group_data;
+};
+#else
+struct io_group {
+ struct io_sched_data sched_data;
+
+ /* async_queue and idle_queue are used only for cfq */
+ struct io_queue *async_queue[2][IOPRIO_BE_NR];
+ struct io_queue *async_idle_queue;
+
+ /*
+ * Used to track any pending rt requests so we can pre-empt current
+ * non-RT cfqq in service when this value is non-zero.
+ */
+ unsigned int busy_rt_queues;
+};
+#endif /* CONFIG_GROUP_IOSCHED */
+
struct elv_fq_data {
struct io_group *root_group;
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..baf544f 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -60,3 +60,9 @@ SUBSYS(net_cls)
#endif
/* */
+
+#ifdef CONFIG_GROUP_IOSCHED
+SUBSYS(io)
+#endif
+
+/* */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 1482b20..ccecf53 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -68,6 +68,11 @@ struct io_context {
unsigned short ioprio;
unsigned short ioprio_changed;
+#ifdef CONFIG_GROUP_IOSCHED
+ /* If task changes the cgroup, elevator processes it asynchronously */
+ unsigned short cgroup_changed;
+#endif
+
/*
* For request batching
*/
--
1.6.0.6
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists