[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <49057ADD.1050705@cn.fujitsu.com>
Date: Mon, 27 Oct 2008 16:25:01 +0800
From: Li Zefan <lizf@...fujitsu.com>
To: Balbir Singh <balbir@...ux.vnet.ibm.com>
CC: bharata@...ux.vnet.ibm.com,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
Paul Menage <menage@...gle.com>, linux-kernel@...r.kernel.org,
Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Ingo Molnar <mingo@...e.hu>
Subject: Re: [PATCH] Add hierarchical accounting to cpu accounting controller
>>>> So in technical terms this patch looks fine now. There's still the
>>>> question of whether it's OK to change the existing API, since it's
>>>> been in the kernel in its currently (non-hierarchical) form for
>>>> several releases now.
>> Hmm... Can we consider this as an API change ? Currently cpuacct.usage
>> readers of a parent accounting group are missing the usage contributions
>> from its children groups. I would consider this patch as fixing the
>> above problem by correctly reflecting the cpu usage for every accounting
>> group.
>>
>
> If a particular application desires to derive the usage of its
> immediate tasks and does not care about subcgroups, it is a simple
> iteration (after this fix)
>
> cpuacct - sigma(cpuacct_child)
>
> and currently if we cared about child accounting, we could do
>
> cpuacct + recursively(sigma(cpuacct_child))
>
> In that sense this fix makes more sense, but like Paul said we need to
> figure out if it is an API change. My take is that it is a BUG fix,
> since we do care about child subgroups in accounting.
>
cpuacct was designed to count cpu usage of a group of tasks, and now some people
want it to also take child groups' usage into account, so I think this is a feature
request rather than a bug fix.
How about adding a flag to disable/enable hierarchical accounting?
=====
From: Li Zefan <lizf@...fujitsu.com>
Date: Mon, 27 Oct 2008 16:00:21 +0800
Subject: [PATCH] cpuacct: add hierarchical accounting
Add hierarchical accounting to the cpu accounting subsystem, so the cputime
of a task is charged to its accounting group and all its parent
accounting groups.
Also add 'cpuacct.hierarchy' control file, so we can enable/disable
hierarchical accounting. The default is disabled, so we preserve the
original behavior of cpuacct.
Signed-off-by: Bharata B Rao <bharata@...ux.vnet.ibm.com>
Signed-off-by: Li Zefan <lizf@...fujitsu.com>
---
kernel/sched.c | 75 ++++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 59 insertions(+), 16 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6625c3c..1c997bd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9232,15 +9232,22 @@ struct cgroup_subsys cpu_cgroup_subsys = {
* (balbir@...ibm.com).
*/
-/* track cpu usage of a group of tasks */
+/*
+ * Track cpu usage of a group of tasks.
+ *
+ * If cpuacct_hierarchy is set, its children's usage is also accounted.
+ */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
+ struct cpuacct *parent;
};
struct cgroup_subsys cpuacct_subsys;
+static int cpuacct_hierarchy;
+
/* return cpu accounting group corresponding to this container */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
@@ -9256,8 +9263,8 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
}
/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
- struct cgroup_subsys *ss, struct cgroup *cgrp)
+static struct cgroup_subsys_state *cpuacct_create(struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
{
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -9270,12 +9277,14 @@ static struct cgroup_subsys_state *cpuacct_create(
return ERR_PTR(-ENOMEM);
}
+ if (cgrp->parent)
+ ca->parent = cgroup_ca(cgrp->parent);
+
return &ca->css;
}
/* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
@@ -9306,7 +9315,7 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
}
static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
+ u64 reset)
{
struct cpuacct *ca = cgroup_ca(cgrp);
int err = 0;
@@ -9328,17 +9337,42 @@ out:
return err;
}
-static struct cftype files[] = {
- {
- .name = "usage",
- .read_u64 = cpuusage_read,
- .write_u64 = cpuusage_write,
- },
+static u64 cpuacct_hierarchy_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return cpuacct_hierarchy;
+}
+
+static int cpuacct_hierarchy_write(struct cgroup *cgrp, struct cftype *cftype,
+ u64 val)
+{
+ cpuacct_hierarchy = !!val;
+ return 0;
+}
+
+static struct cftype cft_cpuusage = {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+};
+
+static struct cftype cft_hierarchy = {
+ .name = "hierarchy",
+ .read_u64 = cpuacct_hierarchy_read,
+ .write_u64 = cpuacct_hierarchy_write,
};
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
- return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
+ int ret;
+
+ ret = cgroup_add_file(cgrp, ss, &cft_cpuusage);
+ if (ret)
+ return ret;
+
+ if (!cgrp->parent)
+ ret = cgroup_add_file(cgrp, ss, &cft_hierarchy);
+
+ return ret;
}
/*
@@ -9349,15 +9383,24 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
+ int cpu;
if (!cpuacct_subsys.active)
return;
+ cpu = task_cpu(tsk);
ca = task_ca(tsk);
- if (ca) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
- *cpuusage += cputime;
+ if (cpuacct_hierarchy) {
+ for (; ca; ca = ca->parent) {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+ }
+ } else {
+ if (ca) {
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+ }
}
}
--
1.5.4.rc3
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists