linux-kernel - [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re: [PATCH] cpuacct: add a branch prediction

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Fri, 27 Feb 2009 12:22:39 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	paulmck@...ux.vnet.ibm.com
Cc:	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Bharata B Rao <bharata.rao@...il.com>,
	Li Zefan <lizf@...fujitsu.com>, Ingo Molnar <mingo@...e.hu>,
	Paul Menage <menage@...gle.com>,
	Balbir Singh <balbir@...ux.vnet.ibm.com>,
	LKML <linux-kernel@...r.kernel.org>
Subject: [RFC][PATCH] remove rq->lock from cpuacct cgroup (Was Re: [PATCH]
 cpuacct: add a branch prediction

On Thu, 26 Feb 2009 17:29:15 -0800
"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com> wrote:
> > Can't we use seq_counter in include/linux/seqlock.h ?
> > There is only one writer and we don't need write-side lock.
> 
> Yes, seqlock should work fine, good point!
> 

This is a trial version. (It may apply with hunk offsets against the latest mmotm.)
Does this seem like overkill?

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>

The cgroup/cpuacct subsystem counts cpu usage with a 64-bit counter in a
per-cpu object. On the read side (via the cpuacct.usage file), to read the
64-bit value safely, it takes the rq->lock of (other) cpus.

In general, taking the rq->lock of other cpus from non-scheduler code
is not good. This patch tries to remove the rq->lock use on the read side.

To read the 64-bit value safely, this patch uses a seqcount.

Pros.
  - rq->lock is not necessary.
Cons.
  - When updating counter, sequence number must be updated.
    (I hope this per-cpu sequence number is on cache...)
  - not simple.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
---
 kernel/sched.c |  141 ++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 105 insertions(+), 36 deletions(-)

Index: mmotm-2.6.29-Feb24/kernel/sched.c
===================================================================
--- mmotm-2.6.29-Feb24.orig/kernel/sched.c
+++ mmotm-2.6.29-Feb24/kernel/sched.c
@@ -9581,6 +9581,67 @@ struct cgroup_subsys cpu_cgroup_subsys =
 
 #ifdef CONFIG_CGROUP_CPUACCT
 
+#ifndef CONFIG_64BIT
+DEFINE_PER_CPU(struct seqcount, cpuacct_cgroup_seq);
+
+static inline void cpuacct_start_counter_update(void)
+{
+	/* This is called under rq->lock and IRQ is off */
+	struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
+
+	write_seqcount_begin(s);
+	put_cpu_var(cpuacct_cgroup_seq);
+}
+
+static inline void cpuacct_end_counter_update(void)
+{
+	struct seqcount *s = &get_cpu_var(cpuacct_cgroup_seq);
+
+	write_seqcount_end(s);
+	put_cpu_var(cpuacct_cgroup_seq);
+}
+
+static inline u64
+cpuacct_read_counter(u64 *val, int cpu)
+{
+	struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
+	unsigned int seq;
+	u64 data;
+
+	do {
+		seq = read_seqcount_begin(s);
+		data = *val;
+	} while (read_seqcount_retry(s, seq));
+	return data;
+}
+/* This is a special function called against "offline" cpus. */
+static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
+{
+	struct seqcount *s = &per_cpu(cpuacct_cgroup_seq, cpu);
+
+	preempt_disable();
+	write_seqcount_begin(s);
+	*val = 0;
+	write_seqcount_end(s);
+	preempt_enable();
+}
+#else
+static inline void cpuacct_start_counter_update(void)
+{
+}
+static inline void cpuacct_end_counter_update(void)
+{
+}
+static inline u64 cpuacct_read_counter(u64 *val, int cpu)
+{
+	return *val;
+}
+static inline void cpuacct_reset_offline_counter(u64 *val, int cpu)
+{
+	*val = 0;
+}
+#endif
+
 /*
  * CPU accounting code for task groups.
  *
@@ -9596,6 +9657,11 @@ struct cpuacct {
 	struct cpuacct *parent;
 };
 
+struct cpuacct_work {
+	struct work_struct work;
+	struct cpuacct *cpuacct;
+};
+
 struct cgroup_subsys cpuacct_subsys;
 
 /* return cpu accounting group corresponding to this container */
@@ -9643,39 +9709,29 @@ cpuacct_destroy(struct cgroup_subsys *ss
 	kfree(ca);
 }
 
+/* In a 32-bit environment, a seqcount is used to read the 64-bit value safely */
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 {
 	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 	u64 data;
 
-#ifndef CONFIG_64BIT
-	/*
-	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
-	 */
-	spin_lock_irq(&cpu_rq(cpu)->lock);
-	data = *cpuusage;
-	spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	data = *cpuusage;
-#endif
+	data = cpuacct_read_counter(cpuusage, cpu);
 
 	return data;
 }
 
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+/* called by per-cpu workqueue */
+static void cpuacct_cpuusage_reset_cpu(struct work_struct *work)
 {
+	struct cpuacct_work *cw = container_of(work, struct cpuacct_work, work);
+	struct cpuacct *ca = cw->cpuacct;
+	int cpu = get_cpu();
 	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 
-#ifndef CONFIG_64BIT
-	/*
-	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
-	 */
-	spin_lock_irq(&cpu_rq(cpu)->lock);
-	*cpuusage = val;
-	spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	*cpuusage = val;
-#endif
+	cpuacct_start_counter_update();
+	*cpuusage = 0;
+	cpuacct_end_counter_update();
+	put_cpu();
 }
 
 /* return total cpu usage (in nanoseconds) of a group */
@@ -9691,23 +9747,34 @@ static u64 cpuusage_read(struct cgroup *
 	return totalcpuusage;
 }
 
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
-								u64 reset)
+static int cpuacct_cpuusage_reset(struct cgroup *cgrp, unsigned int event)
 {
 	struct cpuacct *ca = cgroup_ca(cgrp);
-	int err = 0;
-	int i;
-
-	if (reset) {
-		err = -EINVAL;
-		goto out;
+	int cpu;
+	/*
+	 * Reset All counters....doesn't need to be fast.
+	 * "ca" will be stable while doing this. We are in write() syscall.
+	 */
+	get_online_cpus();
+	/*
+	 * Because we use alloc_percpu() for allocating counter, we have
+	 * a counter per a possible cpu. Reset all online's by workqueue and
+	 * reset offline cpu's directly.
+	 */
+	for_each_possible_cpu(cpu) {
+		if (cpu_online(cpu)) {
+			struct cpuacct_work cw;
+			INIT_WORK(&cw.work, cpuacct_cpuusage_reset_cpu);
+			cw.cpuacct = ca;
+			schedule_work_on(cpu, &cw.work);
+			flush_work(&cw.work);
+		} else {
+			u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+			cpuacct_reset_offline_counter(cpuusage, cpu);
+		}
 	}
-
-	for_each_present_cpu(i)
-		cpuacct_cpuusage_write(ca, i, 0);
-
-out:
-	return err;
+	put_online_cpus();
+	return 0;
 }
 
 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
@@ -9729,7 +9796,7 @@ static struct cftype files[] = {
 	{
 		.name = "usage",
 		.read_u64 = cpuusage_read,
-		.write_u64 = cpuusage_write,
+		.trigger = cpuacct_cpuusage_reset,
 	},
 	{
 		.name = "usage_percpu",
@@ -9756,10 +9823,12 @@ static void cpuacct_charge(struct task_s
 	cpu = task_cpu(tsk);
 	ca = task_ca(tsk);
 
+	cpuacct_start_counter_update();
 	for (; ca; ca = ca->parent) {
 		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
+	cpuacct_end_counter_update();
 }
 
 struct cgroup_subsys cpuacct_subsys = {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/