Date:	Tue,  9 Jul 2013 16:55:30 +0100
From:	Morten Rasmussen <morten.rasmussen@....com>
To:	mingo@...nel.org, peterz@...radead.org
Cc:	arjan@...ux.intel.com, vincent.guittot@...aro.org,
	preeti@...ux.vnet.ibm.com, alex.shi@...el.com, efault@....de,
	pjt@...gle.com, len.brown@...el.com, corbet@....net,
	akpm@...ux-foundation.org, torvalds@...ux-foundation.org,
	tglx@...utronix.de, catalin.marinas@....com,
	linux-kernel@...r.kernel.org, linaro-kernel@...ts.linaro.org,
	morten.rasmussen@....com
Subject: [RFC][PATCH 1/9] sched: Introduce power scheduler

Proof-of-concept capacity-managing power scheduler. It supports simple
task packing without any consideration of the power topology. The power
scheduler is meant to use a platform-specific power driver to obtain
information about the power topology and to select idle states and
frequency/P-states.
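
As a very rough illustration (not part of this patch, and not necessarily
how the eventual interface will look), the power driver hook could be
imagined as something like the following; all names here are made up:

	/* Illustrative sketch only - names made up, not part of this patch */
	struct power_driver {
		/* current deliverable compute capacity of cpu */
		unsigned long (*get_capacity)(int cpu);
		/* request a frequency/P-state giving roughly this capacity */
		void (*set_capacity)(int cpu, unsigned long capacity);
		/* hint that cpu may enter a deeper idle state */
		void (*request_idle)(int cpu);
	};

The power scheduler would then consult get_capacity() when packing and use
set_capacity()/request_idle() to act on its decisions.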

For now, the power scheduler is called periodically on cpu0. This will be
replaced by calls from the scheduler in the future. Thresholds and other
defined constants will be made configurable, possibly set by the power
driver, in the future. Iterations over all cpus will also be optimized to
ensure scalability.
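
For illustration, assuming an arch cpu power of SCHED_POWER_SCALE (1024)
on all cpus and the CPU_FULL threshold of 90% defined below, the packing
heuristic in calculate_cpu_capacities() behaves roughly as follows for a
made-up three-cpu load picture:

	cpu0: load 200, 1 task  -> t_cap = 1024 - 200 = 824
	      spare_cap (0) + 824 <= 1024: keep full capacity, spare_cap = 824
	cpu1: load 150, 1 task  -> t_cap = 1024 - 150 = 874
	      spare_cap (824) + 874 > 1024: report capacity 1 for cpu1
	cpu2: load 980, 2 tasks -> t_cap = 44 < 1024 * (100 - 90) / 100 = 102
	      -> t_cap = -(980 / 2) = -490 (load needs spreading)
	      spare_cap (824) - 490 <= 1024: keep full capacity, spare_cap = 334

cpu1 ends up with a reported capacity of 1, so the load balancer avoids it
and it can stay idle, while cpu0 and cpu2 keep their full capacity.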

Signed-off-by: Morten Rasmussen <morten.rasmussen@....com>
CC: Ingo Molnar <mingo@...nel.org>
CC: Peter Zijlstra <peterz@...radead.org>
CC: Catalin Marinas <catalin.marinas@....com>
---
 arch/arm/Kconfig      |    2 +
 kernel/Kconfig.power  |    3 +
 kernel/sched/Makefile |    1 +
 kernel/sched/power.c  |  161 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 167 insertions(+)
 create mode 100644 kernel/Kconfig.power
 create mode 100644 kernel/sched/power.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2651b1d..04076ab 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1805,6 +1805,8 @@ config XEN
 	help
 	  Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
 
+source "kernel/Kconfig.power"
+
 endmenu
 
 menu "Boot options"
diff --git a/kernel/Kconfig.power b/kernel/Kconfig.power
new file mode 100644
index 0000000..4fdaa13
--- /dev/null
+++ b/kernel/Kconfig.power
@@ -0,0 +1,3 @@
+config SCHED_POWER
+	bool "(EXPERIMENTAL) Power scheduler"
+	default n
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e..67b01b2 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_SCHED_POWER) += power.o
diff --git a/kernel/sched/power.c b/kernel/sched/power.c
new file mode 100644
index 0000000..ddf249f
--- /dev/null
+++ b/kernel/sched/power.c
@@ -0,0 +1,161 @@
+/*
+ * kernel/sched/power.c
+ *
+ * Copyright (C) 2013 ARM Limited.
+ * Author: Morten Rasmussen <morten.rasmussen@....com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+#include "sched.h"
+
+#define INTERVAL 5 /* ms */
+#define CPU_FULL 90 /* Busy percentage - TODO: Make tunable */
+
+struct cpu_stats_struct {
+	int load;
+	int nr_tasks;
+};
+
+static unsigned long power_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_power;
+}
+
+DEFINE_PER_CPU(struct cpu_stats_struct, cpu_stats);
+
+/*
+ * update_cpu_load() fetches runqueue statistics from the scheduler and
+ * should only be called with the appropriate locks held.
+ */
+static void update_cpu_load(void)
+{
+	int i;
+
+	for_each_online_cpu(i) {
+		struct rq *rq = cpu_rq(i);
+		int load = 0;
+		u32 sum = rq->avg.runnable_avg_sum;
+		u32 period = rq->avg.runnable_avg_period;
+
+		load = (sum * power_of(i)) / (period+1);
+		per_cpu(cpu_stats, i).load = load;
+		per_cpu(cpu_stats, i).nr_tasks = rq->nr_running;
+
+		/* Take power scheduler kthread into account */
+		if (smp_processor_id() == i)
+			per_cpu(cpu_stats, i).nr_tasks--;
+	}
+}
+
+extern unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu);
+DEFINE_PER_CPU(unsigned long, arch_cpu_power);
+
+static void get_arch_cpu_power(void)
+{
+	int i;
+
+	if (sched_feat(ARCH_POWER)) {
+		for_each_online_cpu(i)
+			per_cpu(arch_cpu_power, i) =
+				arch_scale_freq_power(cpu_rq(i)->sd, i);
+	} else {
+		for_each_online_cpu(i)
+			per_cpu(arch_cpu_power, i) = SCHED_POWER_SCALE;
+	}
+}
+
+DEFINE_PER_CPU(unsigned long, cpu_power);
+
+/*
+ * power_sched_cpu_power is called from fair.c to get the power scheduler
+ * cpu capacities. We can't use arch_scale_freq_power() as this may already
+ * be defined by the platform.
+ */
+unsigned long power_sched_cpu_power(struct sched_domain *sd, int cpu)
+{
+	return per_cpu(cpu_power, cpu);
+}
+
+/*
+ * calculate_cpu_capacities() figures out how many cpus are necessary to
+ * handle the current load. The current algorithm is very simple: it does
+ * not take the power topology into account and does not scale the cpu
+ * capacity - a cpu is either on or off. Plenty of potential for improvements!
+ */
+static void calculate_cpu_capacities(void)
+{
+	int i, spare_cap = 0;
+	struct cpu_stats_struct *stats;
+
+	/*
+	 * spare_cap keeps track of the total available capacity across
+	 * all cpus
+	 */
+
+	for_each_online_cpu(i) {
+		int t_cap = 0;
+		int arch_power = per_cpu(arch_cpu_power, i);
+
+		stats = &per_cpu(cpu_stats, i);
+		t_cap = arch_power - stats->load;
+
+		if (t_cap < (arch_power * (100-CPU_FULL)) / 100) {
+			/* Potential for spreading load */
+			if (stats->nr_tasks > 1)
+				t_cap = -(stats->load / stats->nr_tasks);
+		}
+
+		/* Do we have enough capacity already? */
+		if (spare_cap + t_cap > arch_power) {
+			per_cpu(cpu_power, i) = 1;
+		} else {
+			per_cpu(cpu_power, i) = arch_power;
+			spare_cap += t_cap;
+		}
+	}
+}
+
+static void __power_schedule(void)
+{
+	rcu_read_lock();
+
+	get_arch_cpu_power();
+	update_cpu_load();
+	calculate_cpu_capacities();
+
+	rcu_read_unlock();
+}
+
+static struct delayed_work dwork;
+
+/* Periodic power schedule target cpu */
+static int schedule_cpu(void)
+{
+	return 0;
+}
+
+static void power_schedule_wq(struct work_struct *work)
+{
+	__power_schedule();
+	mod_delayed_work_on(schedule_cpu(), system_wq, &dwork,
+				msecs_to_jiffies(INTERVAL));
+}
+
+static int __init sched_power_init(void)
+{
+	INIT_DELAYED_WORK(&dwork, power_schedule_wq);
+	mod_delayed_work_on(schedule_cpu(), system_wq, &dwork,
+				msecs_to_jiffies(INTERVAL));
+	return 0;
+}
+late_initcall(sched_power_init);
-- 
1.7.9.5

