Date:	Mon,  9 Nov 2015 16:21:23 -0800
From:	Jacob Pan <jacob.jun.pan@...ux.intel.com>
To:	Peter Zijlstra <peterz@...radead.org>,
	LKML <linux-kernel@...r.kernel.org>
Cc:	Rafael Wysocki <rafael.j.wysocki@...el.com>,
	Len Brown <len.brown@...el.com>,
	Andi Kleen <andi.kleen@...el.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Paul Turner <pjt@...gle.com>,
	Tim Chen <tim.c.chen@...ux.intel.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Eduardo Valentin <edubezval@...il.com>,
	Punit Agrawal <punit.agrawal@....com>,
	Srinivas Pandruvada <srinivas.pandruvada@...ux.intel.com>,
	Jacob Pan <jacob.jun.pan@...ux.intel.com>
Subject: [RFC PATCH v2 3/3] sched: introduce synchronized idle injection

With increasingly constrained power and thermal budgets, it is often
necessary to cap power via throttling. Throttling individual CPUs
or devices at random times can help cap power but may not be
optimal in terms of energy efficiency.

In general, the optimal solution in terms of energy efficiency is
to align idle periods such that more shared circuits can be power
gated to enter lower power states.

This patch introduces a scheduler-based idle injection method. It
works by blocking the CFS runqueue synchronously and periodically;
the actions on all online CPUs are orchestrated by per-CPU hrtimers.
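
A minimal user-space sketch of the phase alignment (illustration only;
roundup_period() stands in for the ktime_roundup() helper this series
relies on): each CPU computes its next expiry by rounding the time
elapsed since a shared start time up to the next multiple of the common
period, so timers that fire at slightly different moments still agree
on the next boundary.

  #include <stdint.h>
  #include <stdio.h>

  typedef int64_t ns_t;

  /* round delta up to the next multiple of period */
  static ns_t roundup_period(ns_t delta, ns_t period)
  {
          return ((delta + period - 1) / period) * period;
  }

  /* next expiry for one CPU, given the shared start time */
  static ns_t next_aligned_expiry(ns_t now, ns_t start,
                                  ns_t duration, ns_t interval)
  {
          return start + roundup_period(now - start, duration + interval);
  }

  int main(void)
  {
          ns_t ms = 1000000;      /* 1 ms in ns */

          /* two CPUs waking at 123 ms and 128 ms both pick 130 ms */
          printf("%lld %lld\n",
                 (long long)next_aligned_expiry(123 * ms, 100 * ms, 10 * ms, 20 * ms),
                 (long long)next_aligned_expiry(128 * ms, 100 * ms, 10 * ms, 20 * ms));
          return 0;
  }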

Two sysctl knobs are exposed to userspace for selecting the
percentage of idle time as well as the forced idle duration of each
injected idle period.
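
For illustration, this is how the two knobs combine (a sketch of the
arithmetic used by the pct handler below, with assumed example values:
the forced idle duration is fixed per period and the non-idle interval
is derived from the requested percentage):

  #include <stdio.h>

  int main(void)
  {
          unsigned int duration_ms = 10;  /* sched_cfs_idle_inject_duration */
          unsigned int idle_pct = 33;     /* sched_cfs_idle_inject_pct */
          unsigned int interval_ms = (duration_ms * (100 - idle_pct)) / idle_pct;

          /* prints: each period: 10 ms forced idle + 20 ms run (~33% idle) */
          printf("each period: %u ms forced idle + %u ms run (~%u%% idle)\n",
                 duration_ms, interval_ms, idle_pct);
          return 0;
  }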

Since only the CFS class is targeted, higher-priority activity such
as RT tasks, softirqs, and interrupts is not affected.
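
For completeness, a hypothetical user-space snippet for turning the
feature on (assuming the knobs appear under /proc/sys/kernel/, as is
usual for kern_table entries; paths and values are illustrative only):

  #include <stdio.h>

  static int write_sysctl(const char *path, const char *val)
  {
          FILE *f = fopen(path, "w");

          if (!f)
                  return -1;
          fputs(val, f);
          return fclose(f);
  }

  int main(void)
  {
          write_sysctl("/proc/sys/kernel/sched_cfs_idle_inject_duration", "10");
          write_sysctl("/proc/sys/kernel/sched_cfs_idle_inject_pct", "33");
          return 0;
  }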

Signed-off-by: Jacob Pan <jacob.jun.pan@...ux.intel.com>
---
 include/linux/sched.h        |  12 ++
 include/linux/sched/sysctl.h |   5 +
 include/trace/events/sched.h |  23 +++
 init/Kconfig                 |   8 +
 kernel/sched/fair.c          | 381 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h         |   2 +-
 kernel/sysctl.c              |  20 +++
 7 files changed, 446 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7b9501..ae8be25 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3181,3 +3181,15 @@ static inline unsigned long rlimit_max(unsigned int limit)
 }
 
 #endif
+
+#ifdef CONFIG_CFS_IDLE_INJECT
+extern int proc_sched_cfs_idle_inject_pct_handler(struct ctl_table *table,
+						int write,
+						void __user *buffer,
+						size_t *length, loff_t *ppos);
+extern int proc_sched_cfs_idle_inject_duration_handler(struct ctl_table *table,
+						int write,
+						void __user *buffer,
+						size_t *length, loff_t *ppos);
+
+#endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731..d32da45 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -81,6 +81,11 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
 
+#ifdef CONFIG_CFS_IDLE_INJECT
+extern unsigned int sysctl_sched_cfs_idle_inject_pct;
+extern unsigned int sysctl_sched_cfs_idle_inject_duration;
+#endif
+
 extern int sched_rr_timeslice;
 
 extern int sched_rr_handler(struct ctl_table *table, int write,
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 539d6bc..50bd7b6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -566,6 +566,29 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
 
 	TP_printk("cpu=%d", __entry->cpu)
 );
+
+/*
+ * Tracepoint for idle injection
+ */
+TRACE_EVENT(sched_cfs_idle_inject,
+
+	TP_PROTO(char *msg, int throttled),
+
+	TP_ARGS(msg, throttled),
+
+	TP_STRUCT__entry(
+		__string(msg, msg)
+		__field(int, throttled)
+	),
+
+	TP_fast_assign(
+		__assign_str(msg, msg);
+		__entry->throttled = throttled;
+	),
+
+	TP_printk("%s: throttled=%d", __get_str(msg), __entry->throttled)
+);
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/init/Kconfig b/init/Kconfig
index c24b6f7..1f2960a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1086,6 +1086,14 @@ menuconfig CGROUP_SCHED
 	  bandwidth allocation to such task groups. It uses cgroups to group
 	  tasks.
 
+config CFS_IDLE_INJECT
+	bool "Synchronized CFS idle injection"
+	default n
+	help
+	  This option allows users to inject synchronized idle time across all CPUs.
+	  The feature will align idle time such that the entire CPU package can be duty
+	  cycled by going into the deepest/lowest power states.
+
 if CGROUP_SCHED
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9a5e60f..aa89227 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,6 +30,7 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/suspend.h>
 
 #include <trace/events/sched.h>
 
@@ -114,6 +115,17 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
+/*
+ * Knobs for controlling the percentage of time when idle is forced across all
+ * CPUs. This is a power management feature intended to achieve the deepest
+ * and broadest idle without lowering CPU frequencies to a less optimal level.
+ * No action is taken if CPUs are naturally idle.
+ */
+#ifdef CONFIG_CFS_IDLE_INJECT
+unsigned int sysctl_sched_cfs_idle_inject_pct;
+unsigned int sysctl_sched_cfs_idle_inject_duration = 10UL;
+#endif
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
@@ -2334,7 +2346,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		list_add(&se->group_node, &rq->cfs_tasks);
 	}
 #endif
-	cfs_rq->nr_running++;
+
+	if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
+		cfs_rq->runnable = true;
 }
 
 static void
@@ -2347,7 +2361,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
 	}
-	cfs_rq->nr_running--;
+
+	if (!--cfs_rq->nr_running && !cfs_rq->forced_idle)
+		cfs_rq->runnable = false;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -5139,7 +5155,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 
 again:
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	if (!cfs_rq->nr_running)
+	if (!cfs_rq->runnable)
 		goto idle;
 
 	if (prev->sched_class != &fair_sched_class)
@@ -5218,7 +5234,7 @@ simple:
 	cfs_rq = &rq->cfs;
 #endif
 
-	if (!cfs_rq->nr_running)
+	if (!cfs_rq->runnable)
 		goto idle;
 
 	put_prev_task(rq, prev);
@@ -5237,6 +5253,16 @@ simple:
 	return p;
 
 idle:
+	if (cfs_rq->forced_idle) {
+		if (unlikely(local_softirq_pending())) {
+			trace_sched_cfs_idle_inject("softirq pending", 1);
+			cfs_rq->forced_idle = false;
+			cfs_rq->runnable = cfs_rq->nr_running;
+			goto again;
+		}
+		trace_sched_cfs_idle_inject("forced idle", 1);
+		return NULL;
+	}
 	/*
 	 * This is OK, because current is on_cpu, which avoids it being picked
 	 * for load-balance and preemption/IRQs are still disabled avoiding
@@ -8318,3 +8344,350 @@ __init void init_sched_fair_class(void)
 #endif /* SMP */
 
 }
+
+#ifdef CONFIG_CFS_IDLE_INJECT
+static atomic_t idle_inject_active;
+static DEFINE_PER_CPU(struct hrtimer, idle_inject_timer);
+static DEFINE_PER_CPU(bool, idle_injected);
+/* protect injection parameters from runtime changes */
+static DEFINE_SPINLOCK(idle_inject_lock);
+
+/* Track which CPUs are being injected with idle periods */
+static unsigned long *idle_inject_cpumask;
+
+/* Default idle injection duration for each period. */
+#define DEFAULT_DURATION_MSECS (10)
+
+static unsigned int duration; /* idle inject duration in msec. */
+static unsigned int inject_interval; /* non-idle interval between injections, in msec. */
+static unsigned int idle_pct; /* percentage of time idle is forced */
+/* starting reference time for all CPUs to align idle period */
+static ktime_t inject_start_time;
+static int prepare_idle_inject(void);
+
+static void throttle_rq(int cpu)
+{
+	unsigned int resched = 0;
+	unsigned long flags;
+	struct rq *rq = cpu_rq(cpu);
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	rq->cfs.forced_idle = true;
+	resched = rq->cfs.runnable;
+	rq->cfs.runnable = false;
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	if (resched)
+		resched_cpu(cpu);
+}
+
+static void unthrottle_rq(int cpu)
+{
+	unsigned int resched = 0;
+	unsigned long flags;
+	struct rq *rq = cpu_rq(cpu);
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	rq->cfs.forced_idle = false;
+	resched = rq->cfs.runnable = !!rq->cfs.nr_running;
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	if (resched)
+		resched_cpu(cpu);
+}
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *hrtimer)
+{
+	int cpu = smp_processor_id();
+	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
+	ktime_t now, delta, period;
+	bool status;
+
+	now = hrtimer_cb_get_time(hrt);
+
+	status = raw_cpu_read(idle_injected);
+	if (status) {
+		/*
+		 * We were injecting idle in the last phase, let's forward the
+		 * timer to the next period
+		 *
+		 * status: 1             0                1        0
+		 * ____          ____________________           _______
+		 *     |________|                    |_________|
+		 *
+		 *     |duration|      interval      |
+		 *
+		 *              ^ we are here
+		 *                  forward to here: ^
+		 */
+		delta = ktime_sub(now, inject_start_time);
+		period = ktime_add(ms_to_ktime(duration),
+				ms_to_ktime(inject_interval));
+		delta = ktime_roundup(delta, period);
+		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));
+	} else {
+		/*
+		 * We were not injecting idle in the last phase, let's forward
+		 * the timer by the forced idle duration
+		 * ____          ____________________           _______
+		 *     |________|                    |_________|
+		 *
+		 *     |duration|      interval      |
+		 *
+		 *     ^ we are here
+		 *              ^ forward timer to here
+		 */
+		hrtimer_set_expires(hrt, ktime_add(ms_to_ktime(duration), now));
+	}
+	raw_cpu_write(idle_injected, !status);
+	trace_sched_cfs_idle_inject("idle sync timer", !status);
+	if (status)
+		unthrottle_rq(cpu);
+	else
+		throttle_rq(cpu);
+
+	return HRTIMER_RESTART;
+}
+
+static void idle_inject_timer_start(void *info)
+{
+	int cpu = smp_processor_id();
+	struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
+
+	this_cpu_write(idle_injected, true);
+	set_bit(cpu, idle_inject_cpumask);
+	hrtimer_start(hrt, ms_to_ktime(duration), HRTIMER_MODE_ABS_PINNED);
+	hrtimer_set_expires(hrt, *(ktime_t *)info);
+}
+
+static int start_idle_inject(void)
+{
+	int ret;
+	ktime_t now = ktime_get();
+
+	if (!atomic_read(&idle_inject_active)) {
+		/* called once per activation of idle injection */
+		ret = prepare_idle_inject();
+		if (ret)
+			return ret;
+	}
+	/* prevent cpu hotplug */
+	get_online_cpus();
+
+	/* set a future time so that all per-CPU timers expire at the same time */
+	now = ktime_roundup(now, ms_to_ktime(duration));
+
+	/* start one timer per online cpu */
+	inject_start_time = now;
+	on_each_cpu(idle_inject_timer_start, &now, 1);
+	atomic_set(&idle_inject_active, 1);
+
+	put_online_cpus();
+
+	return 0;
+}
+
+static void stop_idle_inject(void)
+{
+	int i;
+	struct hrtimer *hrt;
+
+	if (bitmap_weight(idle_inject_cpumask, num_possible_cpus())) {
+		for_each_set_bit(i, idle_inject_cpumask, num_possible_cpus()) {
+			hrt = &per_cpu(idle_inject_timer, i);
+			hrtimer_cancel(hrt);
+			unthrottle_rq(i);
+		}
+	}
+}
+
+static int idle_inject_cpu_callback(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
+{
+	unsigned long cpu = (unsigned long)hcpu;
+	struct hrtimer *hrt = &per_cpu(idle_inject_timer, cpu);
+	ktime_t now, delta, period;
+
+	if (!atomic_read(&idle_inject_active))
+		goto exit_ok;
+
+	switch (action) {
+	case CPU_STARTING:
+		raw_cpu_write(idle_injected, true);
+
+		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+		hrt->function = idle_inject_timer_fn;
+		set_bit(cpu, idle_inject_cpumask);
+
+		now = hrtimer_cb_get_time(hrt);
+		hrtimer_start(hrt, ms_to_ktime(duration),
+			HRTIMER_MODE_ABS_PINNED);
+		/*
+		 * When a new CPU comes online, we need to make sure it aligns
+		 * its phase with the rest of the CPUs. So we set the
+		 * timer to the next period based on the common starting time,
+		 * then start injecting idle time.
+		 */
+		spin_lock_irq(&idle_inject_lock);
+		delta = ktime_sub(now, inject_start_time);
+		period = ktime_add(ms_to_ktime(duration),
+				ms_to_ktime(inject_interval));
+		delta = ktime_roundup(delta, period);
+		spin_unlock_irq(&idle_inject_lock);
+		hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));
+		break;
+	case CPU_DYING:
+		clear_bit(cpu, idle_inject_cpumask);
+		hrtimer_cancel(hrt);
+		raw_cpu_write(idle_injected, false);
+		unthrottle_rq(cpu);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+exit_ok:
+	return NOTIFY_OK;
+}
+
+static int idle_inject_pm_callback(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_SUSPEND_PREPARE:
+		if (atomic_read(&idle_inject_active))
+			stop_idle_inject();
+		break;
+	case PM_POST_HIBERNATION:
+	case PM_POST_SUSPEND:
+		pr_info("POST SUSPEND: restart idle injection\n");
+		start_idle_inject();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block idle_inject_pm_notifier = {
+	.notifier_call = idle_inject_pm_callback,
+};
+
+static struct notifier_block idle_inject_cpu_notifier = {
+	.notifier_call = idle_inject_cpu_callback,
+};
+
+static void end_idle_inject(void)
+{
+	unregister_hotcpu_notifier(&idle_inject_cpu_notifier);
+	unregister_pm_notifier(&idle_inject_pm_notifier);
+	atomic_set(&idle_inject_active, 0);
+	kfree(idle_inject_cpumask);
+}
+
+static int prepare_idle_inject(void)
+{
+	int retval = 0;
+	int bitmap_size;
+	int cpu;
+	struct hrtimer *hrt;
+
+	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
+	idle_inject_cpumask = kzalloc(bitmap_size, GFP_KERNEL);
+	if (!idle_inject_cpumask)
+		return -ENOMEM;
+
+	retval = register_pm_notifier(&idle_inject_pm_notifier);
+	if (retval)
+		goto exit_free;
+	retval = register_hotcpu_notifier(&idle_inject_cpu_notifier);
+	if (retval)
+		goto exit_unregister_pm;
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		hrt = &per_cpu(idle_inject_timer, cpu);
+		hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+		hrt->function = idle_inject_timer_fn;
+	}
+	put_online_cpus();
+
+	if (!duration)
+		duration = DEFAULT_DURATION_MSECS;
+
+	return 0;
+exit_unregister_pm:
+	unregister_pm_notifier(&idle_inject_pm_notifier);
+exit_free:
+	kfree(idle_inject_cpumask);
+	return retval;
+}
+
+int proc_sched_cfs_idle_inject_pct_handler(struct ctl_table *table,
+					int write,
+					void __user *buffer,
+					size_t *length,	loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		goto out;
+
+	if (idle_pct != sysctl_sched_cfs_idle_inject_pct) {
+		if (!idle_pct)
+			start_idle_inject();
+		else if (!sysctl_sched_cfs_idle_inject_pct) {
+			stop_idle_inject();
+			end_idle_inject();
+		}
+
+		/* recompute injection parameters */
+		spin_lock_irq(&idle_inject_lock);
+		idle_pct = sysctl_sched_cfs_idle_inject_pct;
+		/*
+		 * duration is fixed for each injection period; we adjust the
+		 * non-idle interval to satisfy the idle percentage set by
+		 * the user, e.g. if duration is 10 and we want 33% idle
+		 * then interval is 20.
+		 * 33% idle
+		 * ____          ___________________          _________
+		 *     |________|                   |________| 33% idle
+		 * ____          ________          _______
+		 *     |________|        |________|  50% idle
+		 *
+		 *     |duration|interval|
+		 */
+		if (idle_pct)
+			inject_interval = (duration * (100 - idle_pct))
+				/ idle_pct;
+
+		spin_unlock_irq(&idle_inject_lock);
+
+	}
+out:
+	return ret;
+}
+
+int proc_sched_cfs_idle_inject_duration_handler(struct ctl_table *table,
+						int write,
+						void __user *buffer,
+						size_t *length,	loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		goto out;
+
+	if (duration == sysctl_sched_cfs_idle_inject_duration)
+		goto out;
+	/* recompute injection parameters */
+	spin_lock_irq(&idle_inject_lock);
+	duration = jiffies_to_msecs(sysctl_sched_cfs_idle_inject_duration);
+	if (idle_pct)
+		inject_interval = (duration * (100 - idle_pct)) / idle_pct;
+
+	spin_unlock_irq(&idle_inject_lock);
+out:
+	return ret;
+}
+
+#endif /* CONFIG_CFS_IDLE_INJECT */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6d2a119..0301d54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -343,7 +343,7 @@ struct cfs_bandwidth { };
 struct cfs_rq {
 	struct load_weight load;
 	unsigned int nr_running, h_nr_running;
-
+	unsigned int runnable, forced_idle;
 	u64 exec_clock;
 	u64 min_vruntime;
 #ifndef CONFIG_64BIT
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d..78c304b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -433,6 +433,26 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_IDLE_INJECT
+	{
+		.procname	= "sched_cfs_idle_inject_pct",
+		.data		= &sysctl_sched_cfs_idle_inject_pct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_sched_cfs_idle_inject_pct_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.procname	= "sched_cfs_idle_inject_duration",
+		.data		= &sysctl_sched_cfs_idle_inject_duration,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_sched_cfs_idle_inject_duration_handler,
+		.extra1		= &four,
+		.extra2		= &one_hundred,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
-- 
1.9.1
