lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Fri, 12 May 2017 11:04:26 -0700
From:   Rohit Jain <rohit.k.jain@...cle.com>
To:     linux-kernel@...r.kernel.org
Cc:     peterz@...radead.org, mingo@...hat.com
Subject: [PATCH 1/2] sched: Interrupt Aware Scheduler

The patch avoids CPUs which might be considered interrupt-heavy when
trying to schedule threads (on the push side) in the system. Interrupt
Awareness has only been added into the fair scheduling class.

It does so by using the following algorithm:
--------------------------------------------------------------------------
1) When the interrupt is getting processed, the start and the end times
are noted for the interrupt on a per-cpu basis.

2) On a periodic basis the interrupt load is processed for each run
queue and this is mapped in terms of percentage in a global array. The
interrupt load for a given CPU is also decayed over time, so that the
most recent interrupt load has the biggest contribution in the interrupt
load calculations. This would mean the scheduler will try to avoid CPUs
(if it can) when scheduling threads which have been recently busy with
handling hardware interrupts.

3) Any CPU which lies above the 80th percentile in terms of percentage
interrupt load is considered interrupt-heavy.

4) During idle CPU search from the scheduler perspective, this
information is used to skip CPUs if better ones are available.

5) If none of the CPUs are better in terms of idleness and interrupt
load, then the interrupt-heavy CPU is considered to be the best
available CPU.
---------------------------------------------------------------------------


The performance numbers:
---------------------------------------------------------------------------
IAS shows roughly a 3% improvement on x86 when running an OLTP select
workload.

The (normalized) execs/second for the database workload are shown below
(higher is better).

+----------+------------+------------+
|Number of |BaseLine	|IAS         |
|Session(s)|Execs/second|Execs/second|
+----------+------------+------------+
| 1	   |1.00	|1.00        |
| 4	   |1.00	|1.00        |
| 16	   |1.00	|1.00        |
| 32	   |1.00        |1.01        |
| 64	   |1.00        |1.00        |
| 128	   |1.00        |1.02        |
| 256	   |1.00        |1.05        |
| 512	   |1.00        |1.03        |
+----------+------------+------------+

For microbenchmarks, I used barrier.c (OpenMP code). It does a number
of iterations with a barrier sync at the end of each for loop. Barrier
is run with a number of threads equal to the number of CPUs minus 2
(1 CPU is dedicated to running ping, and another becomes interrupt
heavy); we see that the baseline clearly has a lot of variation. I ran
this on 40-CPU hardware with 38 threads.

I was also running ping on CPU 0 as:
'ping -l 10000 -q -s 10 -f host2'

This program's iterations/second (mean) improves. Notably, the standard
deviation of the number of iterations per second goes down, which means
the noise due to interrupts is reduced.

Following are the results (higher is better).

+-------+----------------+----------------+------------------+
|Threads|IAS             |Baseline        |Baseline without  |
|       |with ping       |with ping       |ping              |
+-------+-------+--------+-------+--------+-------+----------+
|       |Mean   |Std. Dev|Mean   |Std. Dev|Mean   |Std. Dev  |
+-------+-------+--------+-------+--------+-------+----------+
|1	| 504.5 | 20.6   | 497.3 | 26.4   | 510.4 | 5.8      |
|2	| 481.0 | 29.1   | 485.1 | 28.7   | 509.9 | 5.7      |
|4	| 447.4 | 6.9    | 451.6 | 8.2    | 488.7 | 9.5      |
|8	| 420.0 | 7.4    | 418.5 | 9.8    | 447.6 | 8.9      |
|16	| 360.3 | 43.0   | 358.5 | 42.1   | 374.3 | 45.7     |
|32	| 274.4 | 5.9    | 269.0 | 6.2    | 274.7 | 4.9      |
|38	| 254.5 | 4.6    | 254.6 | 5.9    | 275.3 | 3.8      |
+-------+-------+--------+-------+--------+-------+----------+

Signed-off-by: Rohit Jain <rohit.k.jain@...cle.com>
---
 kernel/sched/core.c    | 42 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/cputime.c |  6 ++++--
 kernel/sched/fair.c    | 52 ++++++++++++++++++++++++++++++++++++++------------
 kernel/sched/loadavg.c | 40 ++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h   | 37 ++++++++++++++++++++++++++++++++++-
 5 files changed, 162 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759f4bd..c46e398 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -40,6 +40,38 @@
 #include <trace/events/sched.h>
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_PER_CPU(u64, cpu_intrlast);
+
+void update_rq_intrload(struct rq *rq)
+{
+	u64 intrused, intrstat;
+
+	unsigned int load;
+	int change;
+
+	intrstat = __this_cpu_read(cpu_intrstat);
+	intrused = intrstat - __this_cpu_read(cpu_intrlast);
+	__this_cpu_write(cpu_intrlast, intrstat);
+
+	if (intrused >= TICK_NSEC)
+		intrused = TICK_NSEC - 1;
+	/*
+	 * Actually, need to divide by NSEC_PER_SEC. Instead, right shift by 30,
+	 * 2^30 is close enough to 10^9. Lose some precision, gain performance.
+	 */
+	load = (100*HZ*intrused)>>30;
+
+	dec_intr_buckets(rq->intrload);
+	change = rq->intrload - load;
+	if (change < 0)
+		rq->intrload = load;
+	else if (change > 0)
+		rq->intrload -= (change + 3)/4;
+
+	inc_intr_buckets(rq->intrload);
+}
+#endif
 
 /*
  * Debugging: various feature bits
@@ -3101,6 +3133,7 @@ void scheduler_tick(void)
 	rq_lock(rq, &rf);
 
 	update_rq_clock(rq);
+	update_rq_intrload(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	cpu_load_update_active(rq);
 	calc_global_load_tick(rq);
@@ -5717,6 +5750,10 @@ void set_rq_online(struct rq *rq)
 			if (class->rq_online)
 				class->rq_online(rq);
 		}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+		rq->intrload = 0;
+		inc_intr_buckets(rq->intrload);
+#endif
 	}
 }
 
@@ -5731,6 +5768,9 @@ void set_rq_offline(struct rq *rq)
 		}
 
 		cpumask_clear_cpu(rq->cpu, rq->rd->online);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+		dec_intr_buckets(rq->intrload);
+#endif
 		rq->online = 0;
 	}
 }
@@ -6184,6 +6224,8 @@ void __init sched_init(void)
 	init_sched_fair_class();
 
 	init_schedstats();
+	init_intr_buckets();
+	init_intr_threshold();
 
 	scheduler_running = 1;
 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aea3135..49a07d6 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -21,6 +21,7 @@
  * compromise in place of having locks on each irq in account_system_time.
  */
 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
+DEFINE_PER_CPU(u64, cpu_intrstat);
 
 static int sched_clock_irqtime;
 
@@ -69,9 +70,10 @@ void irqtime_account_irq(struct task_struct *curr)
 	 * in that case, so as not to confuse scheduler with a special task
 	 * that do not consume any time, but still wants to run.
 	 */
-	if (hardirq_count())
+	if (hardirq_count()) {
 		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
-	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+		__this_cpu_add(cpu_intrstat, delta);
+	} else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d711093..0601c1e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5598,6 +5598,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	u64 latest_idle_timestamp = 0;
 	int least_loaded_cpu = this_cpu;
 	int shallowest_idle_cpu = -1;
+	int shallowest_idle_cpu_backup = -1;
 	int i;
 
 	/* Check if we have any choice: */
@@ -5614,10 +5615,16 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 				 * We give priority to a CPU whose idle state
 				 * has the smallest exit latency irrespective
 				 * of any idle timestamp.
+				 *
+				 * Furthermore, we are aware of the interrupt
+				 * load on the CPU.
 				 */
 				min_exit_latency = idle->exit_latency;
 				latest_idle_timestamp = rq->idle_stamp;
-				shallowest_idle_cpu = i;
+				if (!INTRLOAD_HIGH(rq))
+					shallowest_idle_cpu = i;
+				else
+					shallowest_idle_cpu_backup = i;
 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
 				   rq->idle_stamp > latest_idle_timestamp) {
 				/*
@@ -5637,7 +5644,12 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 		}
 	}
 
-	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+	if (shallowest_idle_cpu != -1)
+		return shallowest_idle_cpu;
+	else if (shallowest_idle_cpu_backup != -1)
+		return shallowest_idle_cpu_backup;
+
+	return least_loaded_cpu;
 }
 
 /*
@@ -5748,15 +5760,18 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 	for_each_cpu_wrap(core, cpus, target, wrap) {
 		bool idle = true;
+		int rcpu = -1;
 
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
 			cpumask_clear_cpu(cpu, cpus);
 			if (!idle_cpu(cpu))
 				idle = false;
+			if (!INTRLOAD_HIGH(cpu_rq(cpu)))
+				rcpu = cpu;
 		}
 
 		if (idle)
-			return core;
+			return (rcpu == -1 ? core : rcpu);
 	}
 
 	/*
@@ -5772,7 +5787,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
  */
 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	int cpu;
+	int cpu, backup_cpu = -1;
 
 	if (!static_branch_likely(&sched_smt_present))
 		return -1;
@@ -5780,11 +5795,15 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
-			return cpu;
+		if (idle_cpu(cpu)) {
+			if (!INTRLOAD_HIGH(cpu_rq(cpu)))
+				return cpu;
+			else
+				backup_cpu = cpu;
+		}
 	}
 
-	return -1;
+	return backup_cpu;
 }
 
 #else /* CONFIG_SCHED_SMT */
@@ -5812,7 +5831,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	u64 avg_cost, avg_idle = this_rq()->avg_idle;
 	u64 time, cost;
 	s64 delta;
-	int cpu, wrap;
+	int cpu, wrap, backup_cpu = -1;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
@@ -5832,10 +5851,18 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
-			break;
+		if (idle_cpu(cpu)) {
+			if (INTRLOAD_HIGH(cpu_rq(cpu))) {
+				backup_cpu = cpu;
+			} else {
+				backup_cpu = -1;
+				break;
+			}
+		}
 	}
 
+	if (backup_cpu >= 0)
+		cpu = backup_cpu;
 	time = local_clock() - time;
 	cost = this_sd->avg_scan_cost;
 	delta = (s64)(time - cost) / 8;
@@ -5852,13 +5879,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i;
 
-	if (idle_cpu(target))
+	if (idle_cpu(target) && !INTRLOAD_HIGH(cpu_rq(target)))
 		return target;
 
 	/*
 	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)
+	    && !INTRLOAD_HIGH(cpu_rq(prev)))
 		return prev;
 
 	sd = rcu_dereference(per_cpu(sd_llc, target));
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index f15fb2b..5ba9356 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -61,6 +61,10 @@
 /* Variables and functions for calc_load */
 atomic_long_t calc_load_tasks;
 unsigned long calc_load_update;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+atomic_t intr_buckets[(INTR_PRECISION/INTR_BUCKET_SZ)];
+unsigned int intr_threshold;
+#endif
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun); /* should be removed */
 
@@ -346,6 +350,41 @@ static inline void calc_global_nohz(void) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void init_intr_buckets(void)
+{
+	int i;
+
+	atomic_set(intr_buckets, num_online_cpus());
+	for (i = 1; i < (INTR_PRECISION/INTR_BUCKET_SZ); i++)
+		atomic_set(intr_buckets+i, 0);
+}
+
+void dec_intr_buckets(unsigned int intrload)
+{
+	atomic_dec_if_positive(intr_buckets+(intrload/INTR_BUCKET_SZ));
+}
+
+void inc_intr_buckets(unsigned int intrload)
+{
+	atomic_inc(intr_buckets+(intrload/INTR_BUCKET_SZ));
+}
+
+void update_intr_load_threshold(void)
+{
+	unsigned int count_cpus = 0, bucket_count = 0;
+
+	while ((count_cpus <=
+	       ((num_online_cpus()*INTR_THRS_PCT)/INTR_PRECISION)) &&
+		(bucket_count < (INTR_PRECISION/INTR_BUCKET_SZ))) {
+		count_cpus += atomic_read(intr_buckets+bucket_count);
+		++bucket_count;
+	}
+
+	intr_threshold = (bucket_count*INTR_BUCKET_SZ);
+}
+#endif
+
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
@@ -381,6 +420,7 @@ void calc_global_load(unsigned long ticks)
 	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
 	 */
 	calc_global_nohz();
+	update_intr_load_threshold();
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7808ab0..29087ba 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -44,6 +44,15 @@
 #define SCHED_WARN_ON(x)	((void)(x))
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DECLARE_PER_CPU(u64, cpu_intrstat);
+DECLARE_PER_CPU(u64, cpu_intrlast);
+
+#define INTR_BUCKET_SZ		10
+#define INTR_THRS_PCT		800
+#define INTR_PRECISION		1000
+#endif
+
 struct rq;
 struct cpuidle_state;
 
@@ -59,6 +68,25 @@ extern atomic_long_t calc_load_tasks;
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+extern unsigned int intr_threshold;
+#define INTRLOAD_HIGH(_rq)	((_rq)->intrload > intr_threshold)
+
+extern void inc_intr_buckets(unsigned int intrload);
+extern void dec_intr_buckets(unsigned int intrload);
+extern void init_intr_buckets(void);
+extern void update_intr_load_threshold(void);
+static inline void init_intr_threshold(void) {intr_threshold = INTR_PRECISION; }
+#else
+#define INTRLOAD_HIGH(_rq)	(0)
+
+static inline void inc_intr_buckets(unsigned int intrload) { }
+static inline void dec_intr_buckets(unsigned int intrload) { }
+static inline void init_intr_buckets(void) { }
+static inline void update_intr_load_threshold(void) { }
+static inline void init_intr_threshold(void) { }
+#endif
+
 #ifdef CONFIG_SMP
 extern void cpu_load_update_active(struct rq *this_rq);
 #else
@@ -650,7 +678,9 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	unsigned int intrload;
+#endif
 	struct cfs_rq cfs;
 	struct rt_rq rt;
 	struct dl_rq dl;
@@ -1550,6 +1580,11 @@ static inline void rq_last_tick_reset(struct rq *rq)
 }
 
 extern void update_rq_clock(struct rq *rq);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+extern void update_rq_intrload(struct rq *rq);
+#else
+static inline void update_rq_intrload(struct rq *rq) { }
+#endif
 
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
-- 
2.7.4

Powered by blists - more mailing lists