Message-Id: <1447424529-13671-3-git-send-email-fweisbec@gmail.com>
Date:	Fri, 13 Nov 2015 15:22:04 +0100
From:	Frederic Weisbecker <fweisbec@...il.com>
To:	LKML <linux-kernel@...r.kernel.org>
Cc:	Frederic Weisbecker <fweisbec@...il.com>,
	Peter Zijlstra <peterz@...radead.org>,
	Chris Metcalf <cmetcalf@...hip.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Luiz Capitulino <lcapitulino@...hat.com>,
	Christoph Lameter <cl@...ux.com>,
	Ingo Molnar <mingo@...nel.org>,
	Viresh Kumar <viresh.kumar@...aro.org>,
	Rik van Riel <riel@...hat.com>
Subject: [PATCH 2/7] nohz: New tick dependency mask

The tick dependency is evaluated on every IRQ: a batch of checks that
determine whether it is safe to stop the tick or not. These checks are
split across many subsystems: posix cpu timers, scheduler, sched clock,
perf events. Each of these is in turn made of smaller details: posix cpu
timers involve checking process-wide timers then thread-wide timers; perf
involves checking freq events then more per-CPU details.

Checking all these details asynchronously every time we update the full
dynticks state brings avoidable overhead and a messy layout.

Let's instead introduce tick dependency masks: one for system-wide
dependencies (unstable sched clock), one for CPU-wide dependencies (sched,
perf), and one each for task and signal level dependencies. The subsystems
are responsible for setting and clearing their dependencies through a set
of APIs that take care of concurrent dependency mask modifications and
kick the targets to restart the relevant CPU's tick whenever needed.

This new dependency engine stays beside the old one until all subsystems
that have a tick dependency are converted to it.

Suggested-by: Thomas Gleixner <tglx@...utronix.de>
Suggested-by: Peter Zijlstra <peterz@...radead.org>
Cc: Christoph Lameter <cl@...ux.com>
Cc: Chris Metcalf <cmetcalf@...hip.com>
Cc: Ingo Molnar <mingo@...nel.org>
Cc: Luiz Capitulino <lcapitulino@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Cc: Rik van Riel <riel@...hat.com>
Cc: Thomas Gleixner <tglx@...utronix.de>
Cc: Viresh Kumar <viresh.kumar@...aro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@...il.com>
---
 include/linux/sched.h    |   8 +++
 include/linux/tick.h     |  21 ++++++++
 kernel/time/tick-sched.c | 130 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/time/tick-sched.h |   1 +
 4 files changed, 155 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f87559d..a65782f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -706,6 +706,10 @@ struct signal_struct {
 	/* Earliest-expiration cache. */
 	struct task_cputime cputime_expires;
 
+#ifdef CONFIG_NO_HZ_FULL
+	unsigned long tick_dependency;
+#endif
+
 	struct list_head cpu_timers[3];
 
 	struct pid *tty_old_pgrp;
@@ -1528,6 +1532,10 @@ struct task_struct {
 		VTIME_SYS,
 	} vtime_snap_whence;
 #endif
+
+#ifdef CONFIG_NO_HZ_FULL
+	unsigned long tick_dependency;
+#endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	u64 start_time;		/* monotonic time in nsec */
 	u64 real_start_time;	/* boot based time in nsec */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index e312219..472fd59 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -97,6 +97,18 @@ static inline void tick_broadcast_exit(void)
 	tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT);
 }
 
+enum tick_dependency_bit {
+	TICK_POSIX_TIMER_BIT	= 0,
+	TICK_PERF_EVENTS_BIT	= 1,
+	TICK_SCHED_BIT		= 2,
+	TICK_CLOCK_UNSTABLE_BIT	= 3
+};
+
+#define TICK_POSIX_TIMER_MASK		(1 << TICK_POSIX_TIMER_BIT)
+#define TICK_PERF_EVENTS_MASK		(1 << TICK_PERF_EVENTS_BIT)
+#define TICK_SCHED_MASK			(1 << TICK_SCHED_BIT)
+#define TICK_CLOCK_UNSTABLE_MASK	(1 << TICK_CLOCK_UNSTABLE_BIT)
+
 #ifdef CONFIG_NO_HZ_COMMON
 extern int tick_nohz_tick_stopped(void);
 extern void tick_nohz_idle_enter(void);
@@ -152,6 +164,15 @@ static inline int housekeeping_any_cpu(void)
 	return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 }
 
+extern void __tick_nohz_set_dep_delayed(enum tick_dependency_bit bit,
+					unsigned long *dep);
+extern void __tick_nohz_clear_dep(enum tick_dependency_bit bit,
+				  unsigned long *dep);
+extern void tick_nohz_set_dep(enum tick_dependency_bit bit);
+extern void tick_nohz_clear_dep(enum tick_dependency_bit bit);
+extern void tick_nohz_set_dep_cpu(enum tick_dependency_bit bit, int cpu);
+extern void tick_nohz_clear_dep_cpu(enum tick_dependency_bit bit, int cpu);
+
 extern void tick_nohz_full_kick(void);
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void tick_nohz_full_kick_all(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7c7ec45..b9ea21d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -156,11 +156,53 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 cpumask_var_t tick_nohz_full_mask;
 cpumask_var_t housekeeping_mask;
 bool tick_nohz_full_running;
+static unsigned long tick_dependency;
 
-static bool can_stop_full_tick(void)
+static void trace_tick_dependency(unsigned long dep)
+{
+	if (dep & TICK_POSIX_TIMER_MASK) {
+		trace_tick_stop(0, "posix timers running\n");
+		return;
+	}
+
+	if (dep & TICK_PERF_EVENTS_MASK) {
+		trace_tick_stop(0, "perf events running\n");
+		return;
+	}
+
+	if (dep & TICK_SCHED_MASK) {
+		trace_tick_stop(0, "more than 1 task in runqueue\n");
+		return;
+	}
+
+	if (dep & TICK_CLOCK_UNSTABLE_MASK)
+		trace_tick_stop(0, "unstable sched clock\n");
+}
+
+static bool can_stop_full_tick(struct tick_sched *ts)
 {
 	WARN_ON_ONCE(!irqs_disabled());
 
+	if (tick_dependency) {
+		trace_tick_dependency(tick_dependency);
+		return false;
+	}
+
+	if (ts->tick_dependency) {
+		trace_tick_dependency(ts->tick_dependency);
+		return false;
+	}
+
+	if (current->tick_dependency) {
+		trace_tick_dependency(current->tick_dependency);
+		return false;
+	}
+
+	if (current->signal->tick_dependency) {
+		trace_tick_dependency(current->signal->tick_dependency);
+		return false;
+	}
+
 	if (!sched_can_stop_tick()) {
 		trace_tick_stop(0, "more than 1 task in runqueue\n");
 		return false;
@@ -176,9 +218,10 @@ static bool can_stop_full_tick(void)
 		return false;
 	}
 
-	/* sched_clock_tick() needs us? */
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 	/*
+	 * sched_clock_tick() needs us?
+	 *
 	 * TODO: kick full dynticks CPUs when
 	 * sched_clock_stable is set.
 	 */
@@ -253,6 +296,79 @@ void tick_nohz_full_kick_all(void)
 	preempt_enable();
 }
 
+void __tick_nohz_clear_dep(enum tick_dependency_bit bit,
+				       unsigned long *dep)
+{
+	clear_bit(bit, dep);
+}
+
+static void kick_all_work_fn(struct work_struct *work)
+{
+	tick_nohz_full_kick_all();
+}
+static DECLARE_WORK(kick_all_work, kick_all_work_fn);
+
+void __tick_nohz_set_dep_delayed(enum tick_dependency_bit bit, unsigned long *dep)
+{
+	unsigned long prev;
+
+	prev = fetch_or(dep, BIT_MASK(bit));
+	if (!prev) {
+		/*
+		 * We need the IPIs to be sent from sane process context.
+		 * The posix cpu timers are always set with irqs disabled.
+		 */
+		schedule_work(&kick_all_work);
+	}
+}
+
+/*
+ * Set a global tick dependency. Let's do the wide IPI kick asynchronously
+ * for callers with irqs disabled.
+ */
+void tick_nohz_set_dep(enum tick_dependency_bit bit)
+{
+	unsigned long prev;
+
+	prev = fetch_or(&tick_dependency, BIT_MASK(bit));
+	if (!prev)
+		tick_nohz_full_kick_all();
+}
+
+void tick_nohz_clear_dep(enum tick_dependency_bit bit)
+{
+	__tick_nohz_clear_dep(bit, &tick_dependency);
+}
+
+void tick_nohz_set_dep_cpu(enum tick_dependency_bit bit, int cpu)
+{
+	unsigned long prev;
+	struct tick_sched *ts;
+
+	ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+	prev = fetch_or(&ts->tick_dependency, BIT_MASK(bit));
+	if (!prev) {
+		preempt_disable();
+		/* Perf needs a local kick that is NMI safe */
+		if (cpu == smp_processor_id()) {
+			tick_nohz_full_kick();
+		} else {
+			/* Remote irq work not NMI-safe */
+			WARN_ON_ONCE(in_nmi());
+			tick_nohz_full_kick_cpu(cpu);
+		}
+		preempt_enable();
+	}
+}
+
+void tick_nohz_clear_dep_cpu(enum tick_dependency_bit bit, int cpu)
+{
+	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+	__tick_nohz_clear_dep(bit, &ts->tick_dependency);
+}
+
 /*
  * Re-evaluate the need for the tick as we switch the current task.
  * It might need the tick due to per task/process properties:
@@ -261,15 +377,19 @@ void tick_nohz_full_kick_all(void)
 void __tick_nohz_task_switch(void)
 {
 	unsigned long flags;
+	struct tick_sched *ts;
 
 	local_irq_save(flags);
 
 	if (!tick_nohz_full_cpu(smp_processor_id()))
 		goto out;
 
-	if (tick_nohz_tick_stopped() && !can_stop_full_tick())
-		tick_nohz_full_kick();
+	ts = this_cpu_ptr(&tick_cpu_sched);
 
+	if (ts->tick_stopped) {
+		if (current->tick_dependency || current->signal->tick_dependency)
+			tick_nohz_full_kick();
+	}
 out:
 	local_irq_restore(flags);
 }
@@ -722,7 +842,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 	if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
 		return;
 
-	if (can_stop_full_tick())
+	if (can_stop_full_tick(ts))
 		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 	else if (ts->tick_stopped)
 		tick_nohz_restart_sched_tick(ts, ktime_get());
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index a4a8d4e..d327f70 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -60,6 +60,7 @@ struct tick_sched {
 	u64				next_timer;
 	ktime_t				idle_expires;
 	int				do_timer_last;
+	unsigned long			tick_dependency;
 };
 
 extern struct tick_sched *tick_get_tick_sched(int cpu);
-- 
2.5.3
