lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210317045949.1584952-1-joshdon@google.com>
Date:   Tue, 16 Mar 2021 21:59:49 -0700
From:   Josh Don <joshdon@...gle.com>
To:     Ingo Molnar <mingo@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>
Cc:     Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
        Daniel Bristot de Oliveira <bristot@...hat.com>,
        Luis Chamberlain <mcgrof@...nel.org>,
        Kees Cook <keescook@...omium.org>,
        Iurii Zaikin <yzaikin@...gle.com>,
        linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
        David Rientjes <rientjes@...gle.com>,
        Oleg Rombakh <olegrom@...gle.com>,
        Paul Turner <pjt@...gle.com>, Josh Don <joshdon@...gle.com>
Subject: [PATCH] sched: Warn on long periods of pending need_resched

From: Paul Turner <pjt@...gle.com>

CPU scheduler marks need_resched flag to signal a schedule() on a
particular CPU. But, schedule() may not happen immediately in cases
where the current task is executing in the kernel mode (no
preemption state) for extended periods of time.

This patch adds a warn_on if need_resched is pending for more than the
time specified in sysctl resched_latency_warn_ms. Monitoring is done via
the tick and the accuracy is hence limited to jiffy scale. This also
means that we won't trigger the warning if the tick is disabled.

If resched_latency_warn_ms is set to the default value, only one warning
will be produced per boot.

This warning only exists under CONFIG_SCHED_DEBUG. If it goes off, it is
likely that there is a missing cond_resched() somewhere.

Signed-off-by: Paul Turner <pjt@...gle.com>
Signed-off-by: Josh Don <joshdon@...gle.com>
---
We've caught various bugs using this patch. In fact, a followup patch to
this one will be a patch introducing a missing cond_resched(). However,
this may be too noisy to have as default enabled (especially given that
it requires some tuning); I'm open to having this default disabled
instead.
 kernel/sched/core.c  | 91 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  6 +++
 kernel/sysctl.c      |  8 ++++
 3 files changed, 105 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 98191218d891..ac037fc87d5e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,25 @@ const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
 #undef SCHED_FEAT
+
+/*
+ * Print a warning if need_resched is set for at least this long. At the
+ * default value, only a single warning will be printed per boot.
+ *
+ * Values less than 2 disable the feature.
+ *
+ * A kernel compiled with CONFIG_KASAN tends to run more slowly on average.
+ * Increase the need_resched timeout to reduce false alarms.
+ */
+#ifdef CONFIG_KASAN
+#define RESCHED_DEFAULT_WARN_LATENCY_MS 101
+#define RESCHED_BOOT_QUIET_SEC 600
+#else
+#define RESCHED_DEFAULT_WARN_LATENCY_MS 51
+#define RESCHED_BOOT_QUIET_SEC 300
 #endif
+int sysctl_resched_latency_warn_ms = RESCHED_DEFAULT_WARN_LATENCY_MS;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4520,6 +4538,71 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+
+static inline u64 resched_latency_check(struct rq *rq)
+{
+	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+	bool warn_only_once = (latency_warn_ms == RESCHED_DEFAULT_WARN_LATENCY_MS);
+	u64 need_resched_latency, now = rq_clock(rq);
+	static bool warned_once;
+
+	if (warn_only_once && warned_once)
+		return 0;
+
+	if (!need_resched() || latency_warn_ms < 2)
+		return 0;
+
+	/* Disable this warning for the first few mins after boot */
+	if (now < RESCHED_BOOT_QUIET_SEC * NSEC_PER_SEC)
+		return 0;
+
+	if (!rq->last_seen_need_resched_ns) {
+		rq->last_seen_need_resched_ns = now;
+		rq->ticks_without_resched = 0;
+		return 0;
+	}
+
+	rq->ticks_without_resched++;
+	need_resched_latency = now - rq->last_seen_need_resched_ns;
+	if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+		return 0;
+
+	warned_once = true;
+
+	return need_resched_latency;
+}
+
+static inline void resched_latency_warn(int cpu, u64 latency)
+{
+	static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+	WARN(__ratelimit(&latency_check_ratelimit),
+	     "CPU %d: need_resched set for > %llu ns (%d ticks) "
+	     "without schedule\n",
+	     cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}
+
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+	long val;
+
+	if ((kstrtol(str, 0, &val))) {
+		pr_warn("Unable to set resched_latency_warn_ms\n");
+		return 1;
+	}
+
+	sysctl_resched_latency_warn_ms = val;
+	return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+static inline void resched_latency_warn(int cpu, u64 latency) {}
+#endif /* CONFIG_SCHED_DEBUG */
+
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4531,6 +4614,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
 	unsigned long thermal_pressure;
+	u64 resched_latency;
 
 	arch_scale_freq_tick();
 	sched_clock_tick();
@@ -4541,11 +4625,15 @@ void scheduler_tick(void)
 	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
 	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
+	resched_latency = resched_latency_check(rq);
 	calc_global_load_tick(rq);
 	psi_task_tick(rq);
 
 	rq_unlock(rq, &rf);
 
+	if (resched_latency)
+		resched_latency_warn(cpu, resched_latency);
+
 	perf_event_task_tick();
 
 #ifdef CONFIG_SMP
@@ -5040,6 +5128,9 @@ static void __sched notrace __schedule(bool preempt)
 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
+#ifdef CONFIG_SCHED_DEBUG
+	rq->last_seen_need_resched_ns = 0;
+#endif
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..912a8886bc7f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -57,6 +57,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -963,6 +964,11 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64 last_seen_need_resched_ns;
+	int ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
 	int membarrier_state;
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 62fbd09b5dc1..526094fc364e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -193,6 +193,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
 #endif /* CONFIG_SMP */
+extern int sysctl_resched_latency_warn_ms;
 #endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
@@ -1763,6 +1764,13 @@ static struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif /* CONFIG_NUMA_BALANCING */
+	{
+		.procname	= "resched_latency_warn_ms",
+		.data		= &sysctl_resched_latency_warn_ms,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
-- 
2.31.0.rc2.261.g7f71774620-goog

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ