Message-Id: <20161016155612.4784-10-vegard.nossum@oracle.com>
Date:   Sun, 16 Oct 2016 17:56:12 +0200
From:   Vegard Nossum <vegard.nossum@...cle.com>
To:     Akinobu Mita <akinobu.mita@...il.com>
Cc:     linux-kernel@...r.kernel.org,
        Vegard Nossum <vegard.nossum@...cle.com>
Subject: [PATCH 10/10] fault injection: inject faults in new/rare callchains

Before this patch, fault injection used a combination of randomness and
frequency to determine where to inject faults. The problem with this is
that code paths which are executed only rarely receive a proportionally
small number of injected faults, so their failure handling is almost
never exercised.

A better heuristic is to look at the actual callchain leading up to the
possible failure point: if we see a callchain that we have never seen
before, chances are it is a rare one, so we should definitely inject a
fault here, since we might not get the chance again later.

This patch uses a probabilistic set structure (similar to a Bloom
filter) to determine whether we have seen a particular callchain before,
by hashing the stack trace and atomically testing and setting the bit
corresponding to the current callchain's hash.
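
As an illustration, here is a minimal userspace sketch of the same
structure. All names here are made up for the example, and FNV-1a merely
stands in for the kernel's full_name_hash(); the patch itself uses
test_and_set_bit() on a static bitmap (the table size matches the 64-bit
sizing below, i.e. 2^20 bits):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stddef.h>
  #include <stdint.h>

  #define NR_BITS (16 * 1024 * 64)  /* total bits; must be a power of two */

  static _Atomic uint64_t seen_bitmap[NR_BITS / 64];

  /* FNV-1a over the raw stack entries, standing in for full_name_hash(). */
  static uint64_t hash_stack(const uintptr_t *entries, size_t n)
  {
          uint64_t h = 14695981039346656037ULL;
          const unsigned char *p = (const unsigned char *)entries;

          for (size_t i = 0; i < n * sizeof(*entries); i++)
                  h = (h ^ p[i]) * 1099511628211ULL;
          return h;
  }

  /* Returns true at most once per hash bucket: the first time it is seen. */
  static bool callchain_is_new(const uintptr_t *entries, size_t n)
  {
          uint64_t bit = hash_stack(entries, n) & (NR_BITS - 1);
          uint64_t mask = UINT64_C(1) << (bit % 64);
          uint64_t old = atomic_fetch_or(&seen_bitmap[bit / 64], mask);

          return !(old & mask);
  }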

There is a possibility of false positives (i.e. we think we have seen a
particular callchain before when in fact we haven't, and therefore we
don't inject a fault where we should have). We could use some sort of
random hash seed here, but the additional complexity doesn't seem worth
it to me.

This finds a lot more bugs than just plain fault injection.

To enable at run-time:

  echo Y > /sys/kernel/debug/fail*/fail_new_callsites
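
For example, to enable it only for slab allocation failures (assuming
CONFIG_FAILSLAB is enabled):

  echo Y > /sys/kernel/debug/failslab/fail_new_callsites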

Signed-off-by: Vegard Nossum <vegard.nossum@...cle.com>

---

v2: turned this into a runtime knob as suggested by Akinobu Mita and
bumped the hashtable size from 32 to 128 KiB (16 to 64 KiB on 32-bit).
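(For reference: seen_hashtable[] holds 16 * 1024 unsigned longs, i.e.
16384 * 8 = 128 KiB and 2^20 hash buckets on 64-bit, or 16384 * 4 =
64 KiB and 2^19 buckets on 32-bit.)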
---
 Documentation/fault-injection/fault-injection.txt |  8 +++++
 include/linux/fault-inject.h                      |  2 ++
 lib/Kconfig.debug                                 |  7 +++-
 lib/fault-inject.c                                | 40 ++++++++++++++++++++---
 4 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
index 2ef8531..1cbfbbb 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -81,6 +81,14 @@ configuration of fault-injection capabilities.
 	Any positive value limits failures to only processes indicated by
 	/proc/<pid>/make-it-fail==1.
 
+- /sys/kernel/debug/fail*/fail_new_callsites:
+
+	Format: { 'Y' | 'N' }
+	A value of 'Y' will cause a callsite which has never been seen
+	before (since the last boot) to fail immediately. This can be
+	useful for triggering fault injection for very rare or hard-to-hit
+	call chains.
+
 - /sys/kernel/debug/fail*/require-start:
 - /sys/kernel/debug/fail*/require-end:
 - /sys/kernel/debug/fail*/reject-start:
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index 9f4956d..022da94 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -19,6 +19,7 @@ struct fault_attr {
 	atomic_t space;
 	unsigned long verbose;
 	bool task_filter;
+	bool fail_new_callsites;
 	unsigned long stacktrace_depth;
 	unsigned long require_start;
 	unsigned long require_end;
@@ -34,6 +35,7 @@ struct fault_attr {
 		.interval = 1,						\
 		.times = ATOMIC_INIT(1),				\
 		.require_end = ULONG_MAX,				\
+		.fail_new_callsites = false,				\
 		.stacktrace_depth = 32,					\
 		.ratelimit_state = RATELIMIT_STATE_INIT_DISABLED,	\
 		.verbose = 2,						\
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1f13d02..d06a51d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1699,7 +1699,12 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 	select STACKTRACE
 	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC && !SCORE
 	help
-	  Provide stacktrace filter for fault-injection capabilities
+	  Provide stacktrace filter for fault-injection capabilities.
+
+	  This also allows you to enable fault-injection for new callsites,
+	  i.e. a forced fault the first time a given call chain is seen.
+	  This can be useful for triggering fault injection for very rare
+	  or hard-to-hit call chains.
 
 config LATENCYTOP
 	bool "Latency measuring infrastructure"
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index adba7c9..a401b14 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -63,7 +63,7 @@ static bool fail_task(struct fault_attr *attr, struct task_struct *task)
 
 #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
 
-static bool fail_stacktrace(struct fault_attr *attr)
+static bool fail_stacktrace(struct fault_attr *attr, unsigned int *hash)
 {
 	struct stack_trace trace;
 	int depth = attr->stacktrace_depth;
@@ -88,12 +88,20 @@ static bool fail_stacktrace(struct fault_attr *attr)
 			       entries[n] < attr->require_end)
 			found = true;
 	}
+
+	if (IS_ENABLED(CONFIG_FAULT_INJECTION_STACKTRACE_FILTER)) {
+		const char *start = (const char *) &entries[0];
+		const char *end = (const char *) &entries[trace.nr_entries];
+
+		*hash = full_name_hash(0, start, end - start);
+	}
+
 	return found;
 }
 
 #else
 
-static inline bool fail_stacktrace(struct fault_attr *attr)
+static inline bool fail_stacktrace(struct fault_attr *attr, unsigned int *hash)
 {
 	return true;
 }
@@ -134,6 +142,8 @@ static bool __fail(struct fault_attr *attr)
 
 bool should_fail(struct fault_attr *attr, ssize_t size)
 {
+	unsigned int hash = 0;
+
 	/* No need to check any other properties if the probability is 0 */
 	if (attr->probability == 0)
 		return false;
@@ -149,6 +159,25 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
 		return false;
 	}
 
+	if (!fail_stacktrace(attr, &hash))
+		return false;
+
+	if (IS_ENABLED(CONFIG_FAULT_INJECTION_STACKTRACE_FILTER) &&
+	    attr->fail_new_callsites) {
+		static unsigned long seen_hashtable[16 * 1024];
+
+		hash &= 8 * sizeof(seen_hashtable) - 1;
+		if (!test_and_set_bit(hash & (BITS_PER_LONG - 1),
+			&seen_hashtable[hash / BITS_PER_LONG]))
+		{
+			/*
+			 * If it's the first time we see this stacktrace, fail it
+			 * without a second thought.
+			 */
+			goto fail;
+		}
+	}
+
 	if (attr->interval > 1) {
 		attr->count++;
 		if (attr->count % attr->interval)
@@ -158,9 +187,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
 	if (attr->probability <= prandom_u32() % 100)
 		return false;
 
-	if (!fail_stacktrace(attr))
-		return false;
-
+fail:
 	return __fail(attr);
 }
 EXPORT_SYMBOL_GPL(should_fail);
@@ -241,6 +268,9 @@ struct dentry *fault_create_debugfs_attr(const char *name,
 
 #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER
 
+	if (!debugfs_create_bool("fail_new_callsites", mode, dir,
+				 &attr->fail_new_callsites))
+		goto fail;
 	if (!debugfs_create_stacktrace_depth("stacktrace-depth", mode, dir,
 				&attr->stacktrace_depth))
 		goto fail;
-- 
2.10.0.479.g221bd91
