Message-ID: <158670111777.20085.1305752188791047060.tglx@nanos.tec.linutronix.de>
Date: Sun, 12 Apr 2020 14:18:37 -0000
From: Thomas Gleixner <tglx@...utronix.de>
To: Linus Torvalds <torvalds@...ux-foundation.org>
Cc: linux-kernel@...r.kernel.org, x86@...nel.org
Subject: [GIT pull] locking/urgent for 5.7-rc1
Linus,
please pull the latest locking/urgent branch from:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking-urgent-2020-04-12
up to: 9a019db0b6be: locking/lockdep: Improve 'invalid wait context' splat
Three small fixes/updates for the locking core code:
- Plug a task struct reference leak in the percpu rwsem implementation.
- Document the refcount interaction with PID_MAX_LIMIT
- Improve the 'invalid wait context' data dump in lockdep so it contains
all the information required to decode the problem
Thanks,
tglx
------------------>
Jann Horn (1):
locking/refcount: Document interaction with PID_MAX_LIMIT
Peter Zijlstra (1):
locking/lockdep: Improve 'invalid wait context' splat
Qian Cai (1):
locking/percpu-rwsem: Fix a task_struct refcount
include/linux/refcount.h | 23 ++++++++++++++-----
kernel/locking/lockdep.c | 51 ++++++++++++++++++++++++++-----------------
kernel/locking/percpu-rwsem.c | 3 ++-
3 files changed, 51 insertions(+), 26 deletions(-)
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0ac50cf62d06..0e3ee25eb156 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -38,11 +38,24 @@
* atomic operations, then the count will continue to edge closer to 0. If it
* reaches a value of 1 before /any/ of the threads reset it to the saturated
* value, then a concurrent refcount_dec_and_test() may erroneously free the
- * underlying object. Given the precise timing details involved with the
- * round-robin scheduling of each thread manipulating the refcount and the need
- * to hit the race multiple times in succession, there doesn't appear to be a
- * practical avenue of attack even if using refcount_add() operations with
- * larger increments.
+ * underlying object.
+ * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
+ * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
+ * With the current PID limit, if no batched refcounting operations are used and
+ * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
+ * operations, this makes it impossible for a saturated refcount to leave the
+ * saturation range, even if it is possible for multiple uses of the same
+ * refcount to nest in the context of a single task:
+ *
+ * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
+ * 0x40000000 / 0x400000 = 0x100 = 256
+ *
+ * If hundreds of references are added/removed with a single refcounting
+ * operation, it may potentially be possible to leave the saturation range; but
+ * given the precise timing details involved with the round-robin scheduling of
+ * each thread manipulating the refcount and the need to hit the race multiple
+ * times in succession, there doesn't appear to be a practical avenue of attack
+ * even if using refcount_add() operations with larger increments.
*
* Memory ordering
* ===============
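
[Editor's note: the headroom arithmetic in the comment above can be checked
with a small standalone userspace sketch. This is only illustrative:
REFCOUNT_SATURATED and PID_MAX_LIMIT are copied here with the values used on
64-bit kernels, not pulled from the kernel headers.]

#include <stdio.h>
#include <limits.h>

/* Values as used by 64-bit kernels (include/linux/refcount.h, threads.h). */
#define REFCOUNT_SATURATED	((unsigned int)(INT_MIN / 2))	/* 0xc0000000 */
#define PID_MAX_LIMIT		(4 * 1024 * 1024)		/* 0x400000   */

int main(void)
{
	/* 0u - x computes (UINT_MAX + 1) - x via unsigned wrap-around. */
	unsigned int headroom = 0u - REFCOUNT_SATURATED;	/* 0x40000000 */
	unsigned int per_task = headroom / PID_MAX_LIMIT;	/* 0x100 = 256 */

	printf("headroom above saturation: %#x\n", headroom);
	printf("unbatched increments per task needed to escape: %u\n", per_task);
	return 0;
}
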
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 1511690e4de7..ac10db66cc63 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3952,10 +3952,36 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+static inline short task_wait_context(struct task_struct *curr)
+{
+ /*
+ * Set appropriate wait type for the context; for IRQs we have to take
+ * into account force_irqthread as that is implied by PREEMPT_RT.
+ */
+ if (curr->hardirq_context) {
+ /*
+ * Check if force_irqthreads will run us threaded.
+ */
+ if (curr->hardirq_threaded || curr->irq_config)
+ return LD_WAIT_CONFIG;
+
+ return LD_WAIT_SPIN;
+ } else if (curr->softirq_context) {
+ /*
+ * Softirqs are always threaded.
+ */
+ return LD_WAIT_CONFIG;
+ }
+
+ return LD_WAIT_MAX;
+}
+
static int
print_lock_invalid_wait_context(struct task_struct *curr,
struct held_lock *hlock)
{
+ short curr_inner;
+
if (!debug_locks_off())
return 0;
if (debug_locks_silent)
@@ -3971,6 +3997,10 @@ print_lock_invalid_wait_context(struct task_struct *curr,
print_lock(hlock);
pr_warn("other info that might help us debug this:\n");
+
+ curr_inner = task_wait_context(curr);
+ pr_warn("context-{%d:%d}\n", curr_inner, curr_inner);
+
lockdep_print_held_locks(curr);
pr_warn("stack backtrace:\n");
@@ -4017,26 +4047,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next)
}
depth++;
- /*
- * Set appropriate wait type for the context; for IRQs we have to take
- * into account force_irqthread as that is implied by PREEMPT_RT.
- */
- if (curr->hardirq_context) {
- /*
- * Check if force_irqthreads will run us threaded.
- */
- if (curr->hardirq_threaded || curr->irq_config)
- curr_inner = LD_WAIT_CONFIG;
- else
- curr_inner = LD_WAIT_SPIN;
- } else if (curr->softirq_context) {
- /*
- * Softirqs are always threaded.
- */
- curr_inner = LD_WAIT_CONFIG;
- } else {
- curr_inner = LD_WAIT_MAX;
- }
+ curr_inner = task_wait_context(curr);
for (; depth < curr->lockdep_depth; depth++) {
struct held_lock *prev = curr->held_locks + depth;
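
[Editor's note: the decision logic that the new task_wait_context() helper
centralizes can be illustrated with a small userspace sketch. The struct and
the WAIT_* constants below are local stand-ins, not the kernel's LD_WAIT_*
enum (which has additional members and different values); only the branching
mirrors the helper.]

#include <stdio.h>
#include <stdbool.h>

/* Local stand-ins, not the kernel's LD_WAIT_* enum. */
enum { WAIT_SPIN, WAIT_CONFIG, WAIT_MAX };

struct ctx {
	bool hardirq_context;
	bool hardirq_threaded;	/* handler forced/requested threaded */
	bool irq_config;
	bool softirq_context;
};

static int wait_context(const struct ctx *c)
{
	if (c->hardirq_context)
		return (c->hardirq_threaded || c->irq_config) ?
			WAIT_CONFIG : WAIT_SPIN;
	if (c->softirq_context)
		return WAIT_CONFIG;	/* softirqs are always threaded */
	return WAIT_MAX;		/* plain task context */
}

int main(void)
{
	struct ctx hardirq  = { .hardirq_context = true };
	struct ctx threaded = { .hardirq_context = true, .hardirq_threaded = true };
	struct ctx softirq  = { .softirq_context = true };
	struct ctx task     = { 0 };

	printf("hardirq          -> %d (spin)\n",   wait_context(&hardirq));
	printf("threaded hardirq -> %d (config)\n", wait_context(&threaded));
	printf("softirq          -> %d (config)\n", wait_context(&softirq));
	printf("task             -> %d (max)\n",    wait_context(&task));
	return 0;
}
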
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index a008a1ba21a7..8bbafe3e5203 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -118,14 +118,15 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
unsigned int mode, int wake_flags,
void *key)
{
- struct task_struct *p = get_task_struct(wq_entry->private);
bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
struct percpu_rw_semaphore *sem = key;
+ struct task_struct *p;
/* concurrent against percpu_down_write(), can get stolen */
if (!__percpu_rwsem_trylock(sem, reader))
return 1;
+ p = get_task_struct(wq_entry->private);
list_del_init(&wq_entry->entry);
smp_store_release(&wq_entry->private, NULL);
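
[Editor's note: the shape of the percpu-rwsem fix, i.e. not taking a
task_struct reference until the trylock has succeeded, can be shown with a
small userspace sketch. struct obj, get_obj() and put_obj() below are
illustrative stand-ins for task_struct and get/put_task_struct(), not kernel
APIs, and the bail-out condition stands in for the "wakeup got stolen" case.]

#include <stdio.h>
#include <stdbool.h>

struct obj { int refs; };

static void get_obj(struct obj *o) { o->refs++; }
static void put_obj(struct obj *o) { o->refs--; }

/* Buggy shape: reference taken before the bail-out, never released. */
static int wake_buggy(struct obj *o, bool trylock_ok)
{
	get_obj(o);
	if (!trylock_ok)
		return 1;		/* leak: no matching put_obj() */
	/* ... wake the owner, then drop the reference ... */
	put_obj(o);
	return 0;
}

/* Fixed shape: only take the reference once we know we will use it. */
static int wake_fixed(struct obj *o, bool trylock_ok)
{
	if (!trylock_ok)
		return 1;
	get_obj(o);
	/* ... wake the owner, then drop the reference ... */
	put_obj(o);
	return 0;
}

int main(void)
{
	struct obj a = { .refs = 1 }, b = { .refs = 1 };

	wake_buggy(&a, false);
	wake_fixed(&b, false);
	printf("buggy refs=%d (leaked), fixed refs=%d\n", a.refs, b.refs);
	return 0;
}
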