Message-Id: <20171026140750.119265-2-maco@android.com>
Date: Thu, 26 Oct 2017 16:07:45 +0200
From: Martijn Coenen <maco@...roid.com>
To: gregkh@...uxfoundation.org, john.stultz@...aro.org,
tkjos@...gle.com, arve@...roid.com, sherryy@...roid.com,
tglx@...utronix.de, peterz@...radead.org, amit.pundir@...aro.org
Cc: linux-kernel@...r.kernel.org, devel@...verdev.osuosl.org,
maco@...gle.com, Martijn Coenen <maco@...roid.com>
Subject: [PATCH v3 1/6] ANDROID: binder: add support for RT prio inheritance.
Adds support for SCHED_BATCH/SCHED_FIFO/SCHED_RR priority inheritance
to the binder driver. The desired behavior is as follows:
Each thread in the binder threadpool runs at a default priority, which is
typically nice 0.
Binder nodes (endpoints that can receive binder transactions) can have a
minimum priority associated with them, which means that all transactions
targeting such a node must run at that priority or higher.
Let's say a synchronous transaction is made from task T1 in process P1
to process P2, into node N1, which has a minimum priority
'N1_min_priority' (see the sketch after these steps):
1) T1 wakes up a task T2 in P2, then blocks on a waitqueue for reply
2) T2 determines prio=max(prio_of_T1, N1_min_priority);
3) T2 sets its own priority to prio, and stores its old prio
4) T2 returns to userspace, does work
5) Eventually, T2 returns a reply transaction to the driver
6) T2 queues the reply to T1 and wakes it up
7) T2 restores its own priority to what it was
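Conceptually, step 2) just picks the stronger of the caller's priority and
the node's floor. The following is an illustrative sketch only, not code
from this patch (the helper name 'effective_prio' is invented here);
remember that kernel prio values are inverted, so the logical max() is a
numeric "lower value wins":
  /*
   * Illustrative sketch (kernel context assumed, <linux/sched.h>):
   * kernel prios are 0..99 for RT and 100..139 for CFS, and a lower
   * value means a higher priority.
   */
  struct binder_priority {
          unsigned int sched_policy;      /* SCHED_NORMAL/BATCH/FIFO/RR */
          int prio;                       /* kernel prio value */
  };

  static struct binder_priority effective_prio(struct binder_priority caller,
                                               int node_min_prio)
  {
          struct binder_priority prio = caller;

          /* Node floors are currently expressed as SCHED_NORMAL prios. */
          if (node_min_prio < prio.prio) {
                  prio.sched_policy = SCHED_NORMAL;
                  prio.prio = node_min_prio;
          }
          return prio;
  }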
For an asynchronous transaction (see the second sketch below):
1) T1 wakes up a task T2 in P2, returns to userspace (no work left)
2) T2 determines prio=max(default_prio, N1_min_priority)
3) T2 sets its own priority to prio, and stores its old prio
4) T2 returns to userspace, does work, completes
5) T2 calls back into kernel for more work
6) T2 restores its own priority to its default priority
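The asynchronous case only changes the baseline that feeds the node-floor
check above; another illustrative sketch (the helper name
'transaction_base_prio' is invented here as well):
  /*
   * Synchronous:  inherit the caller's policy/priority.
   * Asynchronous: the caller returns immediately, so start from the
   *               target process' default priority instead.
   */
  static struct binder_priority
  transaction_base_prio(bool oneway,
                        struct binder_priority caller_prio,
                        struct binder_priority target_default)
  {
          return oneway ? target_default : caller_prio;
  }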
This still leaves a race condition, where T2 wakes up and gets preempted
before it has a chance to change its own priority. This is addressed in
one of the next patches in the series, by letting T1 change the priority
of T2 *before* waking it up.
Signed-off-by: Martijn Coenen <maco@...roid.com>
---
drivers/android/binder.c | 217 ++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 188 insertions(+), 29 deletions(-)
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 95a96a254e5d..be6e7e753013 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -77,6 +77,7 @@
#endif
#include <uapi/linux/android/binder.h>
+#include <uapi/linux/sched/types.h>
#include "binder_alloc.h"
#include "binder_trace.h"
@@ -463,6 +464,22 @@ enum binder_deferred_state {
BINDER_DEFERRED_RELEASE = 0x04,
};
+/**
+ * struct binder_priority - scheduler policy and priority
+ * @sched_policy scheduler policy
+ * @prio [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
+ *
+ * The binder driver supports inheriting the following scheduler policies:
+ * SCHED_NORMAL
+ * SCHED_BATCH
+ * SCHED_FIFO
+ * SCHED_RR
+ */
+struct binder_priority {
+ unsigned int sched_policy;
+ int prio;
+};
+
/**
* struct binder_proc - binder process bookkeeping
* @proc_node: element for binder_procs list
@@ -542,7 +559,7 @@ struct binder_proc {
int requested_threads;
int requested_threads_started;
int tmp_ref;
- long default_priority;
+ struct binder_priority default_priority;
struct dentry *debugfs_entry;
struct binder_alloc alloc;
struct binder_context *context;
@@ -624,8 +641,8 @@ struct binder_transaction {
struct binder_buffer *buffer;
unsigned int code;
unsigned int flags;
- long priority;
- long saved_priority;
+ struct binder_priority priority;
+ struct binder_priority saved_priority;
kuid_t sender_euid;
/**
* @lock: protects @from, @to_proc, and @to_thread
@@ -1051,22 +1068,142 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc)
binder_wakeup_thread_ilocked(proc, thread, /* sync = */false);
}
-static void binder_set_nice(long nice)
+static bool is_rt_policy(int policy)
+{
+ return policy == SCHED_FIFO || policy == SCHED_RR;
+}
+
+static bool is_fair_policy(int policy)
+{
+ return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
+static bool binder_supported_policy(int policy)
+{
+ return is_fair_policy(policy) || is_rt_policy(policy);
+}
+
+static int to_userspace_prio(int policy, int kernel_priority)
+{
+ if (is_fair_policy(policy))
+ return PRIO_TO_NICE(kernel_priority);
+ else
+ return MAX_USER_RT_PRIO - 1 - kernel_priority;
+}
+
+static int to_kernel_prio(int policy, int user_priority)
+{
+ if (is_fair_policy(policy))
+ return NICE_TO_PRIO(user_priority);
+ else
+ return MAX_USER_RT_PRIO - 1 - user_priority;
+}
+
+/**
+ * binder_set_priority() - sets the scheduler priority of a task
+ * @task: task to set priority on
+ * @desired: desired priority to run at
+ *
+ * The scheduler policy of tasks is changed explicitly, because we want to
+ * support a few distinct features:
+ * 1) If the requested priority is higher than the maximum allowed priority,
+ * we want to "clip" at the highest supported priority.
+ * 2) For a future patch, we need to allow changing the priority of a task
+ * with a different UID; when we make a binder transaction from process A
+ * to process B with different UIDs, A must be able to set B's priority
+ * before B wakes up to handle the call. If B were to raise its own priority
+ * after waking up, a race condition exists where B gets preempted before
+ * it can raise its own priority.
+ *
+ * Feature 2) sounds like something an rt_mutex would solve, for example by
+ * having the caller proxy lock an rt_mutex on behalf of the callee, and then
+ * sleeping on it. But we have a few requirements that don't work with this
+ * approach:
+ * 1) binder supports a "minimum node priority", meaning that all transactions
+ * into a node must run at this priority at a minimum. This means that the
+ * desired priority for handling a transaction is not necessarily equal to
+ * the priority of the caller.
+ * 2) binder supports asynchronous transactions, where the caller is not blocked
+ * on transaction completion; so, it also can't be blocked on an rt_mutex.
+ * 3) similarly, there may not necessarily be a thread waiting for
+ * transactions at the time the call is made, so we don't know who to proxy-
+ * lock the lock for.
+ * 4) binder supports nested transactions, where A can call into B, and B can
+ * call back into A before returning a reply to the original transaction.
+ * This means that if A is blocked on an rt_mutex B holds, B must first wake
+ * up A to handle a new transaction, and only then can it proxy-lock and try
+ * to acquire the new rt_mutex. This leaves a race condition where B
+ * temporarily runs at its original priority.
+ * 5) rt_mutex does not currently support PI for CFS tasks.
+ */
+static void binder_set_priority(struct task_struct *task,
+ struct binder_priority desired)
{
- long min_nice;
+ int priority; /* user-space prio value */
+ bool has_cap_nice;
+ unsigned int policy = desired.sched_policy;
- if (can_nice(current, nice)) {
- set_user_nice(current, nice);
+ if (task->policy == policy && task->normal_prio == desired.prio)
return;
+
+ /*
+ * We need to check capabilities because sched_setscheduler() doesn't
+ * allow changing the scheduler of a task with a different UID. This
+ * in turn means we have to use sched_setscheduler_nocheck(), but we
+ * still want to check that the target process is allowed to run at
+ * the desired priority, so we do these checks here.
+ *
+ * Another reason is that we want to clip at the highest "allowed"
+ * limit, so if a process has an RLIMIT_RTPRIO of 50, and we ask it to
+ * run at 99, instead of declining the request, we run at prio 50.
+ */
+ has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE);
+
+ priority = to_userspace_prio(policy, desired.prio);
+
+ if (is_rt_policy(policy) && !has_cap_nice) {
+ long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO);
+
+ if (max_rtprio == 0) {
+ /* Fall back to SCHED_NORMAL */
+ policy = SCHED_NORMAL;
+ priority = MIN_NICE;
+ } else if (priority > max_rtprio) {
+ priority = max_rtprio;
+ }
}
- min_nice = rlimit_to_nice(rlimit(RLIMIT_NICE));
- binder_debug(BINDER_DEBUG_PRIORITY_CAP,
- "%d: nice value %ld not allowed use %ld instead\n",
- current->pid, nice, min_nice);
- set_user_nice(current, min_nice);
- if (min_nice <= MAX_NICE)
- return;
- binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
+
+ if (is_fair_policy(policy) && !has_cap_nice) {
+ long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE));
+
+ if (min_nice > MAX_NICE) {
+ binder_user_error("%d RLIMIT_NICE not set\n",
+ task->pid);
+ return;
+ } else if (priority < min_nice) {
+ priority = min_nice;
+ }
+ }
+
+ if (policy != desired.sched_policy ||
+ to_kernel_prio(policy, priority) != desired.prio)
+ binder_debug(BINDER_DEBUG_PRIORITY_CAP,
+ "%d: priority %d not allowed, using %d instead\n",
+ task->pid, desired.prio,
+ to_kernel_prio(policy, priority));
+
+ /* Set the actual priority */
+ if (task->policy != policy || is_rt_policy(policy)) {
+ struct sched_param params;
+
+ params.sched_priority = is_rt_policy(policy) ? priority : 0;
+
+ sched_setscheduler_nocheck(task,
+ policy | SCHED_RESET_ON_FORK,
+ &params);
+ }
+ if (is_fair_policy(policy))
+ set_user_nice(task, priority);
}
static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc,
@@ -1151,7 +1288,8 @@ static struct binder_node *binder_init_node_ilocked(
node->ptr = ptr;
node->cookie = cookie;
node->work.type = BINDER_WORK_NODE;
- node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
+ node->min_priority = NICE_TO_PRIO(
+ flags & FLAT_BINDER_FLAG_PRIORITY_MASK);
node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
spin_lock_init(&node->lock);
INIT_LIST_HEAD(&node->work.entry);
@@ -2688,7 +2826,7 @@ static void binder_transaction(struct binder_proc *proc,
}
thread->transaction_stack = in_reply_to->to_parent;
binder_inner_proc_unlock(proc);
- binder_set_nice(in_reply_to->saved_priority);
+ binder_set_priority(current, in_reply_to->saved_priority);
target_thread = binder_get_txn_from_and_acq_inner(in_reply_to);
if (target_thread == NULL) {
return_error = BR_DEAD_REPLY;
@@ -2853,7 +2991,15 @@ static void binder_transaction(struct binder_proc *proc,
t->to_thread = target_thread;
t->code = tr->code;
t->flags = tr->flags;
- t->priority = task_nice(current);
+ if (!(t->flags & TF_ONE_WAY) &&
+ binder_supported_policy(current->policy)) {
+ /* Inherit supported policies for synchronous transactions */
+ t->priority.sched_policy = current->policy;
+ t->priority.prio = current->normal_prio;
+ } else {
+ /* Otherwise, fall back to the default priority */
+ t->priority = target_proc->default_priority;
+ }
trace_binder_transaction(reply, t, target_node);
@@ -3761,7 +3907,7 @@ static int binder_thread_read(struct binder_proc *proc,
wait_event_interruptible(binder_user_error_wait,
binder_stop_on_user_error < 2);
}
- binder_set_nice(proc->default_priority);
+ binder_set_priority(current, proc->default_priority);
}
if (non_block) {
@@ -3973,16 +4119,21 @@ static int binder_thread_read(struct binder_proc *proc,
BUG_ON(t->buffer == NULL);
if (t->buffer->target_node) {
struct binder_node *target_node = t->buffer->target_node;
+ struct binder_priority prio = t->priority;
tr.target.ptr = target_node->ptr;
tr.cookie = target_node->cookie;
- t->saved_priority = task_nice(current);
- if (t->priority < target_node->min_priority &&
- !(t->flags & TF_ONE_WAY))
- binder_set_nice(t->priority);
- else if (!(t->flags & TF_ONE_WAY) ||
- t->saved_priority > target_node->min_priority)
- binder_set_nice(target_node->min_priority);
+ t->saved_priority.sched_policy = current->policy;
+ t->saved_priority.prio = current->normal_prio;
+ if (target_node->min_priority < t->priority.prio) {
+ /* The min_priority on a node can currently
+ * only use SCHED_NORMAL, so we can just
+ * hardcode this here.
+ */
+ prio.sched_policy = SCHED_NORMAL;
+ prio.prio = target_node->min_priority;
+ }
+ binder_set_priority(current, prio);
cmd = BR_TRANSACTION;
} else {
tr.target.ptr = 0;
@@ -4630,7 +4781,14 @@ static int binder_open(struct inode *nodp, struct file *filp)
get_task_struct(current->group_leader);
proc->tsk = current->group_leader;
INIT_LIST_HEAD(&proc->todo);
- proc->default_priority = task_nice(current);
+ if (binder_supported_policy(current->policy)) {
+ proc->default_priority.sched_policy = current->policy;
+ proc->default_priority.prio = current->normal_prio;
+ } else {
+ proc->default_priority.sched_policy = SCHED_NORMAL;
+ proc->default_priority.prio = NICE_TO_PRIO(0);
+ }
+
binder_dev = container_of(filp->private_data, struct binder_device,
miscdev);
proc->context = &binder_dev->context;
@@ -4922,13 +5080,14 @@ static void print_binder_transaction_ilocked(struct seq_file *m,
spin_lock(&t->lock);
to_proc = t->to_proc;
seq_printf(m,
- "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d",
+ "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %d:%d r%d",
prefix, t->debug_id, t,
t->from ? t->from->proc->pid : 0,
t->from ? t->from->pid : 0,
to_proc ? to_proc->pid : 0,
t->to_thread ? t->to_thread->pid : 0,
- t->code, t->flags, t->priority, t->need_reply);
+ t->code, t->flags, t->priority.sched_policy,
+ t->priority.prio, t->need_reply);
spin_unlock(&t->lock);
if (proc != to_proc) {
--
2.15.0.rc2.357.g7e34df9404-goog