linux-kernel - Re: [PATCH] android: binder: Disable preemption while holding the global binder lock

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CAD0t5oOuFD-oKyVbzfnAOoDqmQ3bXvjDO-N4gnyp0n0edC=fJw@mail.gmail.com>
Date:   Fri, 16 Sep 2016 17:42:04 -0700
From:   Todd Kjos <tkjos@...roid.com>
To:     Peter Zijlstra <peterz@...radead.org>
Cc:     Arve Hjønnevåg <arve@...roid.com>,
        Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
        Thomas Gleixner <tglx@...utronix.de>,
        "devel@...verdev.osuosl.org" <devel@...verdev.osuosl.org>,
        Riley Andrews <riandrews@...roid.com>,
        LKML <linux-kernel@...r.kernel.org>,
        Christoph Hellwig <hch@...radead.org>,
        Todd Kjos <tkjos@...gle.com>, Ingo Molnar <mingo@...nel.org>
Subject: Re: [PATCH] android: binder: Disable preemption while holding the
 global binder lock

Thanks Peter. We'll give that patch a try as part of our refactoring.
Looking at finer-grained locking and we'll try going back to rt_mutex
plus this patch.

On Wed, Sep 14, 2016 at 9:55 AM, Peter Zijlstra <peterz@...radead.org> wrote:
> On Wed, Sep 14, 2016 at 06:13:40PM +0200, Peter Zijlstra wrote:
>> On Wed, Sep 14, 2016 at 06:11:03PM +0200, Peter Zijlstra wrote:
>> > On Tue, Sep 13, 2016 at 12:53:27PM -0700, Arve Hjønnevåg wrote:
>> > > On Tue, Sep 13, 2016 at 12:32 AM, Peter Zijlstra <peterz@...radead.org> wrote:
>> > > > cgroups should be irrelevant, PI is unaware of them.
>> > >
>> > > I don't think cgroups are irrelevant. PI being unaware of them
>> > > explains the problem I described. If the task that holds the lock is
>> > > in a cgroup that has a low cpu.shares value, then boosting the task's
>> > > priority within that group does necessarily make it any more likely to
>> > > run.
>> >
>> > Thing is, for FIFO/DL the important parameters (prio and deadline resp.)
>> > are not cgroup dependent.
>> >
>> > For CFS you're right, and as per usual, cgroups will be a royal pain.
>> > While we can compute the total weight in the block chain, getting that
>> > back to a task which is stuck in a cgroup is just not going to work
>> > well.
>>
>> Not to mention the fact that the weight depends on the current running
>> state. Having those tasks block insta changes the actual weight.
>>
>> > /me curses @ cgroups.. bloody stupid things.
>>
>> More of that.
>
>
> Something a little like so... completely untested, and has a fair number
> of corner cases still left open I think..
>
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1691,8 +1691,8 @@ struct task_struct {
>         struct seccomp seccomp;
>
>  /* Thread group tracking */
> -       u32 parent_exec_id;
> -       u32 self_exec_id;
> +       u32 parent_exec_id;
> +       u32 self_exec_id;
>  /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
>   * mempolicy */
>         spinlock_t alloc_lock;
> @@ -1710,6 +1710,11 @@ struct task_struct {
>         struct task_struct *pi_top_task;
>         /* Deadlock detection and priority inheritance handling */
>         struct rt_mutex_waiter *pi_blocked_on;
> +       unsigned long pi_weight;
> +#ifdef CONFIG_CGROUP_SCHED
> +       struct task_group *orig_task_group;
> +       unsigned long normal_weight;
> +#endif
>  #endif
>
>  #ifdef CONFIG_DEBUG_MUTEXES
> @@ -1977,6 +1982,8 @@ static inline int tsk_nr_cpus_allowed(st
>         return p->nr_cpus_allowed;
>  }
>
> +extern unsigned long task_pi_weight(struct task_struct *p);
> +
>  #define TNF_MIGRATED   0x01
>  #define TNF_NO_GROUP   0x02
>  #define TNF_SHARED     0x04
> --- a/kernel/locking/rtmutex.c
> +++ b/kernel/locking/rtmutex.c
> @@ -20,6 +20,19 @@
>  #include "rtmutex_common.h"
>
>  /*
> + * !(rt_prio || dl_prio)
> + */
> +static inline bool other_prio(int prio)
> +{
> +       return prio >= MAX_RT_PRIO;
> +}
> +
> +static inline bool other_task(struct task_struct *task)
> +{
> +       return other_prio(task->prio);
> +}
> +
> +/*
>   * lock->owner state tracking:
>   *
>   * lock->owner holds the task_struct pointer of the owner. Bit 0
> @@ -226,6 +239,11 @@ rt_mutex_enqueue(struct rt_mutex *lock,
>
>         rb_link_node(&waiter->tree_entry, parent, link);
>         rb_insert_color(&waiter->tree_entry, &lock->waiters);
> +       /*
> +        * We want the lock->waiter_weight to reflect the sum of all queued
> +        * waiters.
> +        */
> +       lock->waiters_weight += waiter->weight;
>  }
>
>  static void
> @@ -239,6 +257,7 @@ rt_mutex_dequeue(struct rt_mutex *lock,
>
>         rb_erase(&waiter->tree_entry, &lock->waiters);
>         RB_CLEAR_NODE(&waiter->tree_entry);
> +       lock->waiters_weight += waiter->weight;
>  }
>
>  static void
> @@ -265,6 +284,18 @@ rt_mutex_enqueue_pi(struct task_struct *
>
>         rb_link_node(&waiter->pi_tree_entry, parent, link);
>         rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
> +
> +       /*
> +        * Since a task can own multiple locks, and we have one pi_waiter
> +        * for every lock, propagate the lock->waiter_weight, which is the sum
> +        * of all weights queued on that lock, into the top waiter, and add
> +        * that to the owning task's pi_weight.
> +        *
> +        * This results in pi_weight being the sum of all blocked waiters
> +        * on this task.
> +        */
> +       waiter->top_weight = waiter->lock->waiters_weight;
> +       task->pi_weight += waiter->top_weight;
>  }
>
>  static void
> @@ -278,6 +309,7 @@ rt_mutex_dequeue_pi(struct task_struct *
>
>         rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
>         RB_CLEAR_NODE(&waiter->pi_tree_entry);
> +       task->pi_weight -= waiter->top_weight;
>  }
>
>  static void rt_mutex_adjust_prio(struct task_struct *p)
> @@ -497,7 +529,7 @@ static int rt_mutex_adjust_prio_chain(st
>                  * detection is enabled we continue, but stop the
>                  * requeueing in the chain walk.
>                  */
> -               if (top_waiter != task_top_pi_waiter(task)) {
> +               if (top_waiter != task_top_pi_waiter(task) && !other_task(task)) {
>                         if (!detect_deadlock)
>                                 goto out_unlock_pi;
>                         else
> @@ -512,7 +544,7 @@ static int rt_mutex_adjust_prio_chain(st
>          * enabled we continue, but stop the requeueing in the chain
>          * walk.
>          */
> -       if (rt_mutex_waiter_equal(waiter, cmp_task(task))) {
> +       if (rt_mutex_waiter_equal(waiter, cmp_task(task)) && !other_task(task)) {
>                 if (!detect_deadlock)
>                         goto out_unlock_pi;
>                 else
> @@ -627,6 +659,11 @@ static int rt_mutex_adjust_prio_chain(st
>          */
>         waiter->prio = task->prio;
>         waiter->deadline = task->dl.deadline;
> +       /*
> +        * The weight of the task depends on the block chain, since
> +        * we're iterating that, its likely to have changed.
> +        */
> +       waiter->weight = task_pi_weight(task);
>
>         rt_mutex_enqueue(lock, waiter);
>
> @@ -685,6 +722,15 @@ static int rt_mutex_adjust_prio_chain(st
>                 waiter = rt_mutex_top_waiter(lock);
>                 rt_mutex_enqueue_pi(task, waiter);
>                 rt_mutex_adjust_prio(task);
> +
> +       } else if (other_task(task)) {
> +               /*
> +                * Propagate the lock->waiters_weight into task->pi_weight
> +                */
> +               rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
> +               rt_mutex_enqueue_pi(task, prerequeue_top_waiter);
> +               rt_mutex_adjust_prio(task);
> +
>         } else {
>                 /*
>                  * Nothing changed. No need to do any priority
> @@ -728,7 +774,7 @@ static int rt_mutex_adjust_prio_chain(st
>          * then we can stop the chain walk here if we are not in full
>          * deadlock detection mode.
>          */
> -       if (!detect_deadlock && waiter != top_waiter)
> +       if (!detect_deadlock && waiter != top_waiter && !other_task(task))
>                 goto out_put_task;
>
>         goto again;
> @@ -904,6 +950,7 @@ static int task_blocks_on_rt_mutex(struc
>         waiter->lock = lock;
>         waiter->prio = task->prio;
>         waiter->deadline = task->dl.deadline;
> +       waiter->weight = task_pi_weight(task);
>
>         /* Get the top priority waiter on the lock */
>         if (rt_mutex_has_waiters(lock))
> @@ -918,7 +965,7 @@ static int task_blocks_on_rt_mutex(struc
>                 return 0;
>
>         raw_spin_lock(&owner->pi_lock);
> -       if (waiter == rt_mutex_top_waiter(lock)) {
> +       if (other_task(owner) || waiter == rt_mutex_top_waiter(lock)) {
>                 rt_mutex_dequeue_pi(owner, top_waiter);
>                 rt_mutex_enqueue_pi(owner, waiter);
>
> @@ -1017,7 +1064,8 @@ static void mark_wakeup_next_waiter(stru
>  static void remove_waiter(struct rt_mutex *lock,
>                           struct rt_mutex_waiter *waiter)
>  {
> -       bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
> +       struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
> +       bool is_top_waiter = (waiter == top_waiter);
>         struct task_struct *owner = rt_mutex_owner(lock);
>         struct rt_mutex *next_lock;
>
> @@ -1032,12 +1080,12 @@ static void remove_waiter(struct rt_mute
>          * Only update priority if the waiter was the highest priority
>          * waiter of the lock and there is an owner to update.
>          */
> -       if (!owner || !is_top_waiter)
> +       if (!owner || (!other_task(owner) && !is_top_waiter))
>                 return;
>
>         raw_spin_lock(&owner->pi_lock);
>
> -       rt_mutex_dequeue_pi(owner, waiter);
> +       rt_mutex_dequeue_pi(owner, top_waiter);
>
>         if (rt_mutex_has_waiters(lock))
>                 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
> --- a/kernel/locking/rtmutex_common.h
> +++ b/kernel/locking/rtmutex_common.h
> @@ -34,6 +34,7 @@ struct rt_mutex_waiter {
>  #endif
>         int prio;
>         u64 deadline;
> +       unsigned weight, top_weight;
>  };
>
>  /*
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3725,6 +3725,8 @@ void rt_mutex_setprio(struct task_struct
>                 if (rt_prio(oldprio))
>                         p->rt.timeout = 0;
>                 p->sched_class = &fair_sched_class;
> +
> +               task_set_pi_weight(p);
>         }
>
>         p->prio = prio;
> @@ -7933,6 +7935,12 @@ static void sched_change_group(struct ta
>         tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
>                           struct task_group, css);
>         tg = autogroup_task_group(tsk, tg);
> +
> +#ifdef CONFIG_RT_MUTEXES
> +       tsk->orig_task_group = tg;
> +
> +       if (!tsk->pi_weight)
> +#endif
>         tsk->sched_task_group = tg;
>
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6646,6 +6646,32 @@ static void attach_tasks(struct lb_env *
>         raw_spin_unlock(&env->dst_rq->lock);
>  }
>
> +#ifdef CONFIG_RT_MUTEXES
> +/*
> + * See set_load_weight().
> + */
> +static inline unsigned long __task_normal_weight(struct task_struct *p)
> +{
> +       int prio = p->static_prio - MAX_RT_PRIO;
> +
> +       /*
> +        * SCHED_IDLE tasks get minimal weight:
> +        */
> +       if (idle_policy(p->policy))
> +               return scale_load(WEIGHT_IDLEPRIO);
> +
> +       return scale_load(sched_prio_to_weight[prio]);
> +}
> +
> +
> +static unsigned long task_normal_weight(struct task_struct *p);
> +
> +unsigned long task_pi_weight(struct task_struct *p)
> +{
> +       return task_normal_weight(p) + p->pi_weight;
> +}
> +#endif
> +
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  static void update_blocked_averages(int cpu)
>  {
> @@ -6717,7 +6743,80 @@ static unsigned long task_h_load(struct
>         return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
>                         cfs_rq_load_avg(cfs_rq) + 1);
>  }
> -#else
> +
> +#ifdef CONFIG_RT_MUTEXES
> +/*
> + * Humongous horrid hack.. because cgroups bloody stink.
> + *
> + * The idea with PI on CFS is to sum the weight of all blocked tasks but with
> + * cgroups the weight of a task depends on the running state of tasks. Blocking
> + * changes the weight.
> + *
> + * Paper over that problem by using the regular averages, and hoping boosting
> + * is short enough to not actually matter much.
> + *
> + * The next problem is that getting that weight back to the boosted task
> + * requires lifting it out of its cgroup. Ideally we'd place it in the first
> + * common parent, but *shees* then we'd have to compute that, so bail and stick
> + * it in the root group.
> + */
> +
> +static unsigned long task_normal_weight(struct task_struct *p)
> +{
> +       struct cfs_rq *cfs_rq = task_cfs_rq(p);
> +
> +       /*
> +        * Once we have ->pi_weight, we'll get boosted into the root group
> +        * and the below falls apart.
> +        */
> +       if (!p->pi_weight) {
> +               update_cfs_rq_h_load(cfs_rq);
> +               p->normal_weight =
> +                       div64_ul(__task_normal_weight(p) * cfs_rq->h_load,
> +                                       cfs_rq_load_avg(cfs_rq) + 1);
> +       }
> +
> +       return p->normal_weight;
> +}
> +
> +static void detach_task_cfs_rq(struct task_struct *p);
> +static void attach_task_cfs_rq(struct task_struct *p);
> +
> +void task_set_pi_weight(struct task_struct *p)
> +{
> +       unsigned long normal_weight = p->normal_weight;
> +       struct task_group *tg = &root_task_group;
> +       bool move_group;
> +
> +       if (!p->pi_weight) {
> +               tg = p->orig_task_group;
> +               normal_weight = __task_normal_weight(p);
> +       }
> +
> +       move_group = p->sched_task_group != tg;
> +
> +       if (move_group) {
> +               p->sched_task_group = tg;
> +
> +               detach_task_cfs_rq(p);
> +               set_task_rq(p, task_cpu(p));
> +
> +#ifdef CONFIG_SMP
> +               /* Tell se's cfs_rq has been changed -- migrated */
> +               p->se.avg.last_update_time = 0;
> +#endif
> +       }
> +
> +       update_load_set(&p->se.load, normal_weight + p->pi_weight);
> +
> +       if (move_group)
> +               attach_task_cfs_rq(p);
> +}
> +
> +#endif
> +
> +#else /* CONFIG_FAIR_GROUP_SCHED */
> +
>  static inline void update_blocked_averages(int cpu)
>  {
>         struct rq *rq = cpu_rq(cpu);
> @@ -6734,8 +6833,21 @@ static unsigned long task_h_load(struct
>  {
>         return p->se.avg.load_avg;
>  }
> +
> +#ifdef CONFIG_RT_MUTEXES
> +static unsigned long task_normal_weight(struct task_struct *p)
> +{
> +       return __task_normal_weight(p);
> +}
> +
> +void task_set_pi_weight(struct task_struct *p)
> +{
> +       update_load_set(&p->se.load, task_pi_weight(p));
> +}
>  #endif
>
> +#endif /* CONFIG_FAIR_GROUP_SCHED */
> +
>  /********** Helpers for find_busiest_group ************************/
>
>  enum group_type {
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1827,3 +1827,5 @@ static inline void cpufreq_trigger_updat
>  #else /* arch_scale_freq_capacity */
>  #define arch_scale_freq_invariant()    (false)
>  #endif
> +
> +extern void task_set_pi_weight(struct task_struct *p);