lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANpmjNNJC=hZ+Tk3_XNx2ZTvs27+NtSdv0CApKKvJRgHGCsx4w@mail.gmail.com>
Date:   Tue, 18 Apr 2023 08:00:00 +0200
From:   Marco Elver <elver@...gle.com>
To:     Thomas Gleixner <tglx@...utronix.de>
Cc:     Frederic Weisbecker <frederic@...nel.org>,
        syzbot <syzbot+3b14b2ed9b3d06dcaa07@...kaller.appspotmail.com>,
        linux-kernel@...r.kernel.org, syzkaller-bugs@...glegroups.com,
        Anna-Maria Behnsen <anna-maria@...utronix.de>,
        Jacob Keller <jacob.e.keller@...el.com>,
        "Paul E. McKenney" <paulmck@...nel.org>,
        Peter Zijlstra <peterz@...radead.org>,
        Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: Re: [PATCH] posix-cpu-timers: Implement the missing
 timer_wait_running callback

On Mon, 17 Apr 2023 at 15:37, Thomas Gleixner <tglx@...utronix.de> wrote:
>
> For some unknown reason the introduction of the timer_wait_running callback
> missed to fixup posix CPU timers, which went unnoticed for almost four years.
> Marco reported recently that the WARN_ON() in timer_wait_running()
> triggers with a posix CPU timer test case.
>
> Posix CPU timers have two execution models for expiring timers depending on
> CONFIG_POSIX_CPU_TIMERS_TASK_WORK:
>
> 1) If not enabled, the expiry happens in hard interrupt context so
>    spin waiting on the remote CPU is reasonably time bound.
>
>    Implement an empty stub function for that case.
>
> 2) If enabled, the expiry happens in task work before returning to user
>    space or guest mode. The expired timers are marked as firing and moved
>    from the timer queue to a local list head with sighand lock held. Once
>    the timers are moved, sighand lock is dropped and the expiry happens in
>    fully preemptible context. That means the expiring task can be scheduled
>    out, migrated, interrupted etc. So spin waiting on it is more than
>    suboptimal.
>
>    The timer wheel has a timer_wait_running() mechanism for RT, which uses
>    a per CPU timer-base expiry lock which is held by the expiry code and the
>    task waiting for the timer function to complete blocks on that lock.
>
>    This does not work in the same way for posix CPU timers as there is no
>    timer base and expiry for process wide timers can run on any task
>    belonging to that process, but the concept of waiting on an expiry lock
>    can be used too in a slightly different way:
>
>     - Add a mutex to struct posix_cputimers_work. This struct is per task
>       and used to schedule the expiry task work from the timer interrupt.
>
>     - Add a task_struct pointer to struct cpu_timer which is used to store
> +       the task which runs the expiry. That's filled in when the task
>       moves the expired timers to the local expiry list. That's not
>       affecting the size of the k_itimer union as there are bigger union
>       members already
>
>     - Let the task take the expiry mutex around the expiry function
>
>     - Let the waiter acquire a task reference with rcu_read_lock() held and
>       block on the expiry mutex
>
>    This avoids spin-waiting on a task which might not even be on a CPU and
>    works nicely for RT too.
>
> Reported-by: Marco Elver <elver@...gle.com>

Tested-by: Marco Elver <elver@...gle.com>

Thanks!

> Fixes: ec8f954a40da ("posix-timers: Use a callback for cancel synchronization on PREEMPT_RT")
> Signed-off-by: Thomas Gleixner <tglx@...utronix.de>
> ---
>  include/linux/posix-timers.h   |    7 +++
>  kernel/time/posix-cpu-timers.c |   81 +++++++++++++++++++++++++++++++++--------
>  kernel/time/posix-timers.c     |    4 ++
>  3 files changed, 77 insertions(+), 15 deletions(-)
>
> --- a/include/linux/posix-timers.h
> +++ b/include/linux/posix-timers.h
> @@ -4,6 +4,7 @@
>
>  #include <linux/spinlock.h>
>  #include <linux/list.h>
> +#include <linux/mutex.h>
>  #include <linux/alarmtimer.h>
>  #include <linux/timerqueue.h>
>
> @@ -62,9 +63,10 @@ static inline int clockid_to_fd(const cl
>   * cpu_timer - Posix CPU timer representation for k_itimer
>   * @node:      timerqueue node to queue in the task/sig
>   * @head:      timerqueue head on which this timer is queued
> - * @task:      Pointer to target task
> + * @pid:       Pointer to target task PID
>   * @elist:     List head for the expiry list
>   * @firing:    Timer is currently firing
> + * @handling:  Pointer to the task which handles expiry
>   */
>  struct cpu_timer {
>         struct timerqueue_node  node;
> @@ -72,6 +74,7 @@ struct cpu_timer {
>         struct pid              *pid;
>         struct list_head        elist;
>         int                     firing;
> +       struct task_struct      *handling;
>  };
>
>  static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
> @@ -135,10 +138,12 @@ struct posix_cputimers {
>  /**
>   * posix_cputimers_work - Container for task work based posix CPU timer expiry
>   * @work:      The task work to be scheduled
> + * @mutex:     Mutex held around expiry in context of this task work
>   * @scheduled:  @work has been scheduled already, no further processing
>   */
>  struct posix_cputimers_work {
>         struct callback_head    work;
> +       struct mutex            mutex;
>         unsigned int            scheduled;
>  };
>
> --- a/kernel/time/posix-cpu-timers.c
> +++ b/kernel/time/posix-cpu-timers.c
> @@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct tim
>                         return expires;
>
>                 ctmr->firing = 1;
> +               /* See posix_cpu_timer_wait_running() */
> +               WRITE_ONCE(ctmr->handling, current);
>                 cpu_timer_dequeue(ctmr);
>                 list_add_tail(&ctmr->elist, firing);
>         }
> @@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(stru
>  #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
>  static void posix_cpu_timers_work(struct callback_head *work)
>  {
> +       struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
> +
> +       mutex_lock(&cw->mutex);
>         handle_posix_cpu_timers(current);
> +       mutex_unlock(&cw->mutex);
> +}
> +
> +/*
> + * Invoked from the posix-timer core when a cancel operation failed because
> + * the timer is marked firing. The caller holds rcu_read_lock(), which
> + * protects the timer and the task which is expiring it from being freed.
> + */
> +static void posix_cpu_timer_wait_running(struct k_itimer *timr)
> +{
> +       struct task_struct *tsk = READ_ONCE(timr->it.cpu.handling);
> +
> +       /* Has the handling task completed expiry already? */
> +       if (!tsk)
> +               return;
> +
> +       /* Ensure that the task cannot go away */
> +       get_task_struct(tsk);
> +       /* Now drop the RCU protection so the mutex can be locked */
> +       rcu_read_unlock();
> +       /* Wait on the expiry mutex */
> +       mutex_lock(&tsk->posix_cputimers_work.mutex);
> +       /* Release it immediately again. */
> +       mutex_unlock(&tsk->posix_cputimers_work.mutex);
> +       /* Drop the task reference. */
> +       put_task_struct(tsk);
> +       /* Relock RCU so the callsite is balanced */
> +       rcu_read_lock();
> +}
> +
> +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
> +{
> +       /* Ensure that timr->it.cpu.handling task cannot go away */
> +       rcu_read_lock();
> +       spin_unlock_irq(&timr->it_lock);
> +       posix_cpu_timer_wait_running(timr);
> +       rcu_read_unlock();
> +       /* @timr is on stack and is valid */
> +       spin_lock_irq(&timr->it_lock);
>  }
>
>  /*
> @@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct t
>                sizeof(p->posix_cputimers_work.work));
>         init_task_work(&p->posix_cputimers_work.work,
>                        posix_cpu_timers_work);
> +       mutex_init(&p->posix_cputimers_work.mutex);
>         p->posix_cputimers_work.scheduled = false;
>  }
>
> @@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timer
>         lockdep_posixtimer_exit();
>  }
>
> +static void posix_cpu_timer_wait_running(struct k_itimer *timr)
> +{
> +       cpu_relax();
> +}
> +
> +static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
> +{
> +       spin_unlock_irq(&timr->it_lock);
> +       cpu_relax();
> +       spin_lock_irq(&timr->it_lock);
> +}
> +
>  static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
>  {
>         return false;
> @@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(stru
>                  */
>                 if (likely(cpu_firing >= 0))
>                         cpu_timer_fire(timer);
> +               /* See posix_cpu_timer_wait_running() */
> +               WRITE_ONCE(timer->it.cpu.handling, NULL);
>                 spin_unlock(&timer->it_lock);
>         }
>  }
> @@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clocki
>                 expires = cpu_timer_getexpires(&timer.it.cpu);
>                 error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
>                 if (!error) {
> -                       /*
> -                        * Timer is now unarmed, deletion can not fail.
> -                        */
> +                       /* Timer is now unarmed, deletion can not fail. */
>                         posix_cpu_timer_del(&timer);
> +               } else {
> +                       while (error == TIMER_RETRY) {
> +                               posix_cpu_timer_wait_running_nsleep(&timer);
> +                               error = posix_cpu_timer_del(&timer);
> +                       }
>                 }
> -               spin_unlock_irq(&timer.it_lock);
>
> -               while (error == TIMER_RETRY) {
> -                       /*
> -                        * We need to handle case when timer was or is in the
> -                        * middle of firing. In other cases we already freed
> -                        * resources.
> -                        */
> -                       spin_lock_irq(&timer.it_lock);
> -                       error = posix_cpu_timer_del(&timer);
> -                       spin_unlock_irq(&timer.it_lock);
> -               }
> +               spin_unlock_irq(&timer.it_lock);
>
>                 if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
>                         /*
> @@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
>         .timer_del              = posix_cpu_timer_del,
>         .timer_get              = posix_cpu_timer_get,
>         .timer_rearm            = posix_cpu_timer_rearm,
> +       .timer_wait_running     = posix_cpu_timer_wait_running,
>  };
>
>  const struct k_clock clock_process = {
> --- a/kernel/time/posix-timers.c
> +++ b/kernel/time/posix-timers.c
> @@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_runni
>         rcu_read_lock();
>         unlock_timer(timer, *flags);
>
> +       /*
> +        * kc->timer_wait_running() might drop RCU lock. So @timer
> +        * cannot be touched anymore after the function returns!
> +        */
>         if (!WARN_ON_ONCE(!kc->timer_wait_running))
>                 kc->timer_wait_running(timer);
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ