linux-kernel - Re: [Query]: tick-sched: why don't we stop tick when we are running idle task?

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAKohpomgv-LbUcPxazfJkf0Jq0sSULxT0WjrbFovn0k0O-ORwg@mail.gmail.com>
Date:	Fri, 11 Apr 2014 22:08:30 +0530
From:	Viresh Kumar <viresh.kumar@...aro.org>
To:	Peter Zijlstra <peterz@...radead.org>
Cc:	Frederic Weisbecker <fweisbec@...il.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Lists linaro-kernel <linaro-kernel@...ts.linaro.org>
Subject: Re: [Query]: tick-sched: why don't we stop tick when we are running
 idle task?

On 11 April 2014 20:48, Peter Zijlstra <peterz@...radead.org> wrote:
> On Fri, Apr 11, 2014 at 04:53:35PM +0200, Frederic Weisbecker wrote:

> I think there's assumptions that tick runs on the local cpu;

Yes, many function behave that way, i.e. with smp_processor_id() as
CPU.

> also what
> are you going to do when running it on all remote cpus takes longer than
> the tick?
>
>> Otherwise (and ideally) we need to make the scheduler code able to handle long periods without
>> calling scheduler_tick(). But this is a lot more plumbing. And the scheduler has gazillions
>> accounting stuffs to handle. Sounds like a big nightmare to take that direction.
>
> So i'm not at all sure what you guys are talking about, but it seems to
> me you should all put down the bong and have a detox round instead.
>
> This all sounds like a cure worse than the problem.

So, what I was working on isn't ready yet but I would like to show what lines
I have been trying on. In case that is completely incorrect and I should stop
making that work :)

Please share your feedback about this (Yes there are several parts broken
currently, specially the assumption that tick runs on local CPU):

NOTE: Its rebased over some cleanups I did which aren't sent to LKML yet.

---------x-------------------------x---------------
    tick-sched: offload NO_HZ_FULL computation to timekeeping CPUs

    Signed-off-by: Viresh Kumar <viresh.kumar@...aro.org>
---
 include/linux/tick.h     |  2 ++
 kernel/time/tick-sched.c | 90
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 97bbb64..f8efa9f 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -62,6 +62,7 @@ enum tick_nohz_mode {
  */
 struct tick_sched {
        struct hrtimer                  sched_timer;
+       struct cpumask                  timekeeping_pending;
        enum tick_nohz_mode             nohz_mode;
        unsigned long                   check_clocks;
        unsigned long                   idle_jiffies;
@@ -77,6 +78,7 @@ struct tick_sched {
        ktime_t                         iowait_sleeptime;
        ktime_t                         sleep_length;
        ktime_t                         idle_expires;
+       unsigned int                    cpu;
        int                             inidle;
        int                             idle_active;
        int                             tick_stopped;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2d0b154..7560bd0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -532,13 +532,56 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

+static int tick_nohz_full_get_offload_cpu(unsigned int cpu)
+{
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+       unsigned int offload_cpu;
+       cpumask_t cpumask;
+
+       /* Don't pick any NO_HZ_FULL cpu */
+       cpumask_andnot(&cpumask, cpu_online_mask, tick_nohz_full_mask);
+       cpumask_clear_cpu(cpu, &cpumask);
+
+       /* Try for same node. */
+       offload_cpu = cpumask_first_and(nodemask, &cpumask);
+       if (offload_cpu < nr_cpu_ids)
+               return offload_cpu;
+
+       /* Any online will do */
+       return cpumask_any(&cpumask);
+}
+
+static void tick_nohz_full_offload_timer(void *info)
+{
+       struct tick_sched *ts = info;
+       hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+}
+
+static void tick_nohz_full_offload_tick(struct tick_sched *ts, ktime_t expires,
+                                       unsigned int cpu)
+{
+       unsigned int offload_cpu = tick_nohz_full_get_offload_cpu(cpu);
+
+       /* Offload accounting to timekeeper */
+       hrtimer_cancel(&ts->sched_timer);
+       hrtimer_set_expires(&ts->sched_timer, expires);
+
+       /*
+        * This is triggering a WARN_ON() as below routine doesn't support calls
+        * while interrupts are disabled. Need to think of some other way to get
+        * this fixed.
+        */
+       smp_call_function_single(offload_cpu, tick_nohz_full_offload_timer, ts,
+                                true);
+}
+
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
                                         ktime_t now, int cpu)
 {
        unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
        ktime_t last_update, expires, ret = { .tv64 = 0 };
        unsigned long rcu_delta_jiffies;
-       struct clock_event_device *dev = tick_get_cpu_device()->evtdev;
+       struct clock_event_device *dev = tick_get_device(cpu)->evtdev;
        u64 time_delta;

        time_delta = timekeeping_max_deferment();
@@ -661,8 +704,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct
tick_sched *ts,
                }

                if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-                       hrtimer_start(&ts->sched_timer, expires,
-                                     HRTIMER_MODE_ABS_PINNED);
+                       if (ts->inidle)
+                               hrtimer_start(&ts->sched_timer, expires,
+                                             HRTIMER_MODE_ABS_PINNED);
+                       else
+                               tick_nohz_full_offload_tick(ts, expires, cpu);
+
                        /* Check, if the timer was already in the past */
                        if (hrtimer_active(&ts->sched_timer))
                                goto out;
@@ -687,9 +734,7 @@ out:
 static void tick_nohz_full_stop_tick(struct tick_sched *ts)
 {
 #ifdef CONFIG_NO_HZ_FULL
-       int cpu = smp_processor_id();
-
-       if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+       if (!tick_nohz_full_cpu(ts->cpu) || is_idle_task(current))
                return;

        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
@@ -698,7 +743,7 @@ static void tick_nohz_full_stop_tick(struct tick_sched *ts)
        if (!can_stop_full_tick())
                return;

-       tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+       tick_nohz_stop_sched_tick(ts, ktime_get(), ts->cpu);
 #endif
 }

@@ -824,11 +869,21 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
 void tick_nohz_irq_exit(void)
 {
        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+       unsigned int cpu;

-       if (ts->inidle)
+       if (ts->inidle) {
                __tick_nohz_idle_enter(ts);
-       else
+       } else {
                tick_nohz_full_stop_tick(ts);
+
+               while (!cpumask_empty(&ts->timekeeping_pending)) {
+                       cpu = cpumask_first(&ts->timekeeping_pending);
+                       cpumask_clear_cpu(cpu, &ts->timekeeping_pending);
+
+                       /* Try to stop tick of NO_HZ_FULL cpu */
+                       tick_nohz_full_stop_tick(tick_get_tick_sched(cpu));
+               }
+       }
 }

 /**
@@ -1090,13 +1145,23 @@ static enum hrtimer_restart
tick_sched_timer(struct hrtimer *timer)
 {
        struct tick_sched *ts =
                container_of(timer, struct tick_sched, sched_timer);
+       struct tick_sched *this_ts = &__get_cpu_var(tick_cpu_sched);
        ktime_t now = ktime_get();

-       tick_sched_do_timer(now);
+       /* Running as timekeeper ? */
+       if (likely(ts == this_ts))
+               tick_sched_do_timer(now);
+       else
+               cpumask_set_cpu(ts->cpu, &this_ts->timekeeping_pending);
+
        tick_sched_handle(ts);

        hrtimer_forward(timer, now, tick_period);

+       /*
+        * Yes, we are scheduling next tick also while timekeeping on this_cpu.
+        * We will handle that from irq_exit().
+        */
        return HRTIMER_RESTART;
 }

@@ -1149,6 +1214,11 @@ void tick_setup_sched_timer(void)
                tick_nohz_active = 1;
        }
 #endif
+
+#ifdef CONFIG_NO_HZ_FULL
+       ts->cpu = smp_processor_id();
+       cpumask_clear(&ts->timekeeping_pending);
+#endif
 }
 #endif /* HIGH_RES_TIMERS */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/