Date:   Thu, 17 Sep 2020 10:15:31 -0400
From:   Vineeth Pillai <viremana@...ux.microsoft.com>
To:     Peter Zijlstra <peterz@...radead.org>,
        Julien Desfossez <jdesfossez@...italocean.com>
Cc:     Joel Fernandes <joelaf@...gle.com>,
        Tim Chen <tim.c.chen@...ux.intel.com>,
        Aaron Lu <aaron.lwe@...il.com>,
        Aubrey Li <aubrey.intel@...il.com>,
        Dhaval Giani <dhaval.giani@...cle.com>,
        Chris Hyser <chris.hyser@...cle.com>,
        Nishanth Aravamudan <naravamudan@...italocean.com>,
        mingo@...nel.org, tglx@...utronix.de, pjt@...gle.com,
        torvalds@...ux-foundation.org, linux-kernel@...r.kernel.org,
        fweisbec@...il.com, keescook@...omium.org, kerrnel@...gle.com,
        Phil Auld <pauld@...hat.com>,
        Valentin Schneider <valentin.schneider@....com>,
        Mel Gorman <mgorman@...hsingularity.net>,
        Pawan Gupta <pawan.kumar.gupta@...ux.intel.com>,
        Paolo Bonzini <pbonzini@...hat.com>, joel@...lfernandes.org,
        vineeth@...byteword.org, Chen Yu <yu.c.chen@...el.com>,
        Christian Brauner <christian.brauner@...ntu.com>,
        Agata Gruza <agata.gruza@...el.com>,
        Antonio Gomez Iglesias <antonio.gomez.iglesias@...el.com>,
        graf@...zon.com, konrad.wilk@...cle.com, dfaggioli@...e.com,
        rostedt@...dmis.org, derkling@...gle.com, benbjiang@...cent.com,
        Aaron Lu <ziqian.lzq@...fin.com>
Subject: Re: [RFC PATCH v7 11/23] sched/fair: core wide cfs task priority
 comparison

Hi Peter,

On 8/28/20 5:29 PM, Peter Zijlstra wrote:
>
> This is still a horrible patch..

I was working on your idea of using the lag as a tool to compare vruntime
across cpus:
https://lwn.net/ml/linux-kernel/20200506143506.GH5298@hirez.programming.kicks-ass.net/
https://lwn.net/ml/linux-kernel/20200514130248.GD2940@hirez.programming.kicks-ass.net/

Basically, I record the clock time when a sibling is force idled. Before
the comparison, if the sibling is still force idled, I take the weighted
delta of the time the sibling was force idled and decrement it from the
vruntime. The lag is computed in pick_task_fair() since we do the
cross-cpu comparison only after a task pick.

I think this can solve the SMTn issues since we are doing a one-sided
comparison.
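
To make the idea concrete, here is a condensed userspace sketch of the
comparison (toy structs and helper names like toy_se and weighted_lag
are mine, not kernel code; the weighting stands in for what
calc_delta_fair() does, and the real change is in the patch below):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024ULL

struct toy_se {
	uint64_t vruntime;	/* task vruntime */
	uint64_t min_vruntime;	/* min_vruntime of the task's cfs_rq */
	uint64_t core_lag;	/* ns the sibling spent force idled */
	uint64_t load;		/* task load weight */
};

/* Scale the wall-clock force-idle delta by the entity's weight. */
static uint64_t weighted_lag(const struct toy_se *se)
{
	return se->core_lag * NICE_0_LOAD / se->load;
}

/* Mirror cfs_prio_less(): true if a deserves the cpu less than b. */
static bool toy_cfs_prio_less(const struct toy_se *a, const struct toy_se *b)
{
	int64_t va = (int64_t)(a->vruntime - a->min_vruntime) - (int64_t)weighted_lag(a);
	int64_t vb = (int64_t)(b->vruntime - b->min_vruntime) - (int64_t)weighted_lag(b);

	return va > vb;
}

int main(void)
{
	/* b's runqueue sat force idled for 2ms, so its key is pulled back. */
	struct toy_se a = { 1000000, 900000, 0, 1024 };
	struct toy_se b = { 1050000, 900000, 2000000, 1024 };

	printf("prio_less(a, b) = %d\n", toy_cfs_prio_less(&a, &b));
	return 0;
}

(In the patch the lag is only applied when the two tasks sit on
different cfs_rqs; the toy applies it unconditionally to keep it short.)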

I have done some initial testing and it looks good, but I still need to
do a deeper analysis and check the fairness cases. I thought of sharing
the prototype to get feedback while I do further testing. Please have a
look and let me know your thoughts.

The git tree is here:
https://github.com/vineethrp/linux/tree/coresched/pre-v8-v5.9-rc-lagfix
It has the review comments addressed since v8 was posted.

Thanks,
Vineeth

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cea9a63c2e7a..52b3e7b356e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -136,19 +136,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b)
         if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
                 return !dl_time_before(a->dl.deadline, b->dl.deadline);

-       if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-               u64 vruntime = b->se.vruntime;
-
-               /*
-                * Normalize the vruntime if tasks are in different cpus.
-                */
-               if (task_cpu(a) != task_cpu(b)) {
-                       vruntime -= task_cfs_rq(b)->min_vruntime;
-                       vruntime += task_cfs_rq(a)->min_vruntime;
-               }
-
-               return !((s64)(a->se.vruntime - vruntime) <= 0);
-       }
+       if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
+               return cfs_prio_less(a, b);

         return false;
  }
@@ -235,6 +224,13 @@ static void sched_core_dequeue(struct rq *rq, struct task_struct *p)

         rb_erase(&p->core_node, &rq->core_tree);
         RB_CLEAR_NODE(&p->core_node);
+
+    /*
+     * Reset last force idle time if this rq will be
+     * empty after this dequeue.
+     */
+    if (rq->nr_running == 1)
+        rq->core_fi_time = 0;
  }

  /*
@@ -310,8 +306,10 @@ static int __sched_core_stopper(void *data)
                          * cgroup tags. However, dying tasks could still be
                          * left in core queue. Flush them here.
                          */
-                       if (!enabled)
+                       if (!enabled) {
                                 sched_core_flush(cpu);
+                               rq->core_fi_time = 0;
+                               /* XXX Reset cfs_rq->core_lag? */
+                       }

                         rq->core_enabled = enabled;
                 }
@@ -5181,9 +5179,17 @@ next_class:;
                 if (!rq_i->core_pick)
                         continue;

-               if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
-                   !rq_i->core->core_forceidle) {
-                       rq_i->core->core_forceidle = true;
+               if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running) {
+
+                       if (!rq_i->core->core_forceidle)
+                               rq_i->core->core_forceidle = true;
+
+                       /*
+                        * XXX: Should we record force idled time a bit later on the
+                        * pick_next_task of this sibling when it forces itself idle?
+                        */
+                       rq_i->core_fi_time = rq_clock_task(rq_i);
+                       trace_printk("force idle(cpu=%d) TS=%llu\n", i, rq_i->core_fi_time);
                 }

                 rq_i->core_pick->core_occupation = occ;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eadc62930336..7f62a50e23e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -389,6 +389,12 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
  }

+static inline bool
+is_same_tg(struct sched_entity *se, struct sched_entity *pse)
+{
+       return se->cfs_rq->tg == pse->cfs_rq->tg;
+}
+
  static void
  find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
@@ -453,6 +459,12 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return NULL;
  }

+static inline bool
+is_same_tg(struct sched_entity *se, struct sched_entity *pse)
+{
+       return true;
+}
+
  static inline void
  find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
@@ -6962,13 +6974,26 @@ static struct task_struct *pick_task_fair(struct rq *rq)
  {
         struct cfs_rq *cfs_rq = &rq->cfs;
         struct sched_entity *se;
+       u64 lag = 0;

         if (!cfs_rq->nr_running)
                 return NULL;

+       /*
+        * Calculate the lag caused due to force idle on this
+        * sibling. Since we do cross-cpu comparison of vruntime
+        * only after a task pick, it should be safe to compute
+        * the lag here.
+        */
+       if (rq->core_fi_time) {
+               lag = rq_clock_task(rq) - rq->core_fi_time;
+               if ((s64)lag < 0)
+                       lag = 0;
+       }
         do {
                 struct sched_entity *curr = cfs_rq->curr;

+
                 se = pick_next_entity(cfs_rq, NULL);

                 if (curr) {
@@ -6977,11 +7002,15 @@ static struct task_struct *pick_task_fair(struct rq *rq)

                         if (!se || entity_before(curr, se))
                                 se = curr;
+
+                       cfs_rq->core_lag += lag;
                 }

                 cfs_rq = group_cfs_rq(se);
         } while (cfs_rq);

+       rq->core_fi_time = 0;
+
         return task_of(se);
  }
  #endif
@@ -10707,6 +10736,41 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
             __entity_slice_used(&curr->se))
                 resched_curr(rq);
  }
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+       struct sched_entity *se_a = &a->se, *se_b = &b->se;
+       struct cfs_rq *cfs_rq_a, *cfs_rq_b;
+       u64 vruntime_a, vruntime_b;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       while (!is_same_tg(se_a, se_b)) {
+               int se_a_depth = se_a->depth;
+               int se_b_depth = se_b->depth;
+
+               if (se_a_depth <= se_b_depth)
+                       se_b = parent_entity(se_b);
+               if (se_a_depth >= se_b_depth)
+                       se_a = parent_entity(se_a);
+       }
+#endif
+
+       cfs_rq_a = cfs_rq_of(se_a);
+       cfs_rq_b = cfs_rq_of(se_b);
+
+       vruntime_a = se_a->vruntime - cfs_rq_a->min_vruntime;
+       vruntime_b = se_b->vruntime - cfs_rq_b->min_vruntime;
+
+       trace_printk("(%s/%d;%Ld,%Lu) ?< (%s/%d;%Ld,%Lu)\n",
+                    a->comm, a->pid, vruntime_a, cfs_rq_a->core_lag,
+                    b->comm, b->pid, vruntime_b, cfs_rq_b->core_lag);
+       if (cfs_rq_a != cfs_rq_b) {
+               vruntime_a -= calc_delta_fair(cfs_rq_a->core_lag, &a->se);
+               vruntime_b -= calc_delta_fair(cfs_rq_b->core_lag, &b->se);
+       }
+
+       return !((s64)(vruntime_a - vruntime_b) <= 0);
+}
  #else
  static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
  #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5374ace195b9..a1c15edf8dd5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -607,6 +607,10 @@ struct cfs_rq {
         struct list_head        throttled_list;
  #endif /* CONFIG_CFS_BANDWIDTH */
  #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_SCHED_CORE
+       u64                     core_lag;
+#endif
  };

  static inline int rt_bandwidth_enabled(void)
@@ -1056,6 +1060,7 @@ struct rq {
  #ifdef CONFIG_SCHED_CORE
         /* per rq */
         struct rq               *core;
+       u64                     core_fi_time; /* Last forced idle TS */
         struct task_struct      *core_pick;
         unsigned int            core_enabled;
         unsigned int            core_sched_seq;