Date:	Sat, 27 Oct 2012 16:01:30 +0400
From:	Kirill Tkhai <tkhai@...dex.ru>
To:	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Cc:	Steven Rostedt <rostedt@...dmis.org>,
	Ingo Molnar <mingo@...nel.org>,
	Peter Zijlstra <peterz@...radead.org>
Subject: Re: [PATCH][sched] Ignore RT throttling if rq->rt tasks are the only running tasks in the rq

This patch needs a little rework. I'll send an updated version later.

Sorry for the noise.

Kirill

27.10.2012, 14:36, "Kirill Tkhai" <tkhai@...dex.ru>:
> The current throttling logic always skips the RT class if rq->rt is throttled.
> It doesn't handle the special case where RT tasks are the only running tasks
> in the rq, so it's possible for the CPU to pick up the idle task while RT
> tasks are available.
>
> This patch aims to avoid the above situation. The modified
> _pick_next_task_rt() looks at the total number of rq->rt tasks (including the
> sum over all child rt_rq's) and compares it with the number of all running
> tasks in the rq. If they are equal, the scheduler picks the highest-priority
> rq->rt task (children are considered too).
>
> Later, the first unthrottled rt_rq will replace this task. The case where a
> fair task appears is handled in the check_preempt_curr() function.
>
> The patch changes the logic of pick_rt_task() and pick_next_highest_task_rt():
> a negative cpu now always makes a task "picked". There are no other users of
> this possibility, so nobody is affected by this change.
>
> Signed-off-by: Kirill V Tkhai <tkhai@...dex.ru>
> CC: Steven Rostedt <rostedt@...dmis.org>
> CC: Ingo Molnar <mingo@...nel.org>
> CC: Peter Zijlstra <peterz@...radead.org>
>
> ---
>  kernel/sched/core.c  |    6 +++-
>  kernel/sched/rt.c    |   97 ++++++++++++++++++++++++++++++++------------------
>  kernel/sched/sched.h |    3 +-
>  3 files changed, 69 insertions(+), 37 deletions(-)
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index bf41f82..ecc9833 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -901,7 +901,9 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
>  {
>          const struct sched_class *class;
>
> -        if (p->sched_class == rq->curr->sched_class) {
> +        if (rq->curr->sched_class == rq->extended_class) {
> +                resched_task(rq->curr);
> +        } else if (p->sched_class == rq->curr->sched_class) {
>                  rq->curr->sched_class->check_preempt_curr(rq, p, flags);
>          } else {
>                  for_each_class(class) {
> @@ -2771,6 +2773,7 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
>          if (prev->on_rq || rq->skip_clock_update < 0)
>                  update_rq_clock(rq);
>          prev->sched_class->put_prev_task(rq, prev);
> +        rq->extended_class = NULL;
>  }
>
>  /*
> @@ -6892,6 +6895,7 @@ void __init sched_init(void)
>                  rq->calc_load_update = jiffies + LOAD_FREQ;
>                  init_cfs_rq(&rq->cfs);
>                  init_rt_rq(&rq->rt, rq);
> +                rq->extended_class = NULL;
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>                  root_task_group.shares = ROOT_TASK_GROUP_LOAD;
>                  INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 418feb0..6f6da20 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -274,15 +274,8 @@ static void update_rt_migration(struct rt_rq *rt_rq)
>
>  static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  {
> -        struct task_struct *p;
> -
> -        if (!rt_entity_is_task(rt_se))
> -                return;
> -
> -        p = rt_task_of(rt_se);
> -        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
> +        struct task_struct *p = rt_task_of(rt_se);
>
> -        rt_rq->rt_nr_total++;
>          if (p->nr_cpus_allowed > 1)
>                  rt_rq->rt_nr_migratory++;
>
> @@ -291,15 +284,8 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>
>  static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  {
> -        struct task_struct *p;
> -
> -        if (!rt_entity_is_task(rt_se))
> -                return;
> -
> -        p = rt_task_of(rt_se);
> -        rt_rq = &rq_of_rt_rq(rt_rq)->rt;
> +        struct task_struct *p = rt_task_of(rt_se);
>
> -        rt_rq->rt_nr_total--;
>          if (p->nr_cpus_allowed > 1)
>                  rt_rq->rt_nr_migratory--;
>
> @@ -467,6 +453,16 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
>          return p->prio != p->normal_prio;
>  }
>
> +static void extended_rt_unthrottles(struct rq *rq, struct rt_rq *rt_rq)
> +{
> +        struct task_struct *curr = rq->curr;
> +
> +        if (rt_rq_of_se(&curr->rt) == rt_rq)
> +                rq->extended_class = NULL;
> +        else
> +                resched_task(curr);
> +}
> +
>  #ifdef CONFIG_SMP
>  static inline const struct cpumask *sched_rt_period_mask(void)
>  {
> @@ -826,6 +822,9 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
>                                   */
>                                  if (rt_rq->rt_nr_running && rq->curr == rq->idle)
>                                          rq->skip_clock_update = -1;
> +
> +                                if (rq->extended_class == &rt_sched_class)
> +                                        extended_rt_unthrottles(rq, rt_rq);
>                          }
>                          if (rt_rq->rt_time || rt_rq->rt_nr_running)
>                                  idle = 0;
> @@ -1071,8 +1070,14 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>          WARN_ON(!rt_prio(prio));
>          rt_rq->rt_nr_running++;
>
> +        if (rt_entity_is_task(rt_se)) {
> +                struct rt_rq *rt = &rq_of_rt_rq(rt_rq)->rt;
> +
> +                rt->rt_nr_total++;
> +                inc_rt_migration(rt_se, rt);
> +        }
> +
>          inc_rt_prio(rt_rq, prio);
> -        inc_rt_migration(rt_se, rt_rq);
>          inc_rt_group(rt_se, rt_rq);
>  }
>
> @@ -1083,8 +1088,15 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>          WARN_ON(!rt_rq->rt_nr_running);
>          rt_rq->rt_nr_running--;
>
> +        if (rt_entity_is_task(rt_se)) {
> +                struct rt_rq *rt = &rq_of_rt_rq(rt_rq)->rt;
> +
> +                WARN_ON(!rt->rt_nr_total);
> +                rt->rt_nr_total--;
> +                dec_rt_migration(rt_se, rt);
> +        }
> +
>          dec_rt_prio(rt_rq, rt_se_prio(rt_se));
> -        dec_rt_migration(rt_se, rt_rq);
>          dec_rt_group(rt_se, rt_rq);
>  }
>
> @@ -1362,28 +1374,41 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
>          return next;
>  }
>
> +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
> +
>  static struct task_struct *_pick_next_task_rt(struct rq *rq)
>  {
> -        struct sched_rt_entity *rt_se;
> -        struct task_struct *p;
>          struct rt_rq *rt_rq;
> +        struct task_struct *p;
> +        int running, rt_total;
>
>          rt_rq = &rq->rt;
> +        running = rt_rq->rt_nr_running;
>
> -        if (!rt_rq->rt_nr_running)
> -                return NULL;
> +        /* If rq->rt is suitable to get tasks */
> +        if (running && !rt_rq_throttled(rt_rq)) {
> +                struct sched_rt_entity *rt_se;
>
> -        if (rt_rq_throttled(rt_rq))
> +                do {
> +                        rt_se = pick_next_rt_entity(rq, rt_rq);
> +                        BUG_ON(!rt_se);
> +                        rt_rq = group_rt_rq(rt_se);
> +                } while (rt_rq);
> +
> +                return rt_task_of(rt_se);
> +        }
> +
> +        rt_total = rt_rq->rt_nr_total;
> +
> +        /* If rq has no-RT tasks OR rt_rq and its children are empty */
> +        if (rt_total != rq->nr_running || !rt_total)
>                  return NULL;
>
> -        do {
> -                rt_se = pick_next_rt_entity(rq, rt_rq);
> -                BUG_ON(!rt_se);
> -                rt_rq = group_rt_rq(rt_se);
> -        } while (rt_rq);
> +        /* All running tasks are RT. Let's avoid idle wasting CPU time */
> +        p = pick_next_highest_task_rt(rq, -1);
> +        rq->extended_class = &rt_sched_class;
>
> -        p = rt_task_of(rt_se);
> -        p->se.exec_start = rq->clock_task;
> +        WARN_ON(!p || rq->cfs.h_nr_running);
>
>          return p;
>  }
> @@ -1392,9 +1417,11 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
>  {
>          struct task_struct *p = _pick_next_task_rt(rq);
>
> -        /* The running task is never eligible for pushing */
> -        if (p)
> +        if (p) {
> +                /* The running task is never eligible for pushing */
>                  dequeue_pushable_task(rq, p);
> +                p->se.exec_start = rq->clock_task;
> +        }
>
>  #ifdef CONFIG_SMP
>          /*
> @@ -1426,9 +1453,9 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
>
>  static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
>  {
> -        if (!task_running(rq, p) &&
> -            (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
> -            (p->nr_cpus_allowed > 1))
> +        if (cpu < 0 || (!task_running(rq, p)
> +            && (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))
> +            && p->nr_cpus_allowed > 1)))
>                  return 1;
>          return 0;
>  }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 508e77e..9fdacef 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -294,6 +294,7 @@ static inline int rt_bandwidth_enabled(void)
>  struct rt_rq {
>          struct rt_prio_array active;
>          unsigned int rt_nr_running;
> +        unsigned long rt_nr_total;
>  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
>          struct {
>                  int curr; /* highest queued rt task prio */
> @@ -304,7 +305,6 @@ struct rt_rq {
>  #endif
>  #ifdef CONFIG_SMP
>          unsigned long rt_nr_migratory;
> -        unsigned long rt_nr_total;
>          int overloaded;
>          struct plist_head pushable_tasks;
>  #endif
> @@ -396,6 +396,7 @@ struct rq {
>  #ifdef CONFIG_RT_GROUP_SCHED
>          struct list_head leaf_rt_rq_list;
>  #endif
> +        const struct sched_class *extended_class;
>
>          /*
>           * This is part of a global counter where only the total sum