linux-kernel - Re: [PATCH v2 4/6] blk-mq: avoid inserting requests before establishing new mapping

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CACVXFVNpxqd1KsEtaunt5dUW7uh9RK1GxL4ZQ9im=eh3Mcr9ZQ@mail.gmail.com>
Date:	Thu, 9 Jul 2015 15:04:34 +0800
From:	Ming Lei <tom.leiming@...il.com>
To:	Akinobu Mita <akinobu.mita@...il.com>
Cc:	Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
	Jens Axboe <axboe@...nel.dk>
Subject: Re: [PATCH v2 4/6] blk-mq: avoid inserting requests before
 establishing new mapping

On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <akinobu.mita@...il.com> wrote:
> Notifier callbacks for CPU_ONLINE action can be run on the other CPU
> than the CPU which was just onlined.  So it is possible for the
> process running on the just onlined CPU to insert request and run
> hw queue before establishing new mapping which is done by
> blk_mq_queue_reinit_notify().
>
> This can cause a problem when the CPU has just been onlined first time
> since the request queue was initialized.  At this time ctx->index_hw
> for the CPU, which is the index in hctx->ctxs[] for this ctx, is still
> zero before blk_mq_queue_reinit_notify() is called by notifier
> callbacks for CPU_ONLINE action.
>
> For example, there is a single hw queue (hctx) and two CPU queues
> (ctx0 for CPU0, and ctx1 for CPU1).  Now CPU1 is just onlined and
> a request is inserted into ctx1->rq_list and set bit0 in pending
> bitmap as ctx1->index_hw is still zero.
>
> And then while running hw queue, flush_busy_ctxs() finds bit0 is set
> in pending bitmap and tries to retrieve requests in
> hctx->ctxs[0]->rq_list.  But htx->ctxs[0] is a pointer to ctx0, so the
> request in ctx1->rq_list is ignored.
>
> Fix it by ensuring that new mapping is established before onlined cpu
> starts running.
>
> Signed-off-by: Akinobu Mita <akinobu.mita@...il.com>
> Cc: Jens Axboe <axboe@...nel.dk>
> Cc: Ming Lei <tom.leiming@...il.com>
> ---
>  block/blk-mq-cpumap.c |  9 +++++----
>  block/blk-mq.c        | 41 ++++++++++++++++++++++++++---------------
>  block/blk-mq.h        |  3 ++-
>  3 files changed, 33 insertions(+), 20 deletions(-)
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 1e28ddb..8764c24 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
>         return cpu;
>  }
>
> -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
> +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                           const struct cpumask *online_mask)
>  {
>         unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
>         cpumask_var_t cpus;
> @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>         cpumask_clear(cpus);
>         nr_cpus = nr_uniq_cpus = 0;
> -       for_each_online_cpu(i) {
> +       for_each_cpu(i, online_mask) {
>                 nr_cpus++;
>                 first_sibling = get_first_sibling(i);
>                 if (!cpumask_test_cpu(first_sibling, cpus))
> @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
>         queue = 0;
>         for_each_possible_cpu(i) {
> -               if (!cpu_online(i)) {
> +               if (!cpumask_test_cpu(i, online_mask)) {
>                         map[i] = 0;
>                         continue;
>                 }
> @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
>         if (!map)
>                 return NULL;
>
> -       if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
> +       if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
>                 return map;
>
>         kfree(map);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ff72e18..ad07373 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1799,7 +1799,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>         }
>  }
>
> -static void blk_mq_map_swqueue(struct request_queue *q)
> +static void blk_mq_map_swqueue(struct request_queue *q,
> +                              const struct cpumask *online_mask)
>  {
>         unsigned int i;
>         struct blk_mq_hw_ctx *hctx;
> @@ -1816,7 +1817,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>          */
>         queue_for_each_ctx(q, ctx, i) {
>                 /* If the cpu isn't online, the cpu is mapped to first hctx */
> -               if (!cpu_online(i))
> +               if (!cpumask_test_cpu(i, online_mask))
>                         continue;
>
>                 hctx = q->mq_ops->map_queue(q, i);
> @@ -2046,7 +2047,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>
>         blk_mq_add_queue_tag_set(set, q);
>
> -       blk_mq_map_swqueue(q);
> +       get_online_cpus();
> +       blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, cpu_online_mask);
> +       blk_mq_map_swqueue(q, cpu_online_mask);
> +       put_online_cpus();
>
>         return q;
>
> @@ -2083,13 +2087,14 @@ void blk_mq_free_queue(struct request_queue *q)
>  }
>
>  /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q)
> +static void blk_mq_queue_reinit(struct request_queue *q,
> +                               const struct cpumask *online_mask)
>  {
>         WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>
>         blk_mq_sysfs_unregister(q);
>
> -       blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
> +       blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
>         /*
>          * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
> @@ -2097,25 +2102,31 @@ static void blk_mq_queue_reinit(struct request_queue *q)
>          * involves free and re-allocate memory, worthy doing?)
>          */
>
> -       blk_mq_map_swqueue(q);
> +       blk_mq_map_swqueue(q, online_mask);
>
>         blk_mq_sysfs_register(q);
>  }
>
> +static struct cpumask online_new;

Better to comment on why there isn't race on the single
global variable.

> +
>  static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>                                       unsigned long action, void *hcpu)
>  {
>         struct request_queue *q;
> +       int cpu = (unsigned long)hcpu;
>
> -       /*
> -        * Before new mappings are established, hotadded cpu might already
> -        * start handling requests. This doesn't break anything as we map
> -        * offline CPUs to first hardware queue. We will re-init the queue
> -        * below to get optimal settings.
> -        */

You should put some comment here about why the change is introduced
because the race isn't very obvious.

> -       if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
> -           action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
> +       switch (action & ~CPU_TASKS_FROZEN) {
> +       case CPU_DEAD:
> +       case CPU_UP_CANCELED:
> +               cpumask_copy(&online_new, cpu_online_mask);
> +               break;
> +       case CPU_UP_PREPARE:
> +               cpumask_copy(&online_new, cpu_online_mask);
> +               cpumask_set_cpu(cpu, &online_new);
> +               break;
> +       default:
>                 return NOTIFY_OK;
> +       }
>
>         mutex_lock(&all_q_mutex);
>
> @@ -2139,7 +2150,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>         }
>
>         list_for_each_entry(q, &all_q_list, all_q_node)
> -               blk_mq_queue_reinit(q);
> +               blk_mq_queue_reinit(q, &online_new);
>
>         list_for_each_entry(q, &all_q_list, all_q_node)
>                 blk_mq_unfreeze_queue(q);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 6a48c4c..f4fea79 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
>   * CPU -> queue mappings
>   */
>  extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
> -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
> +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> +                                  const struct cpumask *online_mask);
>  extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
>
>  /*
> --
> 1.9.1
>



-- 
Ming Lei
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/