linux-kernel - Re: 4.14: WARNING: CPU: 4 PID: 2895 at block/blk-mq.c:1144 with virtio-blk (also 4.12 stable)

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <07e95492-9237-5c0c-fae9-c5704c735d38@suse.de>
Date:   Thu, 23 Nov 2017 15:42:59 +0100
From:   Hannes Reinecke <hare@...e.de>
To:     Christoph Hellwig <hch@....de>, Jens Axboe <axboe@...nel.dk>
Cc:     Christian Borntraeger <borntraeger@...ibm.com>,
        Bart Van Assche <Bart.VanAssche@....com>,
        "linux-block@...r.kernel.org" <linux-block@...r.kernel.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Thomas Gleixner <tglx@...utronix.de>
Subject: Re: 4.14: WARNING: CPU: 4 PID: 2895 at block/blk-mq.c:1144 with
 virtio-blk (also 4.12 stable)

On 11/23/2017 03:34 PM, Christoph Hellwig wrote:
> FYI, the patch below changes both the irq and block mappings to
> always use the cpu possible map (should be split in two in due time).
> 
> I think this is the right way forward.  For every normal machine
> those two are the same, but for VMs with maxcpus above their normal
> count or some big iron that can grow more cpus it means we waste
> a few more resources for the not present but reserved cpus.  It
> fixes the reported issue for me:
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 9f8cffc8a701..3eb169f15842 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -16,11 +16,6 @@
>  
>  static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
>  {
> -	/*
> -	 * Non present CPU will be mapped to queue index 0.
> -	 */
> -	if (!cpu_present(cpu))
> -		return 0;
>  	return cpu % nr_queues;
>  }
>  
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 11097477eeab..612ce1fb7c4e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2114,16 +2114,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>  		INIT_LIST_HEAD(&__ctx->rq_list);
>  		__ctx->queue = q;
>  
> -		/* If the cpu isn't present, the cpu is mapped to first hctx */
> -		if (!cpu_present(i))
> -			continue;
> -
> -		hctx = blk_mq_map_queue(q, i);
> -
>  		/*
>  		 * Set local node, IFF we have more than one hw queue. If
>  		 * not, we remain on the home node of the device
>  		 */
> +		hctx = blk_mq_map_queue(q, i);
>  		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
>  			hctx->numa_node = local_memory_node(cpu_to_node(i));
>  	}
> @@ -2180,7 +2175,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>  	 *
>  	 * If the cpu isn't present, the cpu is mapped to first hctx.
>  	 */
> -	for_each_present_cpu(i) {
> +	for_each_possible_cpu(i) {
>  		hctx_idx = q->mq_map[i];
>  		/* unmapped hw queue can be remapped after CPU topo changed */
>  		if (!set->tags[hctx_idx] &&
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index e12d35108225..a37a3b4b6342 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
>  	}
>  }
>  
> -static cpumask_var_t *alloc_node_to_present_cpumask(void)
> +static cpumask_var_t *alloc_node_to_possible_cpumask(void)
>  {
>  	cpumask_var_t *masks;
>  	int node;
> @@ -62,7 +62,7 @@ static cpumask_var_t *alloc_node_to_present_cpumask(void)
>  	return NULL;
>  }
>  
> -static void free_node_to_present_cpumask(cpumask_var_t *masks)
> +static void free_node_to_possible_cpumask(cpumask_var_t *masks)
>  {
>  	int node;
>  
> @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
>  	kfree(masks);
>  }
>  
> -static void build_node_to_present_cpumask(cpumask_var_t *masks)
> +static void build_node_to_possible_cpumask(cpumask_var_t *masks)
>  {
>  	int cpu;
>  
> -	for_each_present_cpu(cpu)
> +	for_each_possible_cpu(cpu)
>  		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
>  }
>  
> -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
> +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
>  				const struct cpumask *mask, nodemask_t *nodemsk)
>  {
>  	int n, nodes = 0;
>  
>  	/* Calculate the number of nodes in the supplied affinity mask */
>  	for_each_node(n) {
> -		if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
> +		if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
>  			node_set(n, *nodemsk);
>  			nodes++;
>  		}
> @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	int last_affv = affv + affd->pre_vectors;
>  	nodemask_t nodemsk = NODE_MASK_NONE;
>  	struct cpumask *masks;
> -	cpumask_var_t nmsk, *node_to_present_cpumask;
> +	cpumask_var_t nmsk, *node_to_possible_cpumask;
>  
>  	/*
>  	 * If there aren't any vectors left after applying the pre/post
> @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	if (!masks)
>  		goto out;
>  
> -	node_to_present_cpumask = alloc_node_to_present_cpumask();
> -	if (!node_to_present_cpumask)
> +	node_to_possible_cpumask = alloc_node_to_possible_cpumask();
> +	if (!node_to_possible_cpumask)
>  		goto out;
>  
>  	/* Fill out vectors at the beginning that don't need affinity */
> @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  
>  	/* Stabilize the cpumasks */
>  	get_online_cpus();
> -	build_node_to_present_cpumask(node_to_present_cpumask);
> -	nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
> +	build_node_to_possible_cpumask(node_to_possible_cpumask);
> +	nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
>  				     &nodemsk);
>  
>  	/*
> @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	if (affv <= nodes) {
>  		for_each_node_mask(n, nodemsk) {
>  			cpumask_copy(masks + curvec,
> -				     node_to_present_cpumask[n]);
> +				     node_to_possible_cpumask[n]);
>  			if (++curvec == last_affv)
>  				break;
>  		}
> @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  		vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
>  
>  		/* Get the cpus on this node which are in the mask */
> -		cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
> +		cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
>  
>  		/* Calculate the number of cpus per vector */
>  		ncpus = cpumask_weight(nmsk);
> @@ -192,7 +192,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	/* Fill out vectors at the end that don't need affinity */
>  	for (; curvec < nvecs; curvec++)
>  		cpumask_copy(masks + curvec, irq_default_affinity);
> -	free_node_to_present_cpumask(node_to_present_cpumask);
> +	free_node_to_possible_cpumask(node_to_possible_cpumask);
>  out:
>  	free_cpumask_var(nmsk);
>  	return masks;
> @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
>  		return 0;
>  
>  	get_online_cpus();
> -	ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
> +	ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
>  	put_online_cpus();
>  	return ret;
>  }
> 
What will happen for the CPU hotplug case?
Wouldn't we route I/O to a disabled CPU with this patch?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@...e.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)