lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9025c412-9b6f-4714-a880-e8e6084e1b4c@suse.de>
Date: Thu, 3 Jul 2025 08:58:02 +0200
From: Hannes Reinecke <hare@...e.de>
To: Daniel Wagner <wagi@...nel.org>, Jens Axboe <axboe@...nel.dk>,
 Keith Busch <kbusch@...nel.org>, Christoph Hellwig <hch@....de>,
 Sagi Grimberg <sagi@...mberg.me>, "Michael S. Tsirkin" <mst@...hat.com>
Cc: Aaron Tomlin <atomlin@...mlin.com>,
 "Martin K. Petersen" <martin.petersen@...cle.com>,
 Thomas Gleixner <tglx@...utronix.de>, Costa Shulyupin
 <costa.shul@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
 Valentin Schneider <vschneid@...hat.com>, Waiman Long <llong@...hat.com>,
 Ming Lei <ming.lei@...hat.com>, Frederic Weisbecker <frederic@...nel.org>,
 Mel Gorman <mgorman@...e.de>,
 Mathieu Desnoyers <mathieu.desnoyers@...icios.com>,
 linux-kernel@...r.kernel.org, linux-block@...r.kernel.org,
 linux-nvme@...ts.infradead.org, megaraidlinux.pdl@...adcom.com,
 linux-scsi@...r.kernel.org, storagedev@...rochip.com,
 virtualization@...ts.linux.dev, GR-QLogic-Storage-Upstream@...vell.com
Subject: Re: [PATCH v7 08/10] blk-mq: use hk cpus only when isolcpus=io_queue
 is enabled

On 7/2/25 18:33, Daniel Wagner wrote:
> Extend the capabilities of the generic CPU to hardware queue (hctx)
> mapping code, so it maps houskeeping CPUs and isolated CPUs to the
> hardware queues evenly.
> 
> A hctx is only operational when there is at least one online
> housekeeping CPU assigned (aka active_hctx). Thus, check the final
> mapping that there is no hctx which has only offline housekeeing CPU and
> online isolated CPUs.
> 
> Example mapping result:
> 
>    16 online CPUs
> 
>    isolcpus=io_queue,2-3,6-7,12-13
> 
> Queue mapping:
>          hctx0: default 0 2
>          hctx1: default 1 3
>          hctx2: default 4 6
>          hctx3: default 5 7
>          hctx4: default 8 12
>          hctx5: default 9 13
>          hctx6: default 10
>          hctx7: default 11
>          hctx8: default 14
>          hctx9: default 15
> 
> IRQ mapping:
>          irq 42 affinity 0 effective 0  nvme0q0
>          irq 43 affinity 0 effective 0  nvme0q1
>          irq 44 affinity 1 effective 1  nvme0q2
>          irq 45 affinity 4 effective 4  nvme0q3
>          irq 46 affinity 5 effective 5  nvme0q4
>          irq 47 affinity 8 effective 8  nvme0q5
>          irq 48 affinity 9 effective 9  nvme0q6
>          irq 49 affinity 10 effective 10  nvme0q7
>          irq 50 affinity 11 effective 11  nvme0q8
>          irq 51 affinity 14 effective 14  nvme0q9
>          irq 52 affinity 15 effective 15  nvme0q10
> 
> A corner case is when the number of online CPUs and present CPUs
> differ and the driver asks for less queues than online CPUs, e.g.
> 
>    8 online CPUs, 16 possible CPUs
> 
>    isolcpus=io_queue,2-3,6-7,12-13
>    virtio_blk.num_request_queues=2
> 
> Queue mapping:
>          hctx0: default 0 1 2 3 4 5 6 7 8 12 13
>          hctx1: default 9 10 11 14 15
> 
> IRQ mapping
>          irq 27 affinity 0 effective 0 virtio0-config
>          irq 28 affinity 0-1,4-5,8 effective 5 virtio0-req.0
>          irq 29 affinity 9-11,14-15 effective 0 virtio0-req.1
> 
> Signed-off-by: Daniel Wagner <wagi@...nel.org>
> ---
>   block/blk-mq-cpumap.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 191 insertions(+), 3 deletions(-)
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 8244ecf878358c0b8de84458dcd5100c2f360213..4cb2724a78e13216e50f0e6b1a18f19ea41a54f8 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -17,12 +17,25 @@
>   #include "blk.h"
>   #include "blk-mq.h"
>   
> +static struct cpumask blk_hk_online_mask;
> +
>   static unsigned int blk_mq_num_queues(const struct cpumask *mask,
>   				      unsigned int max_queues)
>   {
>   	unsigned int num;
>   
> -	num = cpumask_weight(mask);
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		const struct cpumask *hk_mask;
> +		struct cpumask avail_mask;
> +
> +		hk_mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +		cpumask_and(&avail_mask, mask, hk_mask);
> +
> +		num = cpumask_weight(&avail_mask);
> +	} else {
> +		num = cpumask_weight(mask);
> +	}
> +
>   	return min_not_zero(num, max_queues);
>   }
>   
> @@ -31,9 +44,13 @@ static unsigned int blk_mq_num_queues(const struct cpumask *mask,
>    *
>    * Returns an affinity mask that represents the queue-to-CPU mapping
>    * requested by the block layer based on possible CPUs.
> + * This helper takes isolcpus settings into account.
>    */
>   const struct cpumask *blk_mq_possible_queue_affinity(void)
>   {
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
> +		return housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +
>   	return cpu_possible_mask;
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_possible_queue_affinity);
> @@ -46,6 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_possible_queue_affinity);
>    */
>   const struct cpumask *blk_mq_online_queue_affinity(void)
>   {
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		cpumask_and(&blk_hk_online_mask, cpu_online_mask,
> +			    housekeeping_cpumask(HK_TYPE_IO_QUEUE));
> +		return &blk_hk_online_mask;
> +	}
> +
>   	return cpu_online_mask;
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_online_queue_affinity);
> @@ -57,7 +80,8 @@ EXPORT_SYMBOL_GPL(blk_mq_online_queue_affinity);
>    *		ignored.
>    *
>    * Calculates the number of queues to be used for a multiqueue
> - * device based on the number of possible CPUs.
> + * device based on the number of possible CPUs. This helper
> + * takes isolcpus settings into account.
>    */
>   unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
>   {
> @@ -72,7 +96,8 @@ EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
>    *		ignored.
>    *
>    * Calculates the number of queues to be used for a multiqueue
> - * device based on the number of online CPUs.
> + * device based on the number of online CPUs. This helper
> + * takes isolcpus settings into account.
>    */
>   unsigned int blk_mq_num_online_queues(unsigned int max_queues)
>   {
> @@ -80,11 +105,169 @@ unsigned int blk_mq_num_online_queues(unsigned int max_queues)
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
>   
> +static bool blk_mq_hk_validate(struct blk_mq_queue_map *qmap,
> +			       const struct cpumask *active_hctx)
> +{
> +	/*
> +	 * Verify if the mapping is usable.
> +	 *
> +	 * First, mark all hctx which have at least online houskeeping
> +	 * CPU assigned.
> +	 */
> +	for (int queue = 0; queue < qmap->nr_queues; queue++) {
> +		int cpu;
> +
> +		if (cpumask_test_cpu(queue, active_hctx)) {
> +			/*
> +			 * This htcx has at least one online houskeeping
> +			 * CPU thus it is able to serve any assigned
> +			 * isolated CPU.
> +			 */
> +			continue;
> +		}
> +
> +		/*
> +		 * There is no online houskeeping CPU for this hctx, all
> +		 * good as long as all isolated CPUs are also offline.
> +		 */
> +		for_each_online_cpu(cpu) {
> +			if (qmap->mq_map[cpu] != queue)
> +				continue;
> +
> +			pr_warn("Unable to create a usable CPU-to-queue mapping with the given constraints\n");
> +			return false;
> +		}
> +	}
> +
> +	return true;
> +}
> +
> +/*
> + * blk_mq_map_hk_queues - Create housekeeping CPU to
> + *                        hardware queue mapping
> + * @qmap:	CPU to hardware queue map
> + *
> + * Create a housekeeping CPU to hardware queue mapping in @qmap. @qmap
> + * contains a valid configuration honoring the isolcpus configuration.
> + */
> +static void blk_mq_map_hk_queues(struct blk_mq_queue_map *qmap)
> +{
> +	cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
> +	struct cpumask *hk_masks __free(kfree) = NULL;
> +	const struct cpumask *mask;
> +	unsigned int queue, cpu, nr_masks;
> +
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE))
> +		mask = housekeeping_cpumask(HK_TYPE_IO_QUEUE);
> +	else
> +		goto fallback;
> +
> +	if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
> +		goto fallback;
> +
> +	/* Map housekeeping CPUs to a hctx */
> +	hk_masks = group_mask_cpus_evenly(qmap->nr_queues, mask, &nr_masks);
> +	if (!hk_masks)
> +		goto fallback;
> +
> +	for (queue = 0; queue < qmap->nr_queues; queue++) {
> +		unsigned int idx = (qmap->queue_offset + queue) % nr_masks;
> +
> +		for_each_cpu(cpu, &hk_masks[idx]) {
> +			qmap->mq_map[cpu] = idx;
> +
> +			if (cpu_online(cpu))
> +				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);

Why cpu_online? Up until this point it really didn't matter if the 
affinity mask was set to 'online' or 'possible' cpus, but here you
require CPUs to be online...

> +		}
> +	}
> +
> +	/* Map isolcpus to hardware context */
> +	queue = cpumask_first(active_hctx);
> +	for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
> +		qmap->mq_map[cpu] = (qmap->queue_offset + queue) % nr_masks;
> +		queue = cpumask_next_wrap(queue, active_hctx);
> +	}

Really? Doesn't this map _all_ cpus, and not just the isolcpus?

> +
> +	if (!blk_mq_hk_validate(qmap, active_hctx))
> +		goto fallback;
> +
> +	return;
> +
> +fallback:
> +	/*
> +	 * Map all CPUs to the first hctx to ensure at least one online
> +	 * housekeeping CPU is serving it.
> +	 */
> +	for_each_possible_cpu(cpu)
> +		qmap->mq_map[cpu] = 0;

I think you need to map all hctx, no?

> +}
> +
> +/*
> + * blk_mq_map_hk_irq_queues - Create housekeeping CPU to
> + *                            hardware queue mapping
> + * @dev:	The device to map queues
> + * @qmap:	CPU to hardware queue map
> + * @offset:	Queue offset to use for the device
> + *
> + * Create a housekeeping CPU to hardware queue mapping in @qmap. @qmap
> + * contains a valid configuration honoring the isolcpus configuration.
> + */
> +static void blk_mq_map_hk_irq_queues(struct device *dev,
> +				     struct blk_mq_queue_map *qmap,
> +				     int offset)
> +{
> +	cpumask_var_t active_hctx __free(free_cpumask_var) = NULL;
> +	cpumask_var_t mask __free(free_cpumask_var) = NULL;
> +	unsigned int queue, cpu;
> +
> +	if (!zalloc_cpumask_var(&active_hctx, GFP_KERNEL))
> +		goto fallback;
> +
> +	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> +		goto fallback;
> +
> +	/* Map housekeeping CPUs to a hctx */
> +	for (queue = 0; queue < qmap->nr_queues; queue++) {
> +		for_each_cpu(cpu, dev->bus->irq_get_affinity(dev, offset + queue)) {
> +			qmap->mq_map[cpu] = qmap->queue_offset + queue;
> +
> +			cpumask_set_cpu(cpu, mask);
> +			if (cpu_online(cpu))
> +				cpumask_set_cpu(qmap->mq_map[cpu], active_hctx);

Now that is really curious. You pick up the interrupt affinity from the
'bus', which, I assume, is the PCI bus. And this would imply that the
bus can (or already is) programmed for this interrupt affinity.
Which would imply that this is a usable interrupt affinity from the
hardware perspective, irrespective on whether the cpu is online or not.
So why the check to cpu_online()? Can't we simply take the existing 
affinity and rely on the hardware to do the right thing?

> +		}
> +	}
> +
> +	/* Map isolcpus to hardware context */
> +	queue = cpumask_first(active_hctx);
> +	for_each_cpu_andnot(cpu, cpu_possible_mask, mask) {
> +		qmap->mq_map[cpu] = qmap->queue_offset + queue;
> +		queue = cpumask_next_wrap(queue, active_hctx);
> +	}
> +
> +	if (!blk_mq_hk_validate(qmap, active_hctx))
> +		goto fallback;
> +
> +	return;
> +
> +fallback:
> +	/*
> +	 * Map all CPUs to the first hctx to ensure at least one online
> +	 * housekeeping CPU is serving it.
> +	 */
> +	for_each_possible_cpu(cpu)
> +		qmap->mq_map[cpu] = 0;

Same comment as previously; don't we need to map all hctx?

> +}
> +
>   void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>   {
>   	const struct cpumask *masks;
>   	unsigned int queue, cpu, nr_masks;
>   
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		blk_mq_map_hk_queues(qmap);
> +		return;
> +	}
> +
>   	masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
>   	if (!masks) {
>   		for_each_possible_cpu(cpu)
> @@ -139,6 +322,11 @@ void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
>   	if (!dev->bus->irq_get_affinity)
>   		goto fallback;
>   
> +	if (housekeeping_enabled(HK_TYPE_IO_QUEUE)) {
> +		blk_mq_map_hk_irq_queues(dev, qmap, offset);
> +		return;
> +	}
> +
>   	for (queue = 0; queue < qmap->nr_queues; queue++) {
>   		mask = dev->bus->irq_get_affinity(dev, queue + offset);
>   		if (!mask)
> 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@...e.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ