linux-kernel - Re: [PATCH v3 2/2] nvme/pci: make PRP list DMA pools per-NUMA-node

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives

Hash Suite: Windows password security audit tool. GUI, reports in PDF.

[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]

Message-ID: <4cf8af38-419e-4d7c-95f7-7248faf3c7bb@grimberg.me>
Date: Tue, 22 Apr 2025 14:48:12 +0300
From: Sagi Grimberg <sagi@...mberg.me>
To: Caleb Sander Mateos <csander@...estorage.com>,
 Keith Busch <kbusch@...nel.org>, Jens Axboe <axboe@...nel.dk>,
 Christoph Hellwig <hch@....de>
Cc: Kanchan Joshi <joshi.k@...sung.com>, linux-nvme@...ts.infradead.org,
 linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3 2/2] nvme/pci: make PRP list DMA pools per-NUMA-node



On 21/04/2025 19:55, Caleb Sander Mateos wrote:
> NVMe commands with more than 4 KB of data allocate PRP list pages from
> the per-nvme_device dma_pool prp_page_pool or prp_small_pool. Each call
> to dma_pool_alloc() and dma_pool_free() takes the per-dma_pool spinlock.
> These device-global spinlocks are a significant source of contention
> when many CPUs are submitting to the same NVMe devices. On a workload
> issuing 32 KB reads from 16 CPUs (8 hypertwin pairs) across 2 NUMA nodes
> to 23 NVMe devices, we observed 2.4% of CPU time spent in
> _raw_spin_lock_irqsave called from dma_pool_alloc and dma_pool_free.
>
> Ideally, the dma_pools would be per-hctx to minimize
> contention. But that could impose considerable resource costs in a
> system with many NVMe devices and CPUs.
>
> As a compromise, allocate per-NUMA-node PRP list DMA pools. Map each
> nvme_queue to the set of DMA pools corresponding to its device and its
> hctx's NUMA node. This reduces the _raw_spin_lock_irqsave overhead by
> about half, to 1.2%. Preventing the sharing of PRP list pages across
> NUMA nodes also makes them cheaper to initialize.
>
> Link: https://lore.kernel.org/linux-nvme/CADUfDZqa=OOTtTTznXRDmBQo1WrFcDw1hBA7XwM7hzJ-hpckcA@mail.gmail.com/T/#u
> Signed-off-by: Caleb Sander Mateos <csander@...estorage.com>
> ---
>   drivers/nvme/host/pci.c | 144 +++++++++++++++++++++++-----------------
>   1 file changed, 84 insertions(+), 60 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 642890ddada5..7d86d1ec989a 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -16,10 +16,11 @@
>   #include <linux/kstrtox.h>
>   #include <linux/memremap.h>
>   #include <linux/mm.h>
>   #include <linux/module.h>
>   #include <linux/mutex.h>
> +#include <linux/nodemask.h>
>   #include <linux/once.h>
>   #include <linux/pci.h>
>   #include <linux/suspend.h>
>   #include <linux/t10-pi.h>
>   #include <linux/types.h>
> @@ -110,21 +111,24 @@ struct nvme_queue;
>   
>   static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
>   static void nvme_delete_io_queues(struct nvme_dev *dev);
>   static void nvme_update_attrs(struct nvme_dev *dev);
>   
> +struct nvme_prp_dma_pools {
> +	struct dma_pool *large;
> +	struct dma_pool *small;
> +};
> +
>   /*
>    * Represents an NVM Express device.  Each nvme_dev is a PCI function.
>    */
>   struct nvme_dev {
>   	struct nvme_queue *queues;
>   	struct blk_mq_tag_set tagset;
>   	struct blk_mq_tag_set admin_tagset;
>   	u32 __iomem *dbs;
>   	struct device *dev;
> -	struct dma_pool *prp_page_pool;
> -	struct dma_pool *prp_small_pool;
>   	unsigned online_queues;
>   	unsigned max_qid;
>   	unsigned io_queues[HCTX_MAX_TYPES];
>   	unsigned int num_vecs;
>   	u32 q_depth;
> @@ -160,10 +164,11 @@ struct nvme_dev {
>   	struct nvme_host_mem_buf_desc *host_mem_descs;
>   	void **host_mem_desc_bufs;
>   	unsigned int nr_allocated_queues;
>   	unsigned int nr_write_queues;
>   	unsigned int nr_poll_queues;
> +	struct nvme_prp_dma_pools prp_pools[];
>   };
>   
>   static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
>   {
>   	return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
> @@ -189,10 +194,11 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
>    * An NVM Express queue.  Each device has at least two (one for admin
>    * commands and one for I/O commands).
>    */
>   struct nvme_queue {
>   	struct nvme_dev *dev;
> +	struct nvme_prp_dma_pools prp_pools;
>   	spinlock_t sq_lock;
>   	void *sq_cmds;
>   	 /* only used for poll queues: */
>   	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
>   	struct nvme_completion *cqes;
> @@ -395,18 +401,67 @@ static int nvme_pci_npages_prp(void)
>   	unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
>   	unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
>   	return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
>   }
>   
> +static struct nvme_prp_dma_pools *
> +nvme_setup_prp_pools(struct nvme_dev *dev, unsigned numa_node)
> +{
> +	struct nvme_prp_dma_pools *prp_pools;
> +	size_t small_align = 256;
> +
> +	prp_pools = &dev->prp_pools[numa_node < nr_node_ids ? numa_node : 0];

I assume this is meant to handle numa_node == NUMA_NO_NODE?
If so, it would be clearer to check for that explicitly.

Otherwise, looks good
Reviewed-by: Sagi Grimberg <sagi@...mberg.me>