Message-ID: <CAEHaoC1mHQp62GMBofGJwKBkjgf8rQr_tUxwESKa7Ehd6_3nWA@mail.gmail.com>
Date: Sun, 3 Aug 2025 14:15:32 +0300
From: Constantine Gavrilov <cgavrilov@...inidat.com>
To: linux-kernel@...r.kernel.org, Marek Szyprowski <m.szyprowski@...sung.com>,
Robin Murphy <robin.murphy@....com>, iommu@...ts.linux.dev
Subject: [PATCH 6/8] Large DMA alloc: allocate DMA addresses from the top
This is the seventh patch in the set of patches that enable large IOMMU
DMA registrations. The entire work is available on the master branch of
the git@...hub.com:cgavrilov/linux.git repo.
This patch ensures that addresses in an IOMMU group are allocated from the top
to the bottom of the address space. It fixes some issues with the use of the
cached_node and cached32_node fields of the iova_domain structure that
resulted in fragmentation of the address space. A fragmented address space can
lead to failed allocations of large DMA ranges.
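For illustration only, here is a stand-alone user-space sketch (not kernel code;
the page counts and helper names are made up) of why a fragmented space fails a
large request even though plenty of pages are free in total, while the same
amount of usage packed against the top leaves room:

#include <stdio.h>
#include <string.h>

#define PAGES 1024			/* model a small IOVA space */
static unsigned char used[PAGES];

/* Find 'len' contiguous free pages searching downward; -1 if none. */
static int alloc_topdown(int len)
{
	for (int hi = PAGES - 1; hi - len + 1 >= 0; hi--) {
		int lo = hi - len + 1, ok = 1;

		for (int i = lo; i <= hi; i++)
			if (used[i]) { ok = 0; break; }
		if (ok) {
			memset(&used[lo], 1, len);
			return lo;
		}
	}
	return -1;
}

int main(void)
{
	/* 128 pages in use, but scattered every 8 pages: no 256-page hole. */
	for (int i = 0; i < PAGES; i += 8)
		used[i] = 1;
	printf("fragmented: 256-page alloc -> %d\n", alloc_topdown(256));

	/* The same 128 pages packed at the top: the 256-page alloc fits. */
	memset(used, 0, sizeof(used));
	memset(&used[PAGES - 128], 1, 128);
	printf("packed:     256-page alloc -> %d\n", alloc_topdown(256));
	return 0;
}

With this patch, the tree is kept in the second state: allocations stay packed
against the top of the space instead of spreading out.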
commit d8bb3c731ff750afc568fa73d770eb1fa3e96c09
Author: Constantine Gavrilov <cgavrilov@...inidat.com>
Date: Tue Jul 1 11:19:08 2025 +0300
Allocate DMA addresses from top to bottom in IOVA domains.
The cached_node and cached32_node fields of the iova_domain structure
are used as the starting point for the address search only if the requested
DMA limit falls at or below the top of the cached node (so the search cannot
skip usable space between the limit and the cached entry), or if the requested
limit is exactly the 64-bit or 32-bit limit, respectively.
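A compressed model of that check (illustration only; the real code works on
rb_node/iova pairs in __get_start_rbnode() and falls back to iova_find_limit()
when no cache applies):

#include <stdbool.h>

/* A cached IOVA may serve as the starting point of the search when the
 * caller's limit is exactly the limit that cache serves, or when the
 * cached IOVA already reaches up to the limit, so that starting below it
 * cannot skip usable space between the limit and the cached entry. */
static bool cache_usable(unsigned long cached_pfn_hi,
			 unsigned long cache_limit_pfn,
			 unsigned long limit_pfn)
{
	return limit_pfn == cache_limit_pfn || cached_pfn_hi + 1 >= limit_pfn;
}

The patch applies this test to the 32-bit cache first (against dma_32bit_pfn),
then to the new middle cache (against middle_pfn_limit), and finally to the
top cache (against IOVA_ANCHOR).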
The cached_node and cached32_node are updated upon successful allocation
only if the search started from a node that lies at or above the currently
cached node and at or below the DMA limit, and the new allocation lies below
the currently cached node.
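The same rule as a simplified stand-alone predicate (hypothetical helper; in
the patch this logic lives in __cached_rbnode_insert_update(), and the middle
and 32-bit caches add further conditions on the limit and on the 32-bit
boundary):

#include <stdbool.h>

/* Move the cache to the new IOVA only when the search stayed within the
 * caller's limit, started at or above the currently cached IOVA, and the
 * new IOVA really lies below the cached one.  Otherwise a search that
 * started lower down (for example on behalf of a device with a smaller
 * DMA limit) would drag the cache away from the packed region at the top. */
static bool cache_should_update(unsigned long search_start_pfn_lo,
				unsigned long cached_pfn_lo,
				unsigned long new_pfn_lo,
				unsigned long limit_pfn)
{
	return limit_pfn >= search_start_pfn_lo &&
	       search_start_pfn_lo >= cached_pfn_lo &&
	       new_pfn_lo < cached_pfn_lo;
}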
For clarity, the cached_node field was renamed to cached_top_node.
To preserve the existing optimization for network stack behavior, where
network drivers can allocate more than 250K DMA buffers for their network
pools without using SG tables, we add the cached_middle_node and
middle_pfn_limit fields. Without them, the system locks up for minutes at
boot time while allocating the network pools.
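The cost can be seen with a back-of-the-envelope count (illustration only; it
assumes each pool-buffer allocation that cannot reuse a cache restarts its
downward walk at the limit and has to step over every IOVA already allocated
below it):

#include <stdio.h>

#define NBUFS 250000UL	/* order of magnitude of the network pools */

int main(void)
{
	unsigned long long no_cache = 0, with_cache = 0;

	for (unsigned long n = 0; n < NBUFS; n++) {
		no_cache += n;		/* walk over the n entries already present */
		with_cache += 1;	/* resume right below the cached node */
	}
	printf("walk steps without a middle cache: %llu\n", no_cache);
	printf("walk steps with a middle cache:    %llu\n", with_cache);
	return 0;
}

The quadratic blow-up in the first counter is consistent with the multi-minute
boot-time stall described above.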
This ensures contiguous allocations from the top to the bottom of the address
space, with holes only due to alignment or to lower DMA address limits of some
devices in the group. Altogether, this avoids fragmentation of the DMA address
space and ensures that large DMA ranges remain available.
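A minimal sketch of the intended end result (user-space illustration with
made-up sizes, not the kernel allocator): requests are packed downward from
the top, a device with a lower DMA limit leaves a hole above its allocation,
and later unrestricted requests keep packing against the group at the top:

#include <stdio.h>

#define TOP		0x100000UL	/* model: pfns in [0, TOP) */
#define MAX_RANGES	16

struct range { unsigned long lo, hi; };		/* inclusive pfn range */
static struct range allocated[MAX_RANGES];
static int nr_allocated;

/* Place 'size' pfns at the highest free position below 'limit' (exclusive),
 * i.e. the top-down packing behaviour the patch aims for. */
static unsigned long alloc_topdown(unsigned long size, unsigned long limit)
{
	unsigned long hi = (limit < TOP ? limit : TOP) - 1;

	while (hi + 1 >= size) {
		unsigned long lo = hi - size + 1;
		const struct range *blocker = NULL;
		int i;

		for (i = 0; i < nr_allocated; i++)
			if (lo <= allocated[i].hi && allocated[i].lo <= hi &&
			    (!blocker || allocated[i].lo < blocker->lo))
				blocker = &allocated[i];	/* lowest overlap */
		if (!blocker) {
			allocated[nr_allocated++] = (struct range){ lo, hi };
			return lo;
		}
		if (blocker->lo == 0)
			break;
		hi = blocker->lo - 1;		/* retry just below the blocker */
	}
	return (unsigned long)-1;		/* no room (not hit in this demo) */
}

int main(void)
{
	/* Two unrestricted (64-bit) allocations pack at the top. */
	printf("64-bit dev,  0x1000 pfns -> %#lx\n", alloc_topdown(0x1000, TOP));
	printf("64-bit dev,  0x0800 pfns -> %#lx\n", alloc_topdown(0x0800, TOP));
	/* A device limited to pfns below 0x80000 allocates lower down,
	 * leaving a hole between its range and the group at the top... */
	printf("limited dev, 0x1000 pfns -> %#lx\n", alloc_topdown(0x1000, 0x80000));
	/* ...but the next unrestricted request still packs against the top. */
	printf("64-bit dev,  0x2000 pfns -> %#lx\n", alloc_topdown(0x2000, TOP));
	return 0;
}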
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 0c436dd35404..09356d6065ef 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -47,7 +47,9 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
spin_lock_init(&iovad->iova_rbtree_lock);
iovad->rbroot = RB_ROOT;
- iovad->cached_node = &iovad->anchor.node;
+ iovad->cached_top_node = &iovad->anchor.node;
+ iovad->cached_middle_node = &iovad->anchor.node;
+ iovad->middle_pfn_limit = IOVA_ANCHOR;
iovad->cached32_node = &iovad->anchor.node;
iovad->granule = granule;
iovad->start_pfn = start_pfn;
@@ -58,22 +60,63 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
}
EXPORT_SYMBOL_GPL(init_iova_domain);
+static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn);
+
static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
+__get_start_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
{
- if (limit_pfn <= iovad->dma_32bit_pfn)
+ struct iova *cached = to_iova(iovad->cached32_node);
+ if (limit_pfn == iovad->dma_32bit_pfn || (cached->pfn_hi + 1) >= limit_pfn)
return iovad->cached32_node;
- return iovad->cached_node;
+ cached = to_iova(iovad->cached_middle_node);
+ if (limit_pfn == iovad->middle_pfn_limit || (cached->pfn_hi + 1) >= limit_pfn)
+ return iovad->cached_middle_node;
+
+ cached = to_iova(iovad->cached_top_node);
+ if (limit_pfn == IOVA_ANCHOR || (cached->pfn_hi + 1) >= limit_pfn)
+ return iovad->cached_top_node;
+
+ return iova_find_limit(iovad, limit_pfn);
}
static void
-__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
+__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new, struct iova *start_search, unsigned long limit_pfn)
{
- if (new->pfn_hi < iovad->dma_32bit_pfn)
- iovad->cached32_node = &new->node;
- else
- iovad->cached_node = &new->node;
+ /* Update the cache only if the search started from the cached node or above it.
+ * This way, we attempt to allocate from the top to the bottom, with holes due to
+ * alignment or the DMA address limit of individual devices in the group.
+ */
+ struct iova *cached;
+
+ /* update top node */
+ cached = to_iova(iovad->cached_top_node);
+ if (limit_pfn >= start_search->pfn_lo &&
+ start_search->pfn_lo >= cached->pfn_lo &&
+ new->pfn_lo < cached->pfn_lo)
+ iovad->cached_top_node = &new->node;
+
+ /* update middle node */
+ cached = to_iova(iovad->cached_middle_node);
+ if (limit_pfn >= start_search->pfn_lo &&
+ start_search->pfn_lo >= cached->pfn_lo &&
+ new->pfn_lo < cached->pfn_lo) {
+ iovad->cached_middle_node = &new->node;
+ if (limit_pfn != IOVA_ANCHOR && (limit_pfn > iovad->middle_pfn_limit ||
+ iovad->middle_pfn_limit == IOVA_ANCHOR))
+ iovad->middle_pfn_limit = limit_pfn;
+ } else if (limit_pfn != IOVA_ANCHOR) {
+ iovad->middle_pfn_limit = limit_pfn;
+ iovad->cached_middle_node = &new->node;
+ }
+
+ if (new->pfn_lo <= iovad->dma_32bit_pfn) {
+ cached = to_iova(iovad->cached32_node);
+ if (limit_pfn >= start_search->pfn_lo &&
+ start_search->pfn_lo >= cached->pfn_lo &&
+ new->pfn_lo < cached->pfn_lo)
+ iovad->cached32_node = &new->node;
+ }
}
static void
@@ -87,9 +130,13 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
free->pfn_lo >= cached_iova->pfn_lo))
iovad->cached32_node = rb_next(&free->node);
- cached_iova = to_iova(iovad->cached_node);
+ cached_iova = to_iova(iovad->cached_top_node);
if (free->pfn_lo >= cached_iova->pfn_lo)
- iovad->cached_node = rb_next(&free->node);
+ iovad->cached_top_node = rb_next(&free->node);
+
+ cached_iova = to_iova(iovad->cached_middle_node);
+ if (free->pfn_lo >= cached_iova->pfn_lo && free->pfn_lo < iovad->middle_pfn_limit)
+ iovad->cached_middle_node = rb_next(&free->node);
}
static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn)
@@ -161,8 +208,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, iova_align_t align)
{
- struct rb_node *curr, *prev;
- struct iova *curr_iova;
+ struct rb_node *curr, *prev, *start_search;
+ struct iova *curr_iova, *start_iova;
unsigned long flags;
unsigned long new_pfn, retry_pfn;
unsigned long align_mask;
@@ -179,8 +226,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
/* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
- curr = __get_cached_rbnode(iovad, limit_pfn);
- curr_iova = to_iova(curr);
+ curr = start_search = __get_start_rbnode(iovad, limit_pfn);
+ curr_iova = start_iova = to_iova(curr);
retry_pfn = curr_iova->pfn_hi;
retry:
@@ -193,11 +240,11 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
} while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn);
if (high_pfn < size || new_pfn < low_pfn) {
- if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
+ if (start_search != &iovad->anchor.node && low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
high_pfn = limit_pfn;
low_pfn = retry_pfn + 1;
- curr = iova_find_limit(iovad, limit_pfn);
- curr_iova = to_iova(curr);
+ curr = start_search = iova_find_limit(iovad, limit_pfn);
+ curr_iova = start_iova = to_iova(curr);
goto retry;
}
goto iova32_full;
@@ -209,7 +256,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
/* If we have 'prev', it's a valid place to start the insertion. */
iova_insert_rbtree(&iovad->rbroot, new, prev);
- __cached_rbnode_insert_update(iovad, new);
+ __cached_rbnode_insert_update(iovad, new, start_iova, limit_pfn);
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
return 0;
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 2800bdc203b1..0780a64e1149 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -26,14 +26,16 @@ struct iova_rcache;
/* holds all the iova translations for a domain */
struct iova_domain {
- spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
- struct rb_root rbroot; /* iova domain rbtree root */
- struct rb_node *cached_node; /* Save last alloced node */
- struct rb_node *cached32_node; /* Save last 32-bit alloced node */
- unsigned long granule; /* pfn granularity for this domain */
- unsigned long start_pfn; /* Lower limit for this domain */
- unsigned long dma_32bit_pfn;
- struct iova anchor; /* rbtree lookup anchor */
+ spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */
+ struct rb_root rbroot; /* iova domain rbtree root */
+ struct rb_node *cached_top_node; /* Save last alloced node from the top */
+ struct rb_node *cached_middle_node; /* Save last alloced node in the middle */
+ struct rb_node *cached32_node; /* Save last 32-bit alloced node */
+ unsigned long granule; /* pfn granularity for this domain */
+ unsigned long start_pfn; /* Lower limit for this domain */
+ unsigned long dma_32bit_pfn; /* 32-bit PFN limit, constant */
+ unsigned long middle_pfn_limit; /* cached_middle_node is for this limit */
+ struct iova anchor; /* rbtree lookup anchor */
struct iova_rcache *rcaches;
struct hlist_node cpuhp_dead;
--
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------