Message-ID: <CAEHaoC1mHQp62GMBofGJwKBkjgf8rQr_tUxwESKa7Ehd6_3nWA@mail.gmail.com>
Date: Sun, 3 Aug 2025 14:15:32 +0300
From: Constantine Gavrilov <cgavrilov@...inidat.com>
To: linux-kernel@...r.kernel.org, Marek Szyprowski <m.szyprowski@...sung.com>, 
	Robin Murphy <robin.murphy@....com>, iommu@...ts.linux.dev
Subject: [PATCH 6/8] Large DMA alloc/alloc DMA addresses from the top

This is patch 6 of 8 in the series that enables large IOMMU DMA
registrations. The entire work is available on the master branch of the
git@...hub.com:cgavrilov/linux.git repo.

This patch ensures that addresses in an IOMMU group are allocated from the
top of the address space down to the bottom. It fixes issues with the use of
the cached_node and cached32_node fields of the iova_domain structure that
resulted in fragmentation of the address space; a fragmented address space
can lead to failed allocations of large DMA ranges.
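
As a rough illustration of the intent, allocating from the top of the range
downward keeps the remaining free space contiguous at the bottom, so a later
large request can still be satisfied. The following is a minimal user-space
sketch, not kernel code and not part of this patch; all names in it are made
up for illustration:

#include <stdio.h>

static unsigned long next_free_top = 0x100000; /* top of the free space (exclusive) */
static const unsigned long start_pfn = 0x100;  /* bottom of the usable range */

/* Carve 'size' PFNs out of the top of the free space, honoring a
 * power-of-two alignment; returns the first PFN, or 0 on failure. */
static unsigned long alloc_top_down(unsigned long size, unsigned long align)
{
    unsigned long pfn;

    if (size == 0 || size > next_free_top - start_pfn)
        return 0;                  /* not enough room left */
    pfn = (next_free_top - size) & ~(align - 1);
    if (pfn < start_pfn)
        return 0;                  /* alignment pushed us below the floor */
    next_free_top = pfn;           /* free space stays contiguous below */
    return pfn;
}

int main(void)
{
    printf("64K PFNs at 0x%lx\n", alloc_top_down(0x10000, 0x1000));
    printf("16K PFNs at 0x%lx\n", alloc_top_down(0x4000, 0x4000));
    printf("remaining free space: [0x%lx, 0x%lx)\n", start_pfn, next_free_top);
    return 0;
}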

commit d8bb3c731ff750afc568fa73d770eb1fa3e96c09
Author: Constantine Gavrilov <cgavrilov@...inidat.com>
Date:   Tue Jul 1 11:19:08 2025 +0300

    Allocate DMA addresses from top to bottom in IOVA domains.

    The cached_node and cached32_node fields of the iova_domain structure
    are used as the starting point for the address search only if the cached
    node reaches up to the requested DMA limit or above it, or if the
    requested limit is the 64-bit or the 32-bit DMA limit, respectively.

    The cached_node and cached32_node are updated upon successful allocation
    only if the search started from a node that does not lie below the cached
    value and does not lie above the DMA limit.

    For clarity, the cached_node field was renamed to cached_top_node.

    To preserve the existing optimization for network stack behavior, where
    network drivers can allocate more than 250K DMA buffers for network pools
    without using SG tables, we add the cached_middle_node and
    middle_pfn_limit fields. Without them, the system locks up for minutes at
    boot time while trying to allocate the network pools.

    This ensures contiguous allocations from the top to the bottom, with holes
    only due to alignment or to lower DMA address limits of some devices in
    the group. Altogether, this avoids fragmentation of the DMA address space
    and ensures that large DMA ranges remain available.
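
For reference, the start-node selection added by the first hunks below can be
summarized as follows. This is a simplified user-space sketch with stand-in
types; the *_stub names, pick_start_node, and IOVA_ANCHOR_STUB are made up
for illustration, and the rbtree fallback is omitted:

#include <stddef.h>
#include <stdio.h>

struct iova_stub { unsigned long pfn_lo, pfn_hi; };

struct iovad_stub {
    struct iova_stub *cached32_node, *cached_middle_node, *cached_top_node;
    unsigned long dma_32bit_pfn, middle_pfn_limit;
};

#define IOVA_ANCHOR_STUB (~0UL)

/* Pick the node to start the downward search from: use a cached node only
 * when the requested limit matches that cache's limit exactly, or when the
 * cached node already reaches up to the limit, so no free space between the
 * cached node and the limit is skipped; otherwise fall back to a tree
 * lookup (iova_find_limit() in the real code, omitted here). */
static struct iova_stub *
pick_start_node(struct iovad_stub *iovad, unsigned long limit_pfn)
{
    if (limit_pfn == iovad->dma_32bit_pfn ||
        iovad->cached32_node->pfn_hi + 1 >= limit_pfn)
        return iovad->cached32_node;

    if (limit_pfn == iovad->middle_pfn_limit ||
        iovad->cached_middle_node->pfn_hi + 1 >= limit_pfn)
        return iovad->cached_middle_node;

    if (limit_pfn == IOVA_ANCHOR_STUB ||
        iovad->cached_top_node->pfn_hi + 1 >= limit_pfn)
        return iovad->cached_top_node;

    return NULL; /* caller would search the rbtree for limit_pfn instead */
}

int main(void)
{
    struct iova_stub anchor = { ~0UL, ~0UL };  /* mimics the anchor node */
    struct iovad_stub iovad = {
        .cached32_node = &anchor, .cached_middle_node = &anchor,
        .cached_top_node = &anchor,
        .dma_32bit_pfn = 0xFFFFF, .middle_pfn_limit = ~0UL,
    };
    struct iova_stub *start = pick_start_node(&iovad, 0xFFFFF);

    printf("32-bit request starts from the 32-bit cache: %d\n",
           start == iovad.cached32_node);
    return 0;
}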

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 0c436dd35404..09356d6065ef 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -47,7 +47,9 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,

     spin_lock_init(&iovad->iova_rbtree_lock);
     iovad->rbroot = RB_ROOT;
-    iovad->cached_node = &iovad->anchor.node;
+    iovad->cached_top_node = &iovad->anchor.node;
+    iovad->cached_middle_node = &iovad->anchor.node;
+    iovad->middle_pfn_limit = IOVA_ANCHOR;
     iovad->cached32_node = &iovad->anchor.node;
     iovad->granule = granule;
     iovad->start_pfn = start_pfn;
@@ -58,22 +60,63 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);

+static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn);
+
 static struct rb_node *
-__get_cached_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
+__get_start_rbnode(struct iova_domain *iovad, unsigned long limit_pfn)
 {
-    if (limit_pfn <= iovad->dma_32bit_pfn)
+    struct iova *cached = to_iova(iovad->cached32_node);
+    if (limit_pfn == iovad->dma_32bit_pfn || (cached->pfn_hi + 1) >= limit_pfn)
         return iovad->cached32_node;

-    return iovad->cached_node;
+    cached = to_iova(iovad->cached_middle_node);
+    if (limit_pfn == iovad->middle_pfn_limit || (cached->pfn_hi + 1) >= limit_pfn)
+        return iovad->cached_middle_node;
+
+    cached = to_iova(iovad->cached_top_node);
+    if (limit_pfn == IOVA_ANCHOR || (cached->pfn_hi + 1) >= limit_pfn)
+        return iovad->cached_top_node;
+
+    return iova_find_limit(iovad, limit_pfn);
 }

 static void
-__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
+__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new, struct iova *start_search, unsigned long limit_pfn)
 {
-    if (new->pfn_hi < iovad->dma_32bit_pfn)
-        iovad->cached32_node = &new->node;
-    else
-        iovad->cached_node = &new->node;
+    /* Update the cached nodes only if the search started from the cached node or above it.
+     * This way, we attempt to allocate from the top to the bottom, with holes due to alignment
+     * or the DMA address limit of individual devices in the group.
+     */
+    struct iova *cached;
+
+    /* update top node */
+    cached = to_iova(iovad->cached_top_node);
+    if (limit_pfn >= start_search->pfn_lo &&
+            start_search->pfn_lo >= cached->pfn_lo &&
+            new->pfn_lo < cached->pfn_lo)
+        iovad->cached_top_node = &new->node;
+
+    /* update middle node */
+    cached = to_iova(iovad->cached_middle_node);
+    if (limit_pfn >= start_search->pfn_lo &&
+            start_search->pfn_lo >= cached->pfn_lo &&
+             new->pfn_lo < cached->pfn_lo) {
+        iovad->cached_middle_node = &new->node;
+        if (limit_pfn != IOVA_ANCHOR && (limit_pfn > iovad->middle_pfn_limit ||
+                iovad->middle_pfn_limit == IOVA_ANCHOR))
+            iovad->middle_pfn_limit = limit_pfn;
+    } else if (limit_pfn != IOVA_ANCHOR) {
+        iovad->middle_pfn_limit = limit_pfn;
+        iovad->cached_middle_node = &new->node;
+    }
+
+    if (new->pfn_lo <= iovad->dma_32bit_pfn) {
+        cached = to_iova(iovad->cached32_node);
+        if (limit_pfn >= start_search->pfn_lo &&
+            start_search->pfn_lo >= cached->pfn_lo &&
+             new->pfn_lo < cached->pfn_lo)
+            iovad->cached32_node = &new->node;
+    }
 }

 static void
@@ -87,9 +130,13 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
          free->pfn_lo >= cached_iova->pfn_lo))
         iovad->cached32_node = rb_next(&free->node);

-    cached_iova = to_iova(iovad->cached_node);
+    cached_iova = to_iova(iovad->cached_top_node);
     if (free->pfn_lo >= cached_iova->pfn_lo)
-        iovad->cached_node = rb_next(&free->node);
+        iovad->cached_top_node = rb_next(&free->node);
+
+    cached_iova = to_iova(iovad->cached_middle_node);
+    if (free->pfn_lo >= cached_iova->pfn_lo && free->pfn_lo < iovad->middle_pfn_limit)
+        iovad->cached_middle_node = rb_next(&free->node);
 }

 static struct rb_node *iova_find_limit(struct iova_domain *iovad, unsigned long limit_pfn)
@@ -161,8 +208,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
         unsigned long size, unsigned long limit_pfn,
             struct iova *new, iova_align_t align)
 {
-    struct rb_node *curr, *prev;
-    struct iova *curr_iova;
+    struct rb_node *curr, *prev, *start_search;
+    struct iova *curr_iova, *start_iova;
     unsigned long flags;
     unsigned long new_pfn, retry_pfn;
     unsigned long align_mask;
@@ -179,8 +226,8 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
     /* Walk the tree backwards */
     spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);

-    curr = __get_cached_rbnode(iovad, limit_pfn);
-    curr_iova = to_iova(curr);
+    curr = start_search = __get_start_rbnode(iovad, limit_pfn);
+    curr_iova = start_iova = to_iova(curr);
     retry_pfn = curr_iova->pfn_hi;

 retry:
@@ -193,11 +240,11 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
     } while (curr && new_pfn <= curr_iova->pfn_hi && new_pfn >= low_pfn);

     if (high_pfn < size || new_pfn < low_pfn) {
-        if (low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
+        if (start_search != &iovad->anchor.node && low_pfn == iovad->start_pfn && retry_pfn < limit_pfn) {
             high_pfn = limit_pfn;
             low_pfn = retry_pfn + 1;
-            curr = iova_find_limit(iovad, limit_pfn);
-            curr_iova = to_iova(curr);
+            curr = start_search = iova_find_limit(iovad, limit_pfn);
+            curr_iova = start_iova = to_iova(curr);
             goto retry;
         }
         goto iova32_full;
@@ -209,7 +256,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,

     /* If we have 'prev', it's a valid place to start the insertion. */
     iova_insert_rbtree(&iovad->rbroot, new, prev);
-    __cached_rbnode_insert_update(iovad, new);
+    __cached_rbnode_insert_update(iovad, new, start_iova, limit_pfn);

     spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
     return 0;
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 2800bdc203b1..0780a64e1149 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -26,14 +26,16 @@ struct iova_rcache;

 /* holds all the iova translations for a domain */
 struct iova_domain {
-    spinlock_t    iova_rbtree_lock; /* Lock to protect update of rbtree */
-    struct rb_root    rbroot;        /* iova domain rbtree root */
-    struct rb_node    *cached_node;    /* Save last alloced node */
-    struct rb_node    *cached32_node; /* Save last 32-bit alloced node */
-    unsigned long    granule;    /* pfn granularity for this domain */
-    unsigned long    start_pfn;    /* Lower limit for this domain */
-    unsigned long    dma_32bit_pfn;
-    struct iova    anchor;        /* rbtree lookup anchor */
+    spinlock_t    iova_rbtree_lock;    /* Lock to protect update of rbtree */
+    struct rb_root    rbroot;              /* iova domain rbtree root */
+    struct rb_node    *cached_top_node;    /* Save last alloced node from the top */
+    struct rb_node    *cached_middle_node; /* Save last alloced node in the middle */
+    struct rb_node    *cached32_node;      /* Save last 32-bit alloced node */
+    unsigned long    granule;             /* pfn granularity for this domain */
+    unsigned long    start_pfn;           /* Lower limit for this domain */
+    unsigned long    dma_32bit_pfn;       /* 32-bit PFN limit, constant */
+    unsigned long   middle_pfn_limit;    /* cached_middle_node is for this limit */
+    struct iova    anchor;              /* rbtree lookup anchor */

     struct iova_rcache    *rcaches;
     struct hlist_node    cpuhp_dead;


-- 
----------------------------------------
Constantine Gavrilov
System Architect and Platform Engineer
Infinidat
----------------------------------------
