Message-ID: <537377a5-71e0-7705-d881-ac374aa7eab8@linux.alibaba.com>
Date: Mon, 14 Aug 2023 20:15:56 +0800
From: Xianting Tian <xianting.tian@...ux.alibaba.com>
To: Jie Ji <jijie.ji@...ux.alibaba.com>, dwmw2@...radead.org,
baolu.lu@...ux.intel.com, joro@...tes.org, will@...nel.org,
robin.murphy@....com
Cc: iommu@...ts.linux.dev, linux-kernel@...r.kernel.org,
kaijieguo@...ux.alibaba.com, daishengdong@...h.net
Subject: Re: [PATCH] iommu/vt-d: Atomic breakdown of IOPT into finer
granularity
This patch works for me, you can add:
Tested-by: Xianting Tian <xianting.tian@...ux.alibaba.com>
On 2023/8/14 8:10 PM, Jie Ji wrote:
> With the addition of IOMMU support for IO page faults, it's now possible
> to unpin memory that is under DMA remapping. However, the IOMMU's lack
> of support for unmapping a subrange of the I/O page table (IOPT) can
> lead to problems.
>
> For instance, a virtual machine can establish 2M/1G IOPT mappings for
> better performance, while the host system enables swap and attempts to
> swap out some 4K pages. Unfortunately, unmapping a subrange of the
> large-page mapping drives the IOMMU page walk to an error level and
> finally causes a kernel crash.
>
> This patch supports splitting the page table to a finer granularity and
> atomically switching to it when unmapping a subrange of a large-page
> mapping. This is much better than changing the IOPT with an
> unmap-then-map sequence, because during the interval the whole unmapped
> address space could trigger IO page faults, which is unacceptable.
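
A side note for reviewers skimming the diff: the core trick is to
pre-populate a child table covering the same range and then publish it
with a single cmpxchg64, so the hardware never sees the range unmapped.
A minimal standalone sketch of that idea (hypothetical helper names,
not the actual VT-d code below):

	/*
	 * Sketch only: swap one huge-page PTE for a freshly built
	 * child table in a single atomic store, so a concurrent
	 * IOMMU walk sees either the old huge page or the new table.
	 */
	static int split_huge_pte_sketch(u64 *pte, int child_level)
	{
		u64 old = READ_ONCE(*pte);	/* snapshot the huge PTE */
		u64 *child;
		int i;

		child = alloc_child_table();	/* hypothetical allocator */
		if (!child)
			return -ENOMEM;

		/* Pre-populate all 512 entries to cover the same range */
		for (i = 0; i < 512; i++)
			child[i] = make_leaf_pte(old, i, child_level);

		/* Publish atomically; back out if the PTE moved under us */
		if (cmpxchg64(pte, old, make_table_pte(child)) != old) {
			free_child_table(child);
			return -EBUSY;
		}
		return 0;
	}

(alloc_child_table/make_leaf_pte/make_table_pte are stand-ins; the real
patch below uses alloc_pgtable_page() and builds the ptevals inline.)
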
>
> Signed-off-by: Jie Ji <jijie.ji@...ux.alibaba.com>
> Reviewed-by: Kaijie Guo <kaijieguo@...ux.alibaba.com>
> ---
> drivers/iommu/intel/iommu.c | 97 +++++++++++++++++++++++++++++--------
> drivers/iommu/intel/iommu.h | 1 +
> 2 files changed, 78 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 5c8c5cdc36cf..b1fe9d07c47b 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -1114,12 +1114,59 @@ static void dma_pte_list_pagetables(struct dmar_domain *domain,
> } while (!first_pte_in_page(pte));
> }
>
> -static void dma_pte_clear_level(struct dmar_domain *domain, int level,
> - struct dma_pte *pte, unsigned long pfn,
> - unsigned long start_pfn, unsigned long last_pfn,
> - struct list_head *freelist)
> +static int switch_to_small_page(struct dmar_domain *domain, int level,
> + struct dma_pte *pte)
> +{
> + u64 pteval, old_pteval, pfn;
> + void *new_pte = NULL;
> + u64 *ptep = NULL;
> + u64 attr;
> + int idx = 0;
> +
> + /* Inherit attr from old pte */
> + attr = pte->val & VTD_ATTR_MASK;
> + if (level == 1)
> + attr &= ~DMA_PTE_LARGE_PAGE;
> +
> + /* Get old pte entry in case the pte changes during the switch */
> + old_pteval = pte->val;
> + pfn = dma_pte_addr(pte) >> VTD_PAGE_SHIFT;
> +
> + new_pte = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
> + if (unlikely(!new_pte))
> + return -ENOMEM;
> +
> + ptep = (u64 *)new_pte;
> + for (idx = 0; idx < (1 << LEVEL_STRIDE); idx++) {
> + pteval = (pfn & level_mask(level)) << VTD_PAGE_SHIFT | attr;
> + ptep[idx] = pteval;
> + pfn += level_size(level);
> + }
> +
> + pteval = ((unsigned long)virt_to_dma_pfn(new_pte) << VTD_PAGE_SHIFT);
> + /* Snoop bit in page directory entry should be zero (reserved field) */
> + pteval |= DMA_PTE_WRITE | DMA_PTE_READ;
> +
> + if (cmpxchg64(&pte->val, old_pteval, pteval) != old_pteval) {
> + pr_err("%s: original pte changed by others, switch fail\n", __func__);
> + free_pgtable_page(new_pte);
> + return -EBUSY;
> + }
> +
> + domain_flush_cache(domain, pte, sizeof(*pte));
> +
> + pr_debug("%s: pte: %#llx->%#llx\n", __func__, old_pteval, pte->val);
> +
> + return 0;
> +}
> +
> +static int dma_pte_clear_level(struct dmar_domain *domain, int level,
> + struct dma_pte *pte, unsigned long pfn,
> + unsigned long start_pfn, unsigned long last_pfn,
> + struct list_head *freelist)
> {
> struct dma_pte *first_pte = NULL, *last_pte = NULL;
> + int ret;
>
> pfn = max(start_pfn, pfn);
> pte = &pte[pfn_level_offset(pfn, level)];
> @@ -1143,6 +1190,17 @@ static void dma_pte_clear_level(struct dmar_domain *domain, int level,
> first_pte = pte;
> last_pte = pte;
> } else if (level > 1) {
> + /* Switch to smaller pages when unmapping a subrange */
> + if (dma_pte_superpage(pte) &&
> + ((last_pfn - start_pfn + 1) < level_size(level))) {
> + ret = switch_to_small_page(domain, level - 1, pte);
> + if (ret) {
> + pr_err("%s: switch to smaller page fail, ret = %d",
> + __func__, ret);
> + return ret;
> + }
> + }
> +
> /* Recurse down into a level that isn't *entirely* obsolete */
> dma_pte_clear_level(domain, level - 1,
> phys_to_virt(dma_pte_addr(pte)),
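
To make the trigger condition above concrete: level_size() counts 4K
pages, so a level-2 (2M) superpage covers 512 pfns and a level-3 (1G)
superpage covers 262144. A quick userspace sanity check of that
arithmetic (not kernel code; LEVEL_STRIDE assumed to be 9 as in VT-d):

	#include <stdio.h>

	#define LEVEL_STRIDE 9

	/* size of one entry at 'level', in 4K pages */
	static unsigned long level_size(int level)
	{
		return 1UL << ((level - 1) * LEVEL_STRIDE);
	}

	int main(void)
	{
		printf("level 2: %lu pages (2M)\n", level_size(2));  /* 512 */
		printf("level 3: %lu pages (1G)\n", level_size(3));  /* 262144 */
		/* unmapping one 4K page: 1 < 512, so the 2M PTE is split */
		return 0;
	}

So unmapping anything smaller than the full superpage takes the new
split path instead of clearing the whole entry.
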
> @@ -1156,21 +1214,27 @@ static void dma_pte_clear_level(struct dmar_domain *domain, int level,
> if (first_pte)
> domain_flush_cache(domain, first_pte,
> (void *)++last_pte - (void *)first_pte);
> +
> + return 0;
> }
>
> /* We can't just free the pages because the IOMMU may still be walking
> the page tables, and may have cached the intermediate levels. The
> pages can only be freed after the IOTLB flush has been done. */
> -static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
> - unsigned long last_pfn, struct list_head *freelist)
> +static int domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
> + unsigned long last_pfn, struct list_head *freelist)
> {
> + int ret;
> +
> if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
> WARN_ON(start_pfn > last_pfn))
> - return;
> + return -EINVAL;
>
> /* we don't need lock here; nobody else touches the iova range */
> - dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
> - domain->pgd, 0, start_pfn, last_pfn, freelist);
> + ret = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
> + domain->pgd, 0, start_pfn, last_pfn, freelist);
> + if (ret)
> + return ret;
>
> /* free pgd */
> if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
> @@ -1178,6 +1242,8 @@ static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
> list_add_tail(&pgd_page->lru, freelist);
> domain->pgd = NULL;
> }
> +
> + return 0;
> }
>
> /* iommu handling */
> @@ -4219,21 +4285,12 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain,
> {
> struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> unsigned long start_pfn, last_pfn;
> - int level = 0;
> -
> - /* Cope with horrid API which requires us to unmap more than the
> - size argument if it happens to be a large-page mapping. */
> - if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
> - &level, GFP_ATOMIC)))
> - return 0;
> -
> - if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
> - size = VTD_PAGE_SIZE << level_to_offset_bits(level);
>
> start_pfn = iova >> VTD_PAGE_SHIFT;
> last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
>
> - domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
> + if (domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist))
> + return 0;
>
> if (dmar_domain->max_addr == iova + size)
> dmar_domain->max_addr = iova;
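
The caller-visible change here is worth spelling out. If I read the old
code right, an unmap of a 4K subrange inside a 2M mapping was silently
widened to the whole 2M. With the split in place, something like this
(sketch against the generic IOMMU API; iova assumed to have been mapped
earlier as a 2M superpage) should now unmap exactly what was asked:

	/* e.g. iommu_map(domain, iova, paddr, SZ_2M,
	 *               IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
	 */
	size_t unmapped = iommu_unmap(domain, iova + SZ_4K, SZ_4K);
	/* before: the whole 2M superpage was torn down
	 * after:  the 2M PTE is split and only this 4K page goes away
	 */
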
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 1c5e1d88862b..b4ccf134735f 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -33,6 +33,7 @@
> #define VTD_PAGE_SIZE (1UL << VTD_PAGE_SHIFT)
> #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT)
> #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
> +#define VTD_ATTR_MASK (~VTD_PAGE_MASK)
>
> #define VTD_STRIDE_SHIFT (9)
> #define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT)