Message-Id: <20190923122454.9888-3-baolu.lu@linux.intel.com>
Date: Mon, 23 Sep 2019 20:24:52 +0800
From: Lu Baolu <baolu.lu@...ux.intel.com>
To: Joerg Roedel <joro@...tes.org>,
David Woodhouse <dwmw2@...radead.org>,
Alex Williamson <alex.williamson@...hat.com>
Cc: ashok.raj@...el.com, sanjay.k.kumar@...el.com,
jacob.jun.pan@...ux.intel.com, kevin.tian@...el.com,
yi.l.liu@...el.com, yi.y.sun@...el.com,
iommu@...ts.linux-foundation.org, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org, Lu Baolu <baolu.lu@...ux.intel.com>,
Yi Sun <yi.y.sun@...ux.intel.com>
Subject: [RFC PATCH 2/4] iommu/vt-d: Add first level page table interfaces
This adds functions to manipulate first-level page tables, which can
be used by a scalable-mode capable IOMMU unit.
intel_mmmap_range(domain, addr, end, phys_addr, prot)
- Map the IOVA range [addr, end) to the physical memory starting
at @phys_addr with the @prot permissions.
intel_mmunmap_range(domain, addr, end)
- Tear down the mapping of the IOVA range [addr, end). A list of
pages is returned; the caller frees it after flushing the IOTLB.
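A minimal usage sketch of the two interfaces (illustrative only; the
domain, iova, size and paddr variables, as well as the IOTLB flush and
the freeing of the returned page list, are assumed to come from the
caller's context and are not part of this patch):

    struct page *freelist;
    int ret;

    /* Map [iova, iova + size) to paddr with read/write permission. */
    ret = intel_mmmap_range(domain, iova, iova + size, paddr,
                            DMA_PTE_READ | DMA_PTE_WRITE);
    if (ret)
        return ret;

    /* ... device DMA through the first level page table ... */

    /* Tear down the mapping; emptied page table pages are collected. */
    freelist = intel_mmunmap_range(domain, iova, iova + size);

    /* Flush the IOTLB for this range, then free the page list. */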
Cc: Ashok Raj <ashok.raj@...el.com>
Cc: Jacob Pan <jacob.jun.pan@...ux.intel.com>
Cc: Kevin Tian <kevin.tian@...el.com>
Cc: Liu Yi L <yi.l.liu@...el.com>
Cc: Yi Sun <yi.y.sun@...ux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@...ux.intel.com>
---
drivers/iommu/Makefile | 2 +-
drivers/iommu/intel-pgtable.c | 342 +++++++++++++++++++++++++++++
include/linux/intel-iommu.h | 24 +-
include/trace/events/intel_iommu.h | 60 +++++
4 files changed, 426 insertions(+), 2 deletions(-)
create mode 100644 drivers/iommu/intel-pgtable.c
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 4f405f926e73..dc550e14cc58 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o
obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
obj-$(CONFIG_DMAR_TABLE) += dmar.o
obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o
+obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o intel-pgtable.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o
obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/intel-pgtable.c b/drivers/iommu/intel-pgtable.c
new file mode 100644
index 000000000000..8e95978cd381
--- /dev/null
+++ b/drivers/iommu/intel-pgtable.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * intel-pgtable.c - Intel IOMMU page table manipulation library
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@...ux.intel.com>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/intel-iommu.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <trace/events/intel_iommu.h>
+
+#ifdef CONFIG_X86
+/*
+ * mmmap: Map a range of IO virtual addresses to physical addresses.
+ */
+#define pgtable_populate(domain, nm) \
+do { \
+ void *__new = alloc_pgtable_page(domain->nid); \
+ if (!__new) \
+ return -ENOMEM; \
+ smp_wmb(); \
+ spin_lock(&(domain)->page_table_lock); \
+ if (nm ## _present(*nm)) { \
+ free_pgtable_page(__new); \
+ } else { \
+ set_##nm(nm, __##nm(__pa(__new) | _PAGE_TABLE)); \
+ domain_flush_cache(domain, nm, sizeof(nm##_t)); \
+ } \
+ spin_unlock(&(domain)->page_table_lock); \
+} while (0)
+
+static int
+mmmap_pte_range(struct dmar_domain *domain, pmd_t *pmd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ pte_t *pte, *first_pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ if (unlikely(pmd_none(*pmd)))
+ pgtable_populate(domain, pmd);
+
+ first_pte = pte = pte_offset_kernel(pmd, addr);
+
+ do {
+ set_pte(pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+ return 0;
+}
+
+static int
+mmmap_pmd_range(struct dmar_domain *domain, pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ unsigned long next;
+ pmd_t *pmd;
+
+ if (unlikely(pud_none(*pud)))
+ pgtable_populate(domain, pud);
+ pmd = pmd_offset(pud, addr);
+
+ phys_addr -= addr;
+ do {
+ next = pmd_addr_end(addr, end);
+ if (mmmap_pte_range(domain, pmd, addr, next,
+ phys_addr + addr, prot))
+ return -ENOMEM;
+ } while (pmd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int
+mmmap_pud_range(struct dmar_domain *domain, p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ unsigned long next;
+ pud_t *pud;
+
+ if (unlikely(p4d_none(*p4d)))
+ pgtable_populate(domain, p4d);
+
+ pud = pud_offset(p4d, addr);
+
+ phys_addr -= addr;
+ do {
+ next = pud_addr_end(addr, end);
+ if (mmmap_pmd_range(domain, pud, addr, next,
+ phys_addr + addr, prot))
+ return -ENOMEM;
+ } while (pud++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int
+mmmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ unsigned long next;
+ p4d_t *p4d;
+
+ if (cpu_feature_enabled(X86_FEATURE_LA57) && unlikely(pgd_none(*pgd)))
+ pgtable_populate(domain, pgd);
+
+ p4d = p4d_offset(pgd, addr);
+
+ phys_addr -= addr;
+ do {
+ next = p4d_addr_end(addr, end);
+ if (mmmap_pud_range(domain, p4d, addr, next,
+ phys_addr + addr, prot))
+ return -ENOMEM;
+ } while (p4d++, addr = next, addr != end);
+
+ return 0;
+}
+
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+ unsigned long next;
+ pgprot_t prot;
+ pgd_t *pgd;
+
+ trace_domain_mm_map(domain, addr, end, phys_addr);
+
+ /*
+ * There is no PAGE_KERNEL_WO for a pte entry, so use RW for a pte
+ * that requires write permission.
+ */
+ prot = dma_prot & DMA_PTE_WRITE ? PAGE_KERNEL : PAGE_KERNEL_RO;
+ BUG_ON(addr >= end);
+
+ phys_addr -= addr;
+ pgd = pgd_offset_pgd(domain->pgd, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (mmmap_p4d_range(domain, pgd, addr, next,
+ phys_addr + addr, prot))
+ return -ENOMEM;
+ } while (pgd++, addr = next, addr != end);
+
+ return 0;
+}
+
+/*
+ * mmunmap: Unmap an existing mapping between a range of IO virtual
+ * addresses and physical addresses.
+ */
+static struct page *
+mmunmap_pte_range(struct dmar_domain *domain, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct page *freelist, bool reclaim)
+{
+ int i;
+ unsigned long start;
+ pte_t *pte, *first_pte;
+
+ start = addr;
+ pte = pte_offset_kernel(pmd, addr);
+ first_pte = pte;
+ do {
+ set_pte(pte, __pte(0));
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+ /* Add page to free list if all entries are empty. */
+ if (reclaim) {
+ struct page *pte_page;
+
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte || !pte_none(pte[i]))
+ goto pte_out;
+
+ pte_page = pmd_page(*pmd);
+ pte_page->freelist = freelist;
+ freelist = pte_page;
+ pmd_clear(pmd);
+ domain_flush_cache(domain, pmd, sizeof(pmd_t));
+ }
+
+pte_out:
+ return freelist;
+}
+
+static struct page *
+mmunmap_pmd_range(struct dmar_domain *domain, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ struct page *freelist, bool reclaim)
+{
+ int i;
+ pmd_t *pmd;
+ unsigned long start, next;
+
+ start = addr;
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ freelist = mmunmap_pte_range(domain, pmd, addr, next,
+ freelist, reclaim);
+ } while (pmd++, addr = next, addr != end);
+
+ /* Add page to free list if all entries are empty. */
+ if (reclaim) {
+ struct page *pmd_page;
+
+ pmd = (pmd_t *)pud_page_vaddr(*pud);
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd || !pmd_none(pmd[i]))
+ goto pmd_out;
+
+ pmd_page = pud_page(*pud);
+ pmd_page->freelist = freelist;
+ freelist = pmd_page;
+ pud_clear(pud);
+ domain_flush_cache(domain, pud, sizeof(pud_t));
+ }
+
+pmd_out:
+ return freelist;
+}
+
+static struct page *
+mmunmap_pud_range(struct dmar_domain *domain, p4d_t *p4d,
+ unsigned long addr, unsigned long end,
+ struct page *freelist, bool reclaim)
+{
+ int i;
+ pud_t *pud;
+ unsigned long start, next;
+
+ start = addr;
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ freelist = mmunmap_pmd_range(domain, pud, addr, next,
+ freelist, reclaim);
+ } while (pud++, addr = next, addr != end);
+
+ /* Add page to free list if all entries are empty. */
+ if (reclaim) {
+ struct page *pud_page;
+
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ if (!pud || !pud_none(pud[i]))
+ goto pud_out;
+
+ pud_page = p4d_page(*p4d);
+ pud_page->freelist = freelist;
+ freelist = pud_page;
+ p4d_clear(p4d);
+ domain_flush_cache(domain, p4d, sizeof(p4d_t));
+ }
+
+pud_out:
+ return freelist;
+}
+
+static struct page *
+mmunmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ struct page *freelist, bool reclaim)
+{
+ p4d_t *p4d;
+ unsigned long start, next;
+
+ start = addr;
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none_or_clear_bad(p4d))
+ continue;
+ freelist = mmunmap_pud_range(domain, p4d, addr, next,
+ freelist, reclaim);
+ } while (p4d++, addr = next, addr != end);
+
+ /* Add page to free list if all entries are empty. */
+ if (cpu_feature_enabled(X86_FEATURE_LA57) && reclaim) {
+ struct page *p4d_page;
+ int i;
+
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ for (i = 0; i < PTRS_PER_P4D; i++)
+ if (!p4d || !p4d_none(p4d[i]))
+ goto p4d_out;
+
+ p4d_page = pgd_page(*pgd);
+ p4d_page->freelist = freelist;
+ freelist = p4d_page;
+ pgd_clear(pgd);
+ domain_flush_cache(domain, pgd, sizeof(pgd_t));
+ }
+
+p4d_out:
+ return freelist;
+}
+
+struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+ unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ struct page *freelist = NULL;
+
+ trace_domain_mm_unmap(domain, addr, end);
+
+ BUG_ON(addr >= end);
+ pgd = pgd_offset_pgd(domain->pgd, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ freelist = mmunmap_p4d_range(domain, pgd, addr, next,
+ freelist, !addr);
+ } while (pgd++, addr = next, addr != end);
+
+ return freelist;
+}
+#endif /* CONFIG_X86 */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 3ee694d4f361..044a91fa5431 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -489,7 +489,8 @@ struct dmar_domain {
struct list_head auxd; /* link to device's auxiliary list */
struct iova_domain iovad; /* iova's that belong to this domain */
- struct dma_pte *pgd; /* virtual address */
+ void *pgd; /* virtual address */
+ spinlock_t page_table_lock; /* Protects page tables */
int gaw; /* max guest address width */
/* adjusted guest address width, 0 is level 2 30-bit */
@@ -662,6 +663,27 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info,
void iommu_flush_write_buffer(struct intel_iommu *iommu);
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
+#ifdef CONFIG_X86
+int intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, int dma_prot);
+struct page *intel_mmunmap_range(struct dmar_domain *domain,
+ unsigned long addr, unsigned long end);
+#else
+static inline int
+intel_mmmap_range(struct dmar_domain *domain, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+ return -ENODEV;
+}
+
+static inline struct page *
+intel_mmunmap_range(struct dmar_domain *domain,
+ unsigned long addr, unsigned long end)
+{
+ return NULL;
+}
+#endif
+
#ifdef CONFIG_INTEL_IOMMU_SVM
int intel_svm_init(struct intel_iommu *iommu);
extern int intel_svm_enable_prq(struct intel_iommu *iommu);
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
index 54e61d456cdf..e8c95290fd13 100644
--- a/include/trace/events/intel_iommu.h
+++ b/include/trace/events/intel_iommu.h
@@ -99,6 +99,66 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single,
TP_ARGS(dev, dev_addr, size)
);
+DECLARE_EVENT_CLASS(domain_map,
+ TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr),
+
+ TP_ARGS(domain, addr, end, phys_addr),
+
+ TP_STRUCT__entry(
+ __field(struct dmar_domain *, domain)
+ __field(unsigned long, addr)
+ __field(unsigned long, end)
+ __field(phys_addr_t, phys_addr)
+ ),
+
+ TP_fast_assign(
+ __entry->domain = domain;
+ __entry->addr = addr;
+ __entry->end = end;
+ __entry->phys_addr = phys_addr;
+ ),
+
+ TP_printk("domain=%p addr=0x%lx end=0x%lx phys_addr=0x%llx",
+ __entry->domain, __entry->addr, __entry->end,
+ (unsigned long long)__entry->phys_addr)
+);
+
+DEFINE_EVENT(domain_map, domain_mm_map,
+ TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr),
+
+ TP_ARGS(domain, addr, end, phys_addr)
+);
+
+DECLARE_EVENT_CLASS(domain_unmap,
+ TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+ unsigned long end),
+
+ TP_ARGS(domain, addr, end),
+
+ TP_STRUCT__entry(
+ __field(struct dmar_domain *, domain)
+ __field(unsigned long, addr)
+ __field(unsigned long, end)
+ ),
+
+ TP_fast_assign(
+ __entry->domain = domain;
+ __entry->addr = addr;
+ __entry->end = end;
+ ),
+
+ TP_printk("domain=%p addr=0x%lx end=0x%lx",
+ __entry->domain, __entry->addr, __entry->end)
+);
+
+DEFINE_EVENT(domain_unmap, domain_mm_unmap,
+ TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+ unsigned long end),
+
+ TP_ARGS(domain, addr, end)
+);
#endif /* _TRACE_INTEL_IOMMU_H */
/* This part must be outside protection */
--
2.17.1