[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <715D42877B251141A38726ABF5CABF2C01A300549D@pdsmsx503.ccr.corp.intel.com>
Date: Thu, 16 Apr 2009 10:13:24 +0800
From: "Han, Weidong" <weidong.han@...el.com>
To: "Yu, Fenghua" <fenghua.yu@...el.com>,
David Woodhouse <dwmw2@...radead.org>,
Ingo Molnar <mingo@...e.hu>,
Linus Torvalds <torvalds@...ux-foundation.org>
CC: LKML <linux-kernel@...r.kernel.org>,
IOMMU <iommu@...ts.linux-foundation.org>
Subject: RE: [PATCH] Intel IOMMU Pass Through Support
Acked-by: Weidong Han <weidong@...el.com>
Yu, Fenghua wrote:
> The patch adds kernel parameter intel_iommu=pt to set up pass through
> mode in
> context mapping entry. This disables DMAR in linux kernel; but KVM
> still runs on
> VT-d and interrupt remapping still works.
>
> In this mode, kernel uses swiotlb for DMA API functions but other VT-d
> functionalities are enabled for KVM. KVM always uses multi level
> translation
> page table in VT-d. By default, pass through mode is disabled in
> kernel.
>
> This is useful when people don't want to enable VT-d DMAR in kernel
> but still
> want to use KVM and interrupt remapping for reasons like DMAR
> performance
> concern or debug purpose.
>
> Thanks.
>
> -Fenghua
>
> Signed-off-by: Fenghua Yu <fenghua.yu@...el.com>
>
> ---
>
> Documentation/kernel-parameters.txt | 5
> arch/ia64/include/asm/iommu.h | 1
> arch/ia64/kernel/pci-swiotlb.c | 2
> arch/x86/include/asm/iommu.h | 1
> arch/x86/kernel/pci-swiotlb.c | 3
> drivers/pci/dmar.c | 9 +
> drivers/pci/intel-iommu.c | 187
> ++++++++++++++++++++++++++---------- include/linux/dma_remapping.h
> | 8 + include/linux/intel-iommu.h | 2
> 9 files changed, 167 insertions(+), 51 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt
> b/Documentation/kernel-parameters.txt
> index 6172e43..5594cdb 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -915,6 +915,11 @@ and is between 256 and 4096 characters. It is
> defined in the file With this option on every unmap_single
> operation will result in a hardware IOTLB flush operation as
> opposed to batching them for performance.
> + pt [Default no Pass Through]
> + This option enables Pass Through in context mapping if
> + Pass Through is supported in hardware. With this option
> + DMAR is disabled in kernel and kernel uses swiotlb, but
> + KVM can still use VT-d IOTLB hardware.
>
> inttest= [IA64]
>
> diff --git a/arch/ia64/include/asm/iommu.h
> b/arch/ia64/include/asm/iommu.h
> index 0490794..37d41ca 100644
> --- a/arch/ia64/include/asm/iommu.h
> +++ b/arch/ia64/include/asm/iommu.h
> @@ -9,6 +9,7 @@ extern void pci_iommu_shutdown(void);
> extern void no_iommu_init(void);
> extern int force_iommu, no_iommu;
> extern int iommu_detected;
> +extern int iommu_pass_through;
> extern void iommu_dma_init(void);
> extern void machvec_init(const char *name);
>
> diff --git a/arch/ia64/kernel/pci-swiotlb.c
> b/arch/ia64/kernel/pci-swiotlb.c
> index 285aae8..223abb1 100644
> --- a/arch/ia64/kernel/pci-swiotlb.c
> +++ b/arch/ia64/kernel/pci-swiotlb.c
> @@ -46,7 +46,7 @@ void __init swiotlb_dma_init(void)
>
> void __init pci_swiotlb_init(void)
> {
> - if (!iommu_detected) {
> + if (!iommu_detected || iommu_pass_through) {
> #ifdef CONFIG_IA64_GENERIC
> swiotlb = 1;
> printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
> diff --git a/arch/x86/include/asm/iommu.h
> b/arch/x86/include/asm/iommu.h
> index af326a2..fd6d21b 100644
> --- a/arch/x86/include/asm/iommu.h
> +++ b/arch/x86/include/asm/iommu.h
> @@ -6,6 +6,7 @@ extern void no_iommu_init(void);
> extern struct dma_map_ops nommu_dma_ops;
> extern int force_iommu, no_iommu;
> extern int iommu_detected;
> +extern int iommu_pass_through;
>
> /* 10 seconds */
> #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
> diff --git a/arch/x86/kernel/pci-swiotlb.c
> b/arch/x86/kernel/pci-swiotlb.c
> index 34f12e9..42a0eb1 100644
> --- a/arch/x86/kernel/pci-swiotlb.c
> +++ b/arch/x86/kernel/pci-swiotlb.c
> @@ -71,7 +71,8 @@ void __init pci_swiotlb_init(void)
> {
> /* don't initialize swiotlb if iommu=off (no_iommu=1) */
> #ifdef CONFIG_X86_64
> - if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
> + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
> + iommu_pass_through)
> swiotlb = 1;
> #endif
> if (swiotlb_force)
> diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
> index fa3a113..1ef1a19 100644
> --- a/drivers/pci/dmar.c
> +++ b/drivers/pci/dmar.c
> @@ -515,6 +515,7 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
> u32 ver;
> static int iommu_allocated = 0;
> int agaw = 0;
> + int msagaw = 0;
>
> iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> if (!iommu)
> @@ -539,8 +540,16 @@ int alloc_iommu(struct dmar_drhd_unit *drhd)
> iommu->seq_id);
> goto error;
> }
> + msagaw = iommu_calculate_max_sagaw(iommu);
> + if (msagaw < 0) {
> + printk(KERN_ERR
> + "Cannot get a valid max agaw for iommu (seq_id = %d)\n",
> + iommu->seq_id);
> + goto error;
> + }
> #endif
> iommu->agaw = agaw;
> + iommu->msagaw = msagaw;
>
> /* the registers might be more than one page */
> map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
> diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
> index 001b328..205e4a1 100644
> --- a/drivers/pci/intel-iommu.c
> +++ b/drivers/pci/intel-iommu.c
> @@ -53,6 +53,8 @@
>
> #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
>
> +#define MAX_AGAW_WIDTH 64
> +
> #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
>
> #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
> @@ -127,8 +129,6 @@ static inline void
> context_set_fault_enable(struct context_entry *context) context->lo
> &= (((u64)-1) << 2) | 1; }
>
> -#define CONTEXT_TT_MULTI_LEVEL 0
> -
> static inline void context_set_translation_type(struct context_entry
> *context, unsigned long value)
> {
> @@ -288,6 +288,7 @@ int dmar_disabled = 1;
> static int __initdata dmar_map_gfx = 1;
> static int dmar_forcedac;
> static int intel_iommu_strict;
> +int iommu_pass_through;
>
> #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
> static DEFINE_SPINLOCK(device_domain_lock);
> @@ -318,6 +319,9 @@ static int __init intel_iommu_setup(char *str)
> printk(KERN_INFO
> "Intel-IOMMU: disable batched IOTLB flush\n");
> intel_iommu_strict = 1;
> + } else if (!strncmp(str, "pt", 2)) {
> + iommu_pass_through = 1;
> + printk(KERN_INFO "Intel-IOMMU: Pass Through enabled\n");
> }
>
> str += strcspn(str, ",");
> @@ -397,17 +401,13 @@ void free_iova_mem(struct iova *iova)
>
> static inline int width_to_agaw(int width);
>
> -/* calculate agaw for each iommu.
> - * "SAGAW" may be different across iommus, use a default agaw, and
> - * get a supported less agaw for iommus that don't support the
> default agaw.
> - */
> -int iommu_calculate_agaw(struct intel_iommu *iommu)
> +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int
> max_gaw) {
> unsigned long sagaw;
> int agaw = -1;
>
> sagaw = cap_sagaw(iommu->cap);
> - for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
> + for (agaw = width_to_agaw(max_gaw);
> agaw >= 0; agaw--) {
> if (test_bit(agaw, &sagaw))
> break;
> @@ -416,6 +416,24 @@ int iommu_calculate_agaw(struct intel_iommu
> *iommu) return agaw;
> }
>
> +/*
> + * Calculate max SAGAW for each iommu.
> + */
> +int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
> +{
> + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
> +}
> +
> +/*
> + * calculate agaw for each iommu.
> + * "SAGAW" may be different across iommus, use a default agaw, and
> + * get a supported less agaw for iommus that don't support the
> default agaw. + */
> +int iommu_calculate_agaw(struct intel_iommu *iommu)
> +{
> + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
> +}
> +
> /* in native case, each domain is related to only one iommu */
> static struct intel_iommu *domain_get_iommu(struct dmar_domain
> *domain) {
> @@ -1321,8 +1339,8 @@ static void domain_exit(struct dmar_domain
> *domain) free_domain_mem(domain);
> }
>
> -static int domain_context_mapping_one(struct dmar_domain *domain,
> - int segment, u8 bus, u8 devfn)
> +static int domain_context_mapping_one(struct dmar_domain *domain,
> int segment, + u8 bus, u8 devfn, int translation)
> {
> struct context_entry *context;
> unsigned long flags;
> @@ -1335,7 +1353,10 @@ static int domain_context_mapping_one(struct
> dmar_domain *domain,
>
> pr_debug("Set context mapping for %02x:%02x.%d\n",
> bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
> +
> BUG_ON(!domain->pgd);
> + BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
> + translation != CONTEXT_TT_MULTI_LEVEL);
>
> iommu = device_to_iommu(segment, bus, devfn);
> if (!iommu)
> @@ -1395,9 +1416,18 @@ static int domain_context_mapping_one(struct
> dmar_domain *domain, }
>
> context_set_domain_id(context, id);
> - context_set_address_width(context, iommu->agaw);
> - context_set_address_root(context, virt_to_phys(pgd));
> - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
> +
> + /*
> + * In pass through mode, AW must be programmed to indicate the
> largest + * AGAW value supported by hardware. And ASR is ignored by
> hardware. + */
> + if (likely(translation == CONTEXT_TT_MULTI_LEVEL)) {
> + context_set_address_width(context, iommu->agaw);
> + context_set_address_root(context, virt_to_phys(pgd));
> + } else
> + context_set_address_width(context, iommu->msagaw);
> +
> + context_set_translation_type(context, translation);
> context_set_fault_enable(context);
> context_set_present(context);
> domain_flush_cache(domain, context, sizeof(*context));
> @@ -1422,13 +1452,15 @@ static int domain_context_mapping_one(struct
> dmar_domain *domain, }
>
> static int
> -domain_context_mapping(struct dmar_domain *domain, struct pci_dev
> *pdev) +domain_context_mapping(struct dmar_domain *domain, struct
> pci_dev *pdev, + int translation)
> {
> int ret;
> struct pci_dev *tmp, *parent;
>
> ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
> - pdev->bus->number, pdev->devfn);
> + pdev->bus->number, pdev->devfn,
> + translation);
> if (ret)
> return ret;
>
> @@ -1440,9 +1472,9 @@ domain_context_mapping(struct dmar_domain
> *domain, struct pci_dev *pdev) parent = pdev->bus->self;
> while (parent != tmp) {
> ret = domain_context_mapping_one(domain,
> - pci_domain_nr(parent->bus),
> - parent->bus->number,
> - parent->devfn);
> + pci_domain_nr(parent->bus),
> + parent->bus->number,
> + parent->devfn, translation);
> if (ret)
> return ret;
> parent = parent->bus->self;
> @@ -1450,12 +1482,14 @@ domain_context_mapping(struct dmar_domain
> *domain, struct pci_dev *pdev) if (tmp->is_pcie) /* this is a
> PCIE-to-PCI bridge */ return domain_context_mapping_one(domain,
> pci_domain_nr(tmp->subordinate),
> - tmp->subordinate->number, 0);
> + tmp->subordinate->number, 0,
> + translation);
> else /* this is a legacy PCI bridge */
> return domain_context_mapping_one(domain,
> pci_domain_nr(tmp->bus),
> tmp->bus->number,
> - tmp->devfn);
> + tmp->devfn,
> + translation);
> }
>
> static int domain_context_mapped(struct pci_dev *pdev)
> @@ -1752,7 +1786,7 @@ static int iommu_prepare_identity_map(struct
> pci_dev *pdev, goto error;
>
> /* context entry init */
> - ret = domain_context_mapping(domain, pdev);
> + ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
> if (!ret)
> return 0;
> error:
> @@ -1853,6 +1887,23 @@ static inline void iommu_prepare_isa(void)
> }
> #endif /* !CONFIG_DMAR_FLPY_WA */
>
> +/* Initialize each context entry as pass through.*/
> +static int __init init_context_pass_through(void)
> +{
> + struct pci_dev *pdev = NULL;
> + struct dmar_domain *domain;
> + int ret;
> +
> + for_each_pci_dev(pdev) {
> + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
> + ret = domain_context_mapping(domain, pdev,
> + CONTEXT_TT_PASS_THROUGH);
> + if (ret)
> + return ret;
> + }
> + return 0;
> +}
> +
> static int __init init_dmars(void)
> {
> struct dmar_drhd_unit *drhd;
> @@ -1860,6 +1911,7 @@ static int __init init_dmars(void)
> struct pci_dev *pdev;
> struct intel_iommu *iommu;
> int i, ret;
> + int pass_through = 1;
>
> /*
> * for each drhd
> @@ -1913,7 +1965,15 @@ static int __init init_dmars(void)
> printk(KERN_ERR "IOMMU: allocate root entry failed\n");
> goto error;
> }
> + if (!ecap_pass_through(iommu->ecap))
> + pass_through = 0;
> }
> + if (iommu_pass_through)
> + if (!pass_through) {
> + printk(KERN_INFO
> + "Pass Through is not supported by hardware.\n");
> + iommu_pass_through = 0;
> + }
>
> /*
> * Start from the sane iommu hardware state.
> @@ -1976,37 +2036,57 @@ static int __init init_dmars(void)
> "IOMMU: enable interrupt remapping failed\n");
> }
> #endif
> + /*
> + * If pass through is set and enabled, context entries of all pci
> + * devices are initialized by pass through translation type.
> + */
> + if (iommu_pass_through) {
> + ret = init_context_pass_through();
> + if (ret) {
> + printk(KERN_ERR "IOMMU: Pass through init failed.\n");
> + iommu_pass_through = 0;
> + }
> + }
>
> /*
> - * For each rmrr
> - * for each dev attached to rmrr
> - * do
> - * locate drhd for dev, alloc domain for dev
> - * allocate free domain
> - * allocate page table entries for rmrr
> - * if context not allocated for bus
> - * allocate and init context
> - * set present in root table for this bus
> - * init context with domain, translation etc
> - * endfor
> - * endfor
> + * If pass through is not set or not enabled, setup context entries
> for + * identity mappings for rmrr, gfx, and isa.
> */
> - for_each_rmrr_units(rmrr) {
> - for (i = 0; i < rmrr->devices_cnt; i++) {
> - pdev = rmrr->devices[i];
> - /* some BIOS lists non-exist devices in DMAR table */
> - if (!pdev)
> - continue;
> - ret = iommu_prepare_rmrr_dev(rmrr, pdev);
> - if (ret)
> - printk(KERN_ERR
> + if (!iommu_pass_through) {
> + /*
> + * For each rmrr
> + * for each dev attached to rmrr
> + * do
> + * locate drhd for dev, alloc domain for dev
> + * allocate free domain
> + * allocate page table entries for rmrr
> + * if context not allocated for bus
> + * allocate and init context
> + * set present in root table for this bus
> + * init context with domain, translation etc
> + * endfor
> + * endfor
> + */
> + for_each_rmrr_units(rmrr) {
> + for (i = 0; i < rmrr->devices_cnt; i++) {
> + pdev = rmrr->devices[i];
> + /*
> + * some BIOS lists non-exist devices in DMAR
> + * table.
> + */
> + if (!pdev)
> + continue;
> + ret = iommu_prepare_rmrr_dev(rmrr, pdev);
> + if (ret)
> + printk(KERN_ERR
> "IOMMU: mapping reserved region failed\n");
> + }
> }
> - }
>
> - iommu_prepare_gfx_mapping();
> + iommu_prepare_gfx_mapping();
>
> - iommu_prepare_isa();
> + iommu_prepare_isa();
> + }
>
> /*
> * for each drhd
> @@ -2117,7 +2197,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev)
>
> /* make sure context mapping is ok */
> if (unlikely(!domain_context_mapped(pdev))) {
> - ret = domain_context_mapping(domain, pdev);
> + ret = domain_context_mapping(domain, pdev,
> + CONTEXT_TT_MULTI_LEVEL);
> if (ret) {
> printk(KERN_ERR
> "Domain context map for %s failed",
> @@ -2786,7 +2867,7 @@ int __init intel_iommu_init(void)
> * Check the need for DMA-remapping initialization now.
> * Above initialization will also be used by Interrupt-remapping.
> */
> - if (no_iommu || swiotlb || dmar_disabled)
> + if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled)
> return -ENODEV;
>
> iommu_init_mempool();
> @@ -2806,7 +2887,15 @@ int __init intel_iommu_init(void)
>
> init_timer(&unmap_timer);
> force_iommu = 1;
> - dma_ops = &intel_dma_ops;
> +
> + if (!iommu_pass_through) {
> + printk(KERN_INFO
> + "Multi-level page-table translation for DMAR.\n");
> + dma_ops = &intel_dma_ops;
> + } else
> + printk(KERN_INFO
> + "DMAR: Pass through translation for DMAR.\n");
> +
> init_iommu_sysfs();
>
> register_iommu(&intel_iommu_ops);
> @@ -3146,7 +3235,7 @@ static int intel_iommu_attach_device(struct
> iommu_domain *domain, return -EFAULT;
> }
>
> - ret = domain_context_mapping(dmar_domain, pdev);
> + ret = domain_context_mapping(dmar_domain, pdev,
> CONTEXT_TT_MULTI_LEVEL); if (ret)
> return ret;
>
> diff --git a/include/linux/dma_remapping.h
> b/include/linux/dma_remapping.h
> index 1a455f1..e0a03af 100644
> --- a/include/linux/dma_remapping.h
> +++ b/include/linux/dma_remapping.h
> @@ -13,6 +13,9 @@
> #define DMA_PTE_WRITE (2)
> #define DMA_PTE_SNP (1 << 11)
>
> +#define CONTEXT_TT_MULTI_LEVEL 0
> +#define CONTEXT_TT_PASS_THROUGH 2
> +
> struct intel_iommu;
> struct dmar_domain;
> struct root_entry;
> @@ -21,11 +24,16 @@ extern void free_dmar_iommu(struct intel_iommu
> *iommu);
>
> #ifdef CONFIG_DMAR
> extern int iommu_calculate_agaw(struct intel_iommu *iommu);
> +extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
> #else
> static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
> {
> return 0;
> }
> +static inline int iommu_calculate_max_sagaw(struct intel_iommu
> *iommu) +{
> + return 0;
> +}
> #endif
>
> extern int dmar_disabled;
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index aa8c531..7246971 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -120,6 +120,7 @@ static inline void dmar_writeq(void __iomem
> *addr, u64 val) (ecap_iotlb_offset(e) + ecap_niotlb_iunits(e) * 16)
> #define ecap_coherent(e) ((e) & 0x1)
> #define ecap_qis(e) ((e) & 0x2)
> +#define ecap_pass_through(e) ((e >> 6) & 0x1)
> #define ecap_eim_support(e) ((e >> 4) & 0x1)
> #define ecap_ir_support(e) ((e >> 3) & 0x1)
> #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
> @@ -302,6 +303,7 @@ struct intel_iommu {
> spinlock_t register_lock; /* protect register handling */
> int seq_id; /* sequence id of the iommu */
> int agaw; /* agaw of this iommu */
> + int msagaw; /* max sagaw of this iommu */
> unsigned int irq;
> unsigned char name[13]; /* Device Name */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists