lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 16 Apr 2020 21:42:41 -0400
From:   Qian Cai <cai@....pw>
To:     Joerg Roedel <joro@...tes.org>
Cc:     iommu@...ts.linux-foundation.org, linux-kernel@...r.kernel.org
Subject: Re: [RFC PATCH] iommu/amd: fix a race in fetch_pte()



> On Apr 13, 2020, at 9:36 PM, Qian Cai <cai@....pw> wrote:
> 
> 
> 
>> On Apr 8, 2020, at 10:19 AM, Joerg Roedel <joro@...tes.org> wrote:
>> 
>> Hi Qian,
>> 
>> On Tue, Apr 07, 2020 at 11:36:05AM -0400, Qian Cai wrote:
>>> After further testing, the change along is insufficient. What I am chasing right
>>> now is the swap device will go offline after heavy memory pressure below. The
>>> symptom is similar to what we have in the commit,
>>> 
>>> 754265bcab78 (“iommu/amd: Fix race in increase_address_space()”)
>>> 
>>> Apparently, it is no possible to take the domain->lock in fetch_pte() because it
>>> could sleep.
>> 
>> Thanks a lot for finding and tracking down another race in the AMD IOMMU
>> page-table code.  The domain->lock is a spin-lock and taking it can't
>> sleep. But fetch_pte() is a fast-path and must not take any locks.
>> 
>> I think the best fix is to update the pt_root and mode of the domain
>> atomically by storing the mode in the lower 12 bits of pt_root. This way
>> they are stored together and can be read/write atomically.
> 
> Like this?

So, this is still not enough that would still trigger storage driver offline under
memory pressure for a bit longer. It looks to me that in fetch_pte() there are
could still racy?

	level	   =  domain->mode - 1;
	pte	   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
							<— increase_address_space();
	*page_size =  PTE_LEVEL_PAGE_SIZE(level);
	
	while (level > 0) {
		*page_size = PTE_LEVEL_PAGE_SIZE(level);

Then in iommu_unmap_page(),

	while (unmapped < page_size) {
		pte = fetch_pte(dom, bus_addr, &unmap_size);
		…
		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;

bus_addr would be unsync with dom->mode when it enter fetch_pte() again.
Could that be a problem?

> 
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index 20cce366e951..b36c6b07cbfd 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
> @@ -1397,13 +1397,13 @@ static struct page *free_sub_pt(unsigned long root, int mode,
> 
> static void free_pagetable(struct protection_domain *domain)
> {
> -	unsigned long root = (unsigned long)domain->pt_root;
> +	int level = iommu_get_mode(domain->pt_root);
> +	unsigned long root = iommu_get_root(domain->pt_root);
> 	struct page *freelist = NULL;
> 
> -	BUG_ON(domain->mode < PAGE_MODE_NONE ||
> -	       domain->mode > PAGE_MODE_6_LEVEL);
> +	BUG_ON(level < PAGE_MODE_NONE || level > PAGE_MODE_6_LEVEL);
> 
> -	freelist = free_sub_pt(root, domain->mode, freelist);
> +	freelist = free_sub_pt(root, level, freelist);
> 
> 	free_page_list(freelist);
> }
> @@ -1417,24 +1417,27 @@ static bool increase_address_space(struct protection_domain *domain,
> 				   unsigned long address,
> 				   gfp_t gfp)
> {
> +	int level;
> 	unsigned long flags;
> 	bool ret = false;
> 	u64 *pte;
> 
> 	spin_lock_irqsave(&domain->lock, flags);
> 
> -	if (address <= PM_LEVEL_SIZE(domain->mode) ||
> -	    WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
> +	level = iommu_get_mode(domain->pt_root);
> +
> +	if (address <= PM_LEVEL_SIZE(level) ||
> +	    WARN_ON_ONCE(level == PAGE_MODE_6_LEVEL))
> 		goto out;
> 
> 	pte = (void *)get_zeroed_page(gfp);
> 	if (!pte)
> 		goto out;
> 
> -	*pte             = PM_LEVEL_PDE(domain->mode,
> -					iommu_virt_to_phys(domain->pt_root));
> -	domain->pt_root  = pte;
> -	domain->mode    += 1;
> +	*pte = PM_LEVEL_PDE(level,
> +		iommu_virt_to_phys((void *)iommu_get_root(domain->pt_root)));
> +
> +	WRITE_ONCE(domain->pt_root, (unsigned long)pte + level + 1);
> 
> 	ret = true;
> 
> @@ -1452,15 +1455,17 @@ static u64 *alloc_pte(struct protection_domain *domain,
> 		      bool *updated)
> {
> 	int level, end_lvl;
> -	u64 *pte, *page;
> +	u64 *pte, *page, *pt_root, *root;
> 
> 	BUG_ON(!is_power_of_2(page_size));
> 
> -	while (address > PM_LEVEL_SIZE(domain->mode))
> +	while (address > PM_LEVEL_SIZE(iommu_get_mode(domain->pt_root)))
> 		*updated = increase_address_space(domain, address, gfp) || *updated;
> 
> -	level   = domain->mode - 1;
> -	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
> +	pt_root = READ_ONCE(domain->pt_root);
> +	root    = (void *)iommu_get_root(pt_root);
> +	level   = iommu_get_mode(pt_root) - 1;
> +	pte     = &root[PM_LEVEL_INDEX(level, address)];
> 	address = PAGE_SIZE_ALIGN(address, page_size);
> 	end_lvl = PAGE_SIZE_LEVEL(page_size);
> 
> @@ -1536,16 +1541,18 @@ static u64 *fetch_pte(struct protection_domain *domain,
> 		      unsigned long address,
> 		      unsigned long *page_size)
> {
> -	int level;
> 	u64 *pte;
> +	u64 *pt_root = READ_ONCE(domain->pt_root);
> +	u64 *root    = (void *)iommu_get_root(pt_root);
> +	int level    = iommu_get_mode(pt_root);
> 
> 	*page_size = 0;
> 
> -	if (address > PM_LEVEL_SIZE(domain->mode))
> +	if (address > PM_LEVEL_SIZE(level))
> 		return NULL;
> 
> -	level	   =  domain->mode - 1;
> -	pte	   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
> +	level--;
> +	pte	   = &root[PM_LEVEL_INDEX(level, address)];
> 	*page_size =  PTE_LEVEL_PAGE_SIZE(level);
> 
> 	while (level > 0) {
> @@ -1814,12 +1821,13 @@ static struct protection_domain *dma_ops_domain_alloc(void)
> 	if (protection_domain_init(domain))
> 		goto free_domain;
> 
> -	domain->mode = PAGE_MODE_3_LEVEL;
> 	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
> 	domain->flags = PD_DMA_OPS_MASK;
> 	if (!domain->pt_root)
> 		goto free_domain;
> 
> +	domain->pt_root = iommu_set_mode(domain->pt_root, PAGE_MODE_3_LEVEL);
> +
> 	if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM)
> 		goto free_domain;
> 
> @@ -1847,10 +1855,10 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain,
> 	u64 flags = 0;
> 	u32 old_domid;
> 
> -	if (domain->mode != PAGE_MODE_NONE)
> -		pte_root = iommu_virt_to_phys(domain->pt_root);
> +	if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
> +		pte_root = iommu_virt_to_phys((void *)iommu_get_root(domain->pt_root));
> 
> -	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
> +	pte_root |= ((unsigned long)domain->pt_root & DEV_ENTRY_MODE_MASK)
> 		    << DEV_ENTRY_MODE_SHIFT;
> 	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV;
> 
> @@ -2382,13 +2390,14 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
> 		if (!pdomain)
> 			return NULL;
> 
> -		pdomain->mode    = PAGE_MODE_3_LEVEL;
> 		pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
> 		if (!pdomain->pt_root) {
> 			protection_domain_free(pdomain);
> 			return NULL;
> 		}
> 
> +		pdomain->pt_root = iommu_set_mode(pdomain->pt_root,
> +						  PAGE_MODE_3_LEVEL);
> 		pdomain->domain.geometry.aperture_start = 0;
> 		pdomain->domain.geometry.aperture_end   = ~0ULL;
> 		pdomain->domain.geometry.force_aperture = true;
> @@ -2406,7 +2415,8 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
> 		if (!pdomain)
> 			return NULL;
> 
> -		pdomain->mode = PAGE_MODE_NONE;
> +		pdomain->pt_root = iommu_set_mode(pdomain->pt_root,
> +						  PAGE_MODE_NONE);
> 		break;
> 	default:
> 		return NULL;
> @@ -2435,7 +2445,7 @@ static void amd_iommu_domain_free(struct iommu_domain *dom)
> 		dma_ops_domain_free(domain);
> 		break;
> 	default:
> -		if (domain->mode != PAGE_MODE_NONE)
> +		if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
> 			free_pagetable(domain);
> 
> 		if (domain->flags & PD_IOMMUV2_MASK)
> @@ -2521,7 +2531,7 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
> 	int prot = 0;
> 	int ret;
> 
> -	if (domain->mode == PAGE_MODE_NONE)
> +	if (iommu_get_mode(domain->pt_root) == PAGE_MODE_NONE)
> 		return -EINVAL;
> 
> 	if (iommu_prot & IOMMU_READ)
> @@ -2542,7 +2552,7 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
> {
> 	struct protection_domain *domain = to_pdomain(dom);
> 
> -	if (domain->mode == PAGE_MODE_NONE)
> +	if (iommu_get_mode(domain->pt_root) == PAGE_MODE_NONE)
> 		return 0;
> 
> 	return iommu_unmap_page(domain, iova, page_size);
> @@ -2555,7 +2565,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
> 	unsigned long offset_mask, pte_pgsize;
> 	u64 *pte, __pte;
> 
> -	if (domain->mode == PAGE_MODE_NONE)
> +	if (iommu_get_mode(domain->pt_root) == PAGE_MODE_NONE)
> 		return iova;
> 
> 	pte = fetch_pte(domain, iova, &pte_pgsize);
> @@ -2713,7 +2723,7 @@ void amd_iommu_domain_direct_map(struct iommu_domain *dom)
> 	spin_lock_irqsave(&domain->lock, flags);
> 
> 	/* Update data structure */
> -	domain->mode    = PAGE_MODE_NONE;
> +	domain->pt_root = iommu_set_mode(domain->pt_root, PAGE_MODE_NONE);
> 
> 	/* Make changes visible to IOMMUs */
> 	update_domain(domain);
> @@ -2910,7 +2920,7 @@ static int __set_gcr3(struct protection_domain *domain, int pasid,
> {
> 	u64 *pte;
> 
> -	if (domain->mode != PAGE_MODE_NONE)
> +	if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
> 		return -EINVAL;
> 
> 	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
> @@ -2926,7 +2936,7 @@ static int __clear_gcr3(struct protection_domain *domain, int pasid)
> {
> 	u64 *pte;
> 
> -	if (domain->mode != PAGE_MODE_NONE)
> +	if (iommu_get_mode(domain->pt_root) != PAGE_MODE_NONE)
> 		return -EINVAL;
> 
> 	pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
> diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
> index 92c2ba6468a0..34d4dd66cf9b 100644
> --- a/drivers/iommu/amd_iommu_proto.h
> +++ b/drivers/iommu/amd_iommu_proto.h
> @@ -67,6 +67,21 @@ static inline int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
> extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid,
> 				  int status, int tag);
> 
> +static inline int iommu_get_mode(void *pt_root)
> +{
> +	return (unsigned long)pt_root & ~PAGE_MASK;
> +}
> +
> +static inline unsigned long iommu_get_root(void *pt_root)
> +{
> +	return (unsigned long)pt_root & PAGE_MASK;
> +}
> +
> +static inline void *iommu_set_mode(void *pt_root, int mode)
> +{
> +	return (void *)(iommu_get_root(pt_root) + mode);
> +}
> +
> static inline bool is_rd890_iommu(struct pci_dev *pdev)
> {
> 	return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
> diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
> index ca8c4522045b..221adefa56a0 100644
> --- a/drivers/iommu/amd_iommu_types.h
> +++ b/drivers/iommu/amd_iommu_types.h
> @@ -468,8 +468,8 @@ struct protection_domain {
> 				       iommu core code */
> 	spinlock_t lock;	/* mostly used to lock the page table*/
> 	u16 id;			/* the domain id written to the device table */
> -	int mode;		/* paging mode (0-6 levels) */
> -	u64 *pt_root;		/* page table root pointer */
> +	u64 *pt_root;		/* page table root pointer and paging mode (0-6
> +				   levels) */
> 	int glx;		/* Number of levels for GCR3 table */
> 	u64 *gcr3_tbl;		/* Guest CR3 table */
> 	unsigned long flags;	/* flags to find out type of domain */
> -- 
> 2.21.0 (Apple Git-122.2)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ