linux-kernel - Re: [PATCH 20/22] iommu/amd: Implement IO page-fault handler

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 14 Dec 2011 11:13:00 -0500
From:	Jerome Glisse <j.glisse@...il.com>
To:	Joerg Roedel <joerg.roedel@....com>
Cc:	iommu@...ts.linux-foundation.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 20/22] iommu/amd: Implement IO page-fault handler

On Mon, Dec 05, 2011 at 02:34:35PM +0100, Joerg Roedel wrote:
> Register the notifier for PPR faults and handle them as
> necessary.
> 
> Signed-off-by: Joerg Roedel <joerg.roedel@....com>
> ---
>  drivers/iommu/amd_iommu_v2.c |  200 ++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 194 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> index f54b991..0f2ffff 100644
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -21,9 +21,11 @@
>  #include <linux/module.h>
>  #include <linux/sched.h>
>  #include <linux/iommu.h>
> +#include <linux/wait.h>
>  #include <linux/pci.h>
>  #include <linux/gfp.h>
>  
> +#include "amd_iommu_types.h"
>  #include "amd_iommu_proto.h"
>  
>  MODULE_LICENSE("GPL v2");
> @@ -35,6 +37,7 @@ MODULE_AUTHOR("Joerg Roedel <joerg.roedel@....com>");
>  struct pri_queue {
>  	atomic_t inflight;
>  	bool finish;
> +	int status;
>  };
>  
>  struct pasid_state {
> @@ -45,6 +48,8 @@ struct pasid_state {
>  	struct pri_queue pri[PRI_QUEUE_SIZE];	/* PRI tag states */
>  	struct device_state *device_state;	/* Link to our device_state */
>  	int pasid;				/* PASID index */
> +	spinlock_t lock;			/* Protect pri_queues */
> +	wait_queue_head_t wq;			/* To wait for count == 0 */
>  };
>  
>  struct device_state {
> @@ -55,6 +60,20 @@ struct device_state {
>  	int pasid_levels;
>  	int max_pasids;
>  	spinlock_t lock;
> +	wait_queue_head_t wq;
> +};
> +
> +struct fault {
> +	struct work_struct work;
> +	struct device_state *dev_state;
> +	struct pasid_state *state;
> +	struct mm_struct *mm;
> +	u64 address;
> +	u16 devid;
> +	u16 pasid;
> +	u16 tag;
> +	u16 finish;
> +	u16 flags;
>  };
>  
>  struct device_state **state_table;
> @@ -64,6 +83,8 @@ static spinlock_t state_lock;
>  static LIST_HEAD(pasid_state_list);
>  static DEFINE_SPINLOCK(ps_lock);
>  
> +static struct workqueue_struct *iommu_wq;
> +
>  static void free_pasid_states(struct device_state *dev_state);
>  static void unbind_pasid(struct device_state *dev_state, int pasid);
>  
> @@ -109,9 +130,20 @@ static void free_device_state(struct device_state *dev_state)
>  static void put_device_state(struct device_state *dev_state)
>  {
>  	if (atomic_dec_and_test(&dev_state->count))
> -		free_device_state(dev_state);
> +		wake_up(&dev_state->wq);
>  }
>  
> +static void put_device_state_wait(struct device_state *dev_state)
> +{
> +	DEFINE_WAIT(wait);
> +
> +	prepare_to_wait(&dev_state->wq, &wait, TASK_UNINTERRUPTIBLE);
> +	if (!atomic_dec_and_test(&dev_state->count))
> +		schedule();
> +	finish_wait(&dev_state->wq, &wait);
> +
> +	free_device_state(dev_state);
> +}
>  static void link_pasid_state(struct pasid_state *pasid_state)
>  {
>  	spin_lock(&ps_lock);
> @@ -242,10 +274,25 @@ static void put_pasid_state(struct pasid_state *pasid_state)
>  {
>  	if (atomic_dec_and_test(&pasid_state->count)) {
>  		put_device_state(pasid_state->device_state);
> -		free_pasid_state(pasid_state);
> +		wake_up(&pasid_state->wq);
>  	}
>  }
>  
> +static void put_pasid_state_wait(struct pasid_state *pasid_state)
> +{
> +	DEFINE_WAIT(wait);
> +
> +	prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE);
> +
> +	if (atomic_dec_and_test(&pasid_state->count))
> +		put_device_state(pasid_state->device_state);
> +	else
> +		schedule();
> +
> +	finish_wait(&pasid_state->wq, &wait);
> +	free_pasid_state(pasid_state);
> +}
> +
>  static void unbind_pasid(struct device_state *dev_state, int pasid)
>  {
>  	struct pasid_state *pasid_state;
> @@ -260,7 +307,7 @@ static void unbind_pasid(struct device_state *dev_state, int pasid)
>  	clear_pasid_state(dev_state, pasid);
>  
>  	put_pasid_state(pasid_state); /* Reference taken in this function */
> -	put_pasid_state(pasid_state); /* Reference taken in bind() function */
> +	put_pasid_state_wait(pasid_state); /* Reference from bind() function */
>  }
>  
>  static void free_pasid_states_level1(struct pasid_state **tbl)
> @@ -299,8 +346,8 @@ static void free_pasid_states(struct device_state *dev_state)
>  		if (pasid_state == NULL)
>  			continue;
>  
> -		unbind_pasid(dev_state, i);
>  		put_pasid_state(pasid_state);
> +		unbind_pasid(dev_state, i);
>  	}
>  
>  	if (dev_state->pasid_levels == 2)
> @@ -313,6 +360,120 @@ static void free_pasid_states(struct device_state *dev_state)
>  	free_page((unsigned long)dev_state->states);
>  }
>  
> +static void set_pri_tag_status(struct pasid_state *pasid_state,
> +			       u16 tag, int status)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pasid_state->lock, flags);
> +	pasid_state->pri[tag].status = status;
> +	spin_unlock_irqrestore(&pasid_state->lock, flags);
> +}
> +
> +static void finish_pri_tag(struct device_state *dev_state,
> +			   struct pasid_state *pasid_state,
> +			   u16 tag)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pasid_state->lock, flags);
> +	if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) &&
> +	    pasid_state->pri[tag].finish) {
> +		amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid,
> +				       pasid_state->pri[tag].status, tag);
> +		pasid_state->pri[tag].finish = false;
> +		pasid_state->pri[tag].status = PPR_SUCCESS;
> +	}
> +	spin_unlock_irqrestore(&pasid_state->lock, flags);
> +}
> +
> +static void do_fault(struct work_struct *work)
> +{
> +	struct fault *fault = container_of(work, struct fault, work);
> +	int npages, write;
> +	struct page *page;
> +
> +	write = !!(fault->flags & PPR_FAULT_WRITE);
> +
> +	npages = get_user_pages(fault->state->task, fault->state->mm,
> +				fault->address, 1, write, 0, &page, NULL);
> +
> +	if (npages == 1)
> +		put_page(page);
> +	else
> +		set_pri_tag_status(fault->state, fault->tag, PPR_INVALID);

I might be missing something on how mm work in linux but can't the page
in a vma change anytime (like being migrated to a node or being evicted
...) I guess my question is when a fault happen is it because the task
page table have an invalid entry ?

I was under the impression that their was a page table associated with
each iommu client. Thus that the task page table was never directly use.
But i haven't carrefully read all the patches.

Cheers,
Jerome

> +
> +	finish_pri_tag(fault->dev_state, fault->state, fault->tag);
> +
> +	put_pasid_state(fault->state);
> +
> +	kfree(fault);
> +}
> +
> +static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data)
> +{
> +	struct amd_iommu_fault *iommu_fault;
> +	struct pasid_state *pasid_state;
> +	struct device_state *dev_state;
> +	unsigned long flags;
> +	struct fault *fault;
> +	bool finish;
> +	u16 tag;
> +	int ret;
> +
> +	iommu_fault = data;
> +	tag         = iommu_fault->tag & 0x1ff;
> +	finish      = (iommu_fault->tag >> 9) & 1;
> +
> +	ret = NOTIFY_DONE;
> +	dev_state = get_device_state(iommu_fault->device_id);
> +	if (dev_state == NULL)
> +		goto out;
> +
> +	pasid_state = get_pasid_state(dev_state, iommu_fault->pasid);
> +	if (pasid_state == NULL) {
> +		/* We know the device but not the PASID -> send INVALID */
> +		amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid,
> +				       PPR_INVALID, tag);
> +		goto out_drop_state;
> +	}
> +
> +	spin_lock_irqsave(&pasid_state->lock, flags);
> +	atomic_inc(&pasid_state->pri[tag].inflight);
> +	if (finish)
> +		pasid_state->pri[tag].finish = true;
> +	spin_unlock_irqrestore(&pasid_state->lock, flags);
> +
> +	fault = kzalloc(sizeof(*fault), GFP_ATOMIC);
> +	if (fault == NULL) {
> +		/* We are OOM - send success and let the device re-fault */
> +		finish_pri_tag(dev_state, pasid_state, tag);
> +		goto out_drop_state;
> +	}
> +
> +	fault->dev_state = dev_state;
> +	fault->address   = iommu_fault->address;
> +	fault->state     = pasid_state;
> +	fault->tag       = tag;
> +	fault->finish    = finish;
> +	fault->flags     = iommu_fault->flags;
> +	INIT_WORK(&fault->work, do_fault);
> +
> +	queue_work(iommu_wq, &fault->work);
> +
> +	ret = NOTIFY_OK;
> +
> +out_drop_state:
> +	put_device_state(dev_state);
> +
> +out:
> +	return ret;
> +}
> +
> +static struct notifier_block ppr_nb = {
> +	.notifier_call = ppr_notifier,
> +};
> +
>  int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
>  			 struct task_struct *task)
>  {
> @@ -342,6 +503,7 @@ int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid,
>  		goto out;
>  
>  	atomic_set(&pasid_state->count, 1);
> +	init_waitqueue_head(&pasid_state->wq);
>  	pasid_state->task         = task;
>  	pasid_state->mm           = task->mm;
>  	pasid_state->device_state = dev_state;
> @@ -420,6 +582,7 @@ int amd_iommu_init_device(struct pci_dev *pdev, int pasids)
>  		return -ENOMEM;
>  
>  	spin_lock_init(&dev_state->lock);
> +	init_waitqueue_head(&dev_state->wq);
>  	dev_state->pdev = pdev;
>  
>  	tmp = pasids;
> @@ -503,13 +666,14 @@ void amd_iommu_free_device(struct pci_dev *pdev)
>  	/* Get rid of any remaining pasid states */
>  	free_pasid_states(dev_state);
>  
> -	put_device_state(dev_state);
> +	put_device_state_wait(dev_state);
>  }
>  EXPORT_SYMBOL(amd_iommu_free_device);
>  
>  static int __init amd_iommu_v2_init(void)
>  {
>  	size_t state_table_size;
> +	int ret;
>  
>  	pr_info("AMD IOMMUv2 driver by Joerg Roedel <joerg.roedel@....com>");
>  
> @@ -521,7 +685,21 @@ static int __init amd_iommu_v2_init(void)
>  	if (state_table == NULL)
>  		return -ENOMEM;
>  
> +	ret = -ENOMEM;
> +	iommu_wq = create_workqueue("amd_iommu_v2");
> +	if (iommu_wq == NULL) {
> +		ret = -ENOMEM;
> +		goto out_free;
> +	}
> +
> +	amd_iommu_register_ppr_notifier(&ppr_nb);
> +
>  	return 0;
> +
> +out_free:
> +	free_pages((unsigned long)state_table, get_order(state_table_size));
> +
> +	return ret;
>  }
>  
>  static void __exit amd_iommu_v2_exit(void)
> @@ -530,6 +708,14 @@ static void __exit amd_iommu_v2_exit(void)
>  	size_t state_table_size;
>  	int i;
>  
> +	amd_iommu_unregister_ppr_notifier(&ppr_nb);
> +
> +	flush_workqueue(iommu_wq);
> +
> +	/*
> +	 * The loop below might call flush_workqueue(), so call
> +	 * destroy_workqueue() after it
> +	 */
>  	for (i = 0; i < MAX_DEVICES; ++i) {
>  		dev_state = get_device_state(i);
>  
> @@ -538,10 +724,12 @@ static void __exit amd_iommu_v2_exit(void)
>  
>  		WARN_ON_ONCE(1);
>  
> -		amd_iommu_free_device(dev_state->pdev);
>  		put_device_state(dev_state);
> +		amd_iommu_free_device(dev_state->pdev);
>  	}
>  
> +	destroy_workqueue(iommu_wq);
> +
>  	state_table_size = MAX_DEVICES * sizeof(struct device_state *);
>  	free_pages((unsigned long)state_table, get_order(state_table_size));
>  }
> -- 
> 1.7.5.4
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@...r.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/