linux-kernel - Re: [PATCH v1 4/4] vfio/nvgpu: register device memory for poison handling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20230928134550.55fd9d8b.alex.williamson@redhat.com>
Date:   Thu, 28 Sep 2023 13:45:50 -0600
From:   Alex Williamson <alex.williamson@...hat.com>
To:     <ankita@...dia.com>
Cc:     <jgg@...dia.com>, <akpm@...ux-foundation.org>,
        <tony.luck@...el.com>, <bp@...en8.de>, <naoya.horiguchi@....com>,
        <linmiaohe@...wei.com>, <aniketa@...dia.com>, <cjia@...dia.com>,
        <kwankhede@...dia.com>, <targupta@...dia.com>, <vsethi@...dia.com>,
        <acurrid@...dia.com>, <anuaggarwal@...dia.com>,
        <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>,
        <linux-edac@...r.kernel.org>, <kvm@...r.kernel.org>
Subject: Re: [PATCH v1 4/4] vfio/nvgpu: register device memory for poison
 handling

On Wed, 20 Sep 2023 19:32:10 +0530
<ankita@...dia.com> wrote:

> From: Ankit Agrawal <ankita@...dia.com>
> 
> The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
> (Qemu) using remap_pfn_range() without adding the memory to the kernel.
> The device memory pages are not backed by struct page. Patches 1-3
> implement the mechanism to handle ECC/poison on memory pages without
> struct page and expose a registration function. This new mechanism is
> leveraged here.
>  
> The module registers its memory region with the kernel MM for ECC handling
> using the register_pfn_address_space() registration API exposed by the
> kernel. It also defines a failure callback function pfn_memory_failure()
> to get the poisoned PFN from the MM.
>  
> The module tracks poisoned PFNs in a bitmap with one bit per PFN. The PFN
> is communicated by the kernel MM to the module through the failure
> function, which sets the appropriate bit in the bitmap.
>  
> The module also defines VMA fault ops. The fault handler returns
> VM_FAULT_HWPOISON if the bit for the PFN is set in the bitmap.
> 
> [1] https://lore.kernel.org/all/20230915025415.6762-1-ankita@nvidia.com/
> 
> Signed-off-by: Ankit Agrawal <ankita@...dia.com>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 107 +++++++++++++++++++++++++++-
>  drivers/vfio/vfio.h                 |  11 ---
>  drivers/vfio/vfio_main.c            |   3 +-
>  include/linux/vfio.h                |  15 ++++
>  4 files changed, 123 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index ba323f2d8ea1..1c89ce0cc1cc 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -6,6 +6,10 @@
>  #include <linux/pci.h>
>  #include <linux/vfio_pci_core.h>
>  #include <linux/vfio.h>
> +#ifdef CONFIG_MEMORY_FAILURE
> +#include <linux/bitmap.h>
> +#include <linux/memory-failure.h>
> +#endif
>  
>  struct nvgrace_gpu_vfio_pci_core_device {
>  	struct vfio_pci_core_device core_device;
> @@ -13,8 +17,85 @@ struct nvgrace_gpu_vfio_pci_core_device {
>  	size_t memlength;
>  	void *memmap;
>  	struct mutex memmap_lock;
> +#ifdef CONFIG_MEMORY_FAILURE
> +	struct pfn_address_space pfn_address_space;
> +	unsigned long *pfn_bitmap;
> +#endif
>  };
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
> +		unsigned long pfn)
> +{
> +	struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
> +		pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
> +	unsigned long mem_offset = pfn - pfn_space->node.start;
> +
> +	if (mem_offset >= nvdev->memlength)
> +		return;
> +
> +	/*
> +	 * MM has called to notify a poisoned page. Track that in the bitmap.
> +	 */
> +	__set_bit(mem_offset, nvdev->pfn_bitmap);
> +}
> +
> +struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
> +	.failure = nvgrace_gpu_vfio_pci_pfn_memory_failure,
> +};
> +
> +static int
> +nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev,
> +					struct vm_area_struct *vma)
> +{
> +	unsigned long nr_pages;
> +	int ret = 0;
> +
> +	nr_pages = nvdev->memlength >> PAGE_SHIFT;
> +
> +	nvdev->pfn_address_space.node.start = vma->vm_pgoff;
> +	nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
> +	nvdev->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops;
> +	nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping;
> +
> +	ret = register_pfn_address_space(&(nvdev->pfn_address_space));
> +
> +	return ret;
> +}
> +
> +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
> +{
> +	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
> +	struct vfio_device *core_vdev;
> +	struct nvgrace_gpu_vfio_pci_core_device *nvdev;
> +
> +	if (!(vmf->vma->vm_file))
> +		goto error_exit;
> +
> +	core_vdev = vfio_device_from_file(vmf->vma->vm_file);
> +
> +	if (!core_vdev)
> +		goto error_exit;
> +
> +	nvdev = container_of(core_vdev,
> +			struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev);
> +
> +	/*
> +	 * Check if the page is poisoned.
> +	 */
> +	if (mem_offset < (nvdev->memlength >> PAGE_SHIFT) &&
> +		test_bit(mem_offset, nvdev->pfn_bitmap))
> +		return VM_FAULT_HWPOISON;
> +
> +error_exit:
> +	return VM_FAULT_ERROR;
> +}
> +
> +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
> +	.fault = nvgrace_gpu_vfio_pci_fault,
> +};
> +#endif
> +
>  static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev)
>  {
>  	struct vfio_pci_core_device *vdev =
> @@ -46,6 +127,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev)
>  
>  	mutex_destroy(&nvdev->memmap_lock);
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +	unregister_pfn_address_space(&(nvdev->pfn_address_space));
> +#endif
>  	vfio_pci_core_close_device(core_vdev);
>  }
>  
> @@ -104,8 +188,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev,
>  		return ret;
>  
>  	vma->vm_pgoff = start_pfn;
> +#ifdef CONFIG_MEMORY_FAILURE
> +	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
>  
> -	return 0;
> +	ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma);
> +#endif
> +	return ret;
>  }
>  
>  static long
> @@ -406,6 +494,19 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
>  
>  	nvdev->memlength = memlength;
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +	/*
> +	 * A bitmap is maintained to track the pages that are poisoned. Each
> +	 * page is represented by a bit. Allocation size in bytes is
> +	 * determined by shifting the device memory size by PAGE_SHIFT to
> +	 * determine the number of pages; and further shifted by 3 as each
> +	 * byte could track 8 pages.
> +	 */
> +	nvdev->pfn_bitmap
> +		= vzalloc((nvdev->memlength >> PAGE_SHIFT)/BITS_PER_TYPE(char));
> +	if (!nvdev->pfn_bitmap)
> +		ret = -ENOMEM;
> +#endif
>  	return ret;
>  }
>  
> @@ -442,6 +543,10 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev)
>  	struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev);
>  	struct vfio_pci_core_device *vdev = &nvdev->core_device;
>  
> +#ifdef CONFIG_MEMORY_FAILURE
> +	vfree(nvdev->pfn_bitmap);
> +#endif
> +
>  	vfio_pci_core_unregister_device(vdev);
>  	vfio_put_device(&vdev->vdev);
>  }
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 307e3f29b527..747094503909 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -16,17 +16,6 @@ struct iommufd_ctx;
>  struct iommu_group;
>  struct vfio_container;
>  
> -struct vfio_device_file {
> -	struct vfio_device *device;
> -	struct vfio_group *group;
> -
> -	u8 access_granted;
> -	u32 devid; /* only valid when iommufd is valid */
> -	spinlock_t kvm_ref_lock; /* protect kvm field */
> -	struct kvm *kvm;
> -	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
> -};
> -
>  void vfio_device_put_registration(struct vfio_device *device);
>  bool vfio_device_try_get_registration(struct vfio_device *device);
>  int vfio_df_open(struct vfio_device_file *df);
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 40732e8ed4c6..a7dafd7c64a6 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1309,7 +1309,7 @@ const struct file_operations vfio_device_fops = {
>  	.mmap		= vfio_device_fops_mmap,
>  };
>  
> -static struct vfio_device *vfio_device_from_file(struct file *file)
> +struct vfio_device *vfio_device_from_file(struct file *file)
>  {
>  	struct vfio_device_file *df = file->private_data;
>  
> @@ -1317,6 +1317,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file)
>  		return NULL;
>  	return df->device;
>  }
> +EXPORT_SYMBOL_GPL(vfio_device_from_file);
>  
>  /**
>   * vfio_file_is_valid - True if the file is valid vfio file
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 454e9295970c..d88af251e931 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -361,4 +361,19 @@ int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
>  		       struct virqfd **pvirqfd, int fd);
>  void vfio_virqfd_disable(struct virqfd **pvirqfd);
>  
> +/*
> + * VFIO device file.
> + */
> +struct vfio_device_file {
> +	struct vfio_device *device;
> +	struct vfio_group *group;
> +	u8 access_granted;
> +	u32 devid; /* only valid when iommufd is valid */
> +	spinlock_t kvm_ref_lock; /* protect kvm field */
> +	struct kvm *kvm;
> +	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
> +};

What here necessitates moving this to the more public header?  Thanks,

Alex

> +
> +struct vfio_device *vfio_device_from_file(struct file *file);
> +
>  #endif /* VFIO_H */