[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20220103202443.bgtkqgimta63lqay@revolver>
Date: Mon, 3 Jan 2022 20:24:51 +0000
From: Liam Howlett <liam.howlett@...cle.com>
To: Alex Sierra <alex.sierra@....com>
CC: "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
"Felix.Kuehling@....com" <Felix.Kuehling@....com>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
"rcampbell@...dia.com" <rcampbell@...dia.com>,
"linux-ext4@...r.kernel.org" <linux-ext4@...r.kernel.org>,
"linux-xfs@...r.kernel.org" <linux-xfs@...r.kernel.org>,
"amd-gfx@...ts.freedesktop.org" <amd-gfx@...ts.freedesktop.org>,
"dri-devel@...ts.freedesktop.org" <dri-devel@...ts.freedesktop.org>,
"hch@....de" <hch@....de>, "jgg@...dia.com" <jgg@...dia.com>,
"jglisse@...hat.com" <jglisse@...hat.com>,
"apopple@...dia.com" <apopple@...dia.com>,
"willy@...radead.org" <willy@...radead.org>
Subject: Re: [PATCH v2 08/11] lib: add support for device coherent type in
test_hmm
* Alex Sierra <alex.sierra@....com> [211206 14:00]:
> Device Coherent type uses device memory that is coherently accesible by
> the CPU. This could be shown as SP (special purpose) memory range
> at the BIOS-e820 memory enumeration. If no SP memory is supported in
> system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP.
>
> Currently, test_hmm only supports two different SP ranges of at least
> 256MB size. This could be specified in the kernel parameter variable
> efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 &
> 0x140000000 physical address. Ex.
> efi_fake_mem=1G@...00000000:0x40000,1G@...40000000:0x40000
>
> Private and coherent device mirror instances can be created in the same
> probed. This is done by passing the module parameters spm_addr_dev0 &
> spm_addr_dev1. In this case, it will create four instances of
> device_mirror. The first two correspond to private device type, the
> last two to coherent type. Then, they can be easily accessed from user
> space through /dev/hmm_mirror<num_device>. Usually num_device 0 and 1
> are for private, and 2 and 3 for coherent types. If no module
> parameters are passed, two instances of private type device_mirror will
> be created only.
>
> Signed-off-by: Alex Sierra <alex.sierra@....com>
> ---
> lib/test_hmm.c | 252 +++++++++++++++++++++++++++++++++-----------
> lib/test_hmm_uapi.h | 15 ++-
> 2 files changed, 198 insertions(+), 69 deletions(-)
>
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 9edeff52302e..a1985226d788 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -29,11 +29,22 @@
>
> #include "test_hmm_uapi.h"
>
> -#define DMIRROR_NDEVICES 2
> +#define DMIRROR_NDEVICES 4
> #define DMIRROR_RANGE_FAULT_TIMEOUT 1000
> #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U)
> #define DEVMEM_CHUNKS_RESERVE 16
>
> +/*
> + * For device_private pages, dpage is just a dummy struct page
> + * representing a piece of device memory. dmirror_devmem_alloc_page
> + * allocates a real system memory page as backing storage to fake a
> + * real device. zone_device_data points to that backing page. But
> + * for device_coherent memory, the struct page represents real
> + * physical CPU-accessible memory that we can use directly.
> + */
> +#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
> + (page)->zone_device_data : (page))
> +
> static unsigned long spm_addr_dev0;
> module_param(spm_addr_dev0, long, 0644);
> MODULE_PARM_DESC(spm_addr_dev0,
> @@ -122,6 +133,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce,
> return 0;
> }
>
> +static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
> +{
> + return (mdevice->zone_device_type ==
> + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
> +}
> +
> +static enum migrate_vma_direction
> + dmirror_select_device(struct dmirror *dmirror)
> +{
> + return (dmirror->mdevice->zone_device_type ==
> + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
> + MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
> + MIGRATE_VMA_SELECT_DEVICE_COHERENT;
> +}
> +
> static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
> {
> vfree(bounce->ptr);
> @@ -572,16 +598,19 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
> static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
> {
> struct page *dpage = NULL;
> - struct page *rpage;
> + struct page *rpage = NULL;
>
> /*
> - * This is a fake device so we alloc real system memory to store
> - * our device memory.
> + * For ZONE_DEVICE private type, this is a fake device so we alloc real
> + * system memory to store our device memory.
> + * For ZONE_DEVICE coherent type we use the actual dpage to store the data
> + * and ignore rpage.
> */
> - rpage = alloc_page(GFP_HIGHUSER);
> - if (!rpage)
> - return NULL;
> -
> + if (dmirror_is_private_zone(mdevice)) {
> + rpage = alloc_page(GFP_HIGHUSER);
> + if (!rpage)
> + return NULL;
> + }
> spin_lock(&mdevice->lock);
>
> if (mdevice->free_pages) {
> @@ -601,7 +630,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
> return dpage;
>
> error:
> - __free_page(rpage);
> + if (rpage)
> + __free_page(rpage);
> return NULL;
> }
>
> @@ -627,12 +657,15 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> * unallocated pte_none() or read-only zero page.
> */
> spage = migrate_pfn_to_page(*src);
> + WARN(spage && is_zone_device_page(spage),
> + "page already in device spage pfn: 0x%lx\n",
> + page_to_pfn(spage));
>
> dpage = dmirror_devmem_alloc_page(mdevice);
> if (!dpage)
> continue;
>
> - rpage = dpage->zone_device_data;
> + rpage = BACKING_PAGE(dpage);
> if (spage)
> copy_highpage(rpage, spage);
> else
> @@ -646,6 +679,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
> */
> rpage->zone_device_data = dmirror;
>
> + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
> + page_to_pfn(spage), page_to_pfn(dpage));
> *dst = migrate_pfn(page_to_pfn(dpage)) |
> MIGRATE_PFN_LOCKED;
> if ((*src & MIGRATE_PFN_WRITE) ||
> @@ -724,11 +759,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
> if (!dpage)
> continue;
>
> - /*
> - * Store the page that holds the data so the page table
> - * doesn't have to deal with ZONE_DEVICE private pages.
> - */
> - entry = dpage->zone_device_data;
> + entry = BACKING_PAGE(dpage);
> if (*dst & MIGRATE_PFN_WRITE)
> entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
> entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
> @@ -808,8 +839,106 @@ static int dmirror_exclusive(struct dmirror *dmirror,
> return ret;
> }
>
> -static int dmirror_migrate(struct dmirror *dmirror,
> - struct hmm_dmirror_cmd *cmd)
> +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> + struct dmirror *dmirror)
> +{
> + const unsigned long *src = args->src;
> + unsigned long *dst = args->dst;
> + unsigned long start = args->start;
> + unsigned long end = args->end;
> + unsigned long addr;
> +
> + for (addr = start; addr < end; addr += PAGE_SIZE,
> + src++, dst++) {
> + struct page *dpage, *spage;
> +
> + spage = migrate_pfn_to_page(*src);
> + if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> + continue;
> +
> + WARN_ON(!is_device_page(spage));
> + spage = BACKING_PAGE(spage);
> + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> + if (!dpage)
> + continue;
> + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
> + page_to_pfn(spage), page_to_pfn(dpage));
> +
> + lock_page(dpage);
> + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> + copy_highpage(dpage, spage);
> + *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> + if (*src & MIGRATE_PFN_WRITE)
> + *dst |= MIGRATE_PFN_WRITE;
> + }
> + return 0;
> +}
> +
> +static int dmirror_migrate_to_system(struct dmirror *dmirror,
> + struct hmm_dmirror_cmd *cmd)
> +{
> + unsigned long start, end, addr;
> + unsigned long size = cmd->npages << PAGE_SHIFT;
> + struct mm_struct *mm = dmirror->notifier.mm;
> + struct vm_area_struct *vma;
> + unsigned long src_pfns[64];
> + unsigned long dst_pfns[64];
> + struct migrate_vma args;
> + unsigned long next;
> + int ret;
> +
> + start = cmd->addr;
> + end = start + size;
> + if (end < start)
> + return -EINVAL;
> +
> + /* Since the mm is for the mirrored process, get a reference first. */
> + if (!mmget_not_zero(mm))
> + return -EINVAL;
> +
> + mmap_read_lock(mm);
> + for (addr = start; addr < end; addr = next) {
> + vma = find_vma(mm, addr);
> + if (!vma || addr < vma->vm_start ||
> + !(vma->vm_flags & VM_READ)) {
If you use vma_lookup() instead of find_vma(), then the if statement can
be simplified.
> + ret = -EINVAL;
> + goto out;
> + }
> + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
> + if (next > vma->vm_end)
> + next = vma->vm_end;
> +
> + args.vma = vma;
> + args.src = src_pfns;
> + args.dst = dst_pfns;
> + args.start = addr;
> + args.end = next;
> + args.pgmap_owner = dmirror->mdevice;
> + args.flags = dmirror_select_device(dmirror);
> +
> + ret = migrate_vma_setup(&args);
> + if (ret)
> + goto out;
> +
> + pr_debug("Migrating from device mem to sys mem\n");
> + dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
> +
> + migrate_vma_pages(&args);
> + migrate_vma_finalize(&args);
> + }
out label could be here.
> + mmap_read_unlock(mm);
> + mmput(mm);
> +
> + return ret;
> +
> +out:
> + mmap_read_unlock(mm);
> + mmput(mm);
> + return ret;
> +}
> +
> +static int dmirror_migrate_to_device(struct dmirror *dmirror,
> + struct hmm_dmirror_cmd *cmd)
> {
> unsigned long start, end, addr;
> unsigned long size = cmd->npages << PAGE_SHIFT;
> @@ -853,6 +982,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
> if (ret)
> goto out;
>
> + pr_debug("Migrating from sys mem to device mem\n");
> dmirror_migrate_alloc_and_copy(&args, dmirror);
> migrate_vma_pages(&args);
> dmirror_migrate_finalize_and_map(&args, dmirror);
> @@ -861,7 +991,7 @@ static int dmirror_migrate(struct dmirror *dmirror,
> mmap_read_unlock(mm);
> mmput(mm);
>
> - /* Return the migrated data for verification. */
> + /* Return the migrated data for verification. only for pages in device zone */
> ret = dmirror_bounce_init(&bounce, start, size);
> if (ret)
> return ret;
> @@ -898,12 +1028,22 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
> }
>
> page = hmm_pfn_to_page(entry);
> - if (is_device_private_page(page)) {
> - /* Is the page migrated to this device or some other? */
> - if (dmirror->mdevice == dmirror_page_to_device(page))
> + if (is_device_page(page)) {
> + /* Is page ZONE_DEVICE coherent? */
> + if (!is_device_private_page(page)) {
> + if (dmirror->mdevice == dmirror_page_to_device(page))
> + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
> + else
> + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
> + /*
> + * Is page ZONE_DEVICE private migrated to
> + * this device or some other?
> + */
> + } else if (dmirror->mdevice == dmirror_page_to_device(page)) {
> *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
> - else
> + } else {
> *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
> + }
> } else if (is_zero_pfn(page_to_pfn(page)))
> *perm = HMM_DMIRROR_PROT_ZERO;
> else
> @@ -1100,8 +1240,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
> ret = dmirror_write(dmirror, &cmd);
> break;
>
> - case HMM_DMIRROR_MIGRATE:
> - ret = dmirror_migrate(dmirror, &cmd);
> + case HMM_DMIRROR_MIGRATE_TO_DEV:
> + ret = dmirror_migrate_to_device(dmirror, &cmd);
> + break;
> +
> + case HMM_DMIRROR_MIGRATE_TO_SYS:
> + ret = dmirror_migrate_to_system(dmirror, &cmd);
> break;
>
> case HMM_DMIRROR_EXCLUSIVE:
> @@ -1142,14 +1286,13 @@ static const struct file_operations dmirror_fops = {
>
> static void dmirror_devmem_free(struct page *page)
> {
> - struct page *rpage = page->zone_device_data;
> + struct page *rpage = BACKING_PAGE(page);
> struct dmirror_device *mdevice;
>
> - if (rpage)
> + if (rpage != page)
> __free_page(rpage);
>
> mdevice = dmirror_page_to_device(page);
> -
> spin_lock(&mdevice->lock);
> mdevice->cfree++;
> page->zone_device_data = mdevice->free_pages;
> @@ -1157,38 +1300,6 @@ static void dmirror_devmem_free(struct page *page)
> spin_unlock(&mdevice->lock);
> }
>
> -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
> - struct dmirror *dmirror)
> -{
> - const unsigned long *src = args->src;
> - unsigned long *dst = args->dst;
> - unsigned long start = args->start;
> - unsigned long end = args->end;
> - unsigned long addr;
> -
> - for (addr = start; addr < end; addr += PAGE_SIZE,
> - src++, dst++) {
> - struct page *dpage, *spage;
> -
> - spage = migrate_pfn_to_page(*src);
> - if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
> - continue;
> - spage = spage->zone_device_data;
> -
> - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
> - if (!dpage)
> - continue;
> -
> - lock_page(dpage);
> - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
> - copy_highpage(dpage, spage);
> - *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> - if (*src & MIGRATE_PFN_WRITE)
> - *dst |= MIGRATE_PFN_WRITE;
> - }
> - return 0;
> -}
> -
> static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
> {
> struct migrate_vma args;
> @@ -1203,7 +1314,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
> * the mirror but here we use it to hold the page for the simulated
> * device memory and that page holds the pointer to the mirror.
> */
> - rpage = vmf->page->zone_device_data;
> + rpage = BACKING_PAGE(vmf->page);
> dmirror = rpage->zone_device_data;
>
> /* FIXME demonstrate how we can adjust migrate range */
> @@ -1213,7 +1324,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
> args.src = &src_pfns;
> args.dst = &dst_pfns;
> args.pgmap_owner = dmirror->mdevice;
> - args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
> + args.flags = dmirror_select_device(dmirror);
>
> if (migrate_vma_setup(&args))
> return VM_FAULT_SIGBUS;
> @@ -1279,14 +1390,26 @@ static void dmirror_device_remove(struct dmirror_device *mdevice)
> static int __init hmm_dmirror_init(void)
> {
> int ret;
> - int id;
> + int id = 0;
> + int ndevices = 0;
>
> ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
> "HMM_DMIRROR");
> if (ret)
> goto err_unreg;
>
> - for (id = 0; id < DMIRROR_NDEVICES; id++) {
> + memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
> + dmirror_devices[ndevices++].zone_device_type =
> + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
> + dmirror_devices[ndevices++].zone_device_type =
> + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
> + if (spm_addr_dev0 && spm_addr_dev1) {
> + dmirror_devices[ndevices++].zone_device_type =
> + HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
> + dmirror_devices[ndevices++].zone_device_type =
> + HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
> + }
> + for (id = 0; id < ndevices; id++) {
> ret = dmirror_device_init(dmirror_devices + id, id);
> if (ret)
> goto err_chrdev;
> @@ -1308,7 +1431,8 @@ static void __exit hmm_dmirror_exit(void)
> int id;
>
> for (id = 0; id < DMIRROR_NDEVICES; id++)
> - dmirror_device_remove(dmirror_devices + id);
> + if (dmirror_devices[id].zone_device_type)
> + dmirror_device_remove(dmirror_devices + id);
> unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
> }
>
> diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
> index 625f3690d086..e190b2ab6f19 100644
> --- a/lib/test_hmm_uapi.h
> +++ b/lib/test_hmm_uapi.h
> @@ -33,11 +33,12 @@ struct hmm_dmirror_cmd {
> /* Expose the address space of the calling process through hmm device file */
> #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd)
> #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
> -#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd)
> +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd)
>
> /*
> * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
> @@ -52,6 +53,8 @@ struct hmm_dmirror_cmd {
> * device the ioctl() is made
> * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some
> * other device
> + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device
> + * the ioctl() is made
> */
> enum {
> HMM_DMIRROR_PROT_ERROR = 0xFF,
> @@ -63,6 +66,8 @@ enum {
> HMM_DMIRROR_PROT_ZERO = 0x10,
> HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20,
> HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
> + HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40,
> + HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50,
> };
>
> enum {
> --
> 2.32.0
>
>
Powered by blists - more mailing lists