Message-ID: <48518be8-7766-d826-6349-c820bc47a075@amd.com>
Date: Thu, 3 Aug 2023 12:53:09 -0700
From: Brett Creeley <bcreeley@....com>
To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@...wei.com>,
Brett Creeley <brett.creeley@....com>,
"kvm@...r.kernel.org" <kvm@...r.kernel.org>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"alex.williamson@...hat.com" <alex.williamson@...hat.com>,
"jgg@...dia.com" <jgg@...dia.com>, "yishaih@...dia.com"
<yishaih@...dia.com>, "kevin.tian@...el.com" <kevin.tian@...el.com>
Cc: "simon.horman@...igine.com" <simon.horman@...igine.com>,
"shannon.nelson@....com" <shannon.nelson@....com>
Subject: Re: [PATCH v13 vfio 5/7] vfio/pds: Add support for dirty page tracking
On 8/3/2023 5:43 AM, Shameerali Kolothum Thodi wrote:
> Hi Brett,
>
>> -----Original Message-----
>> From: Brett Creeley [mailto:brett.creeley@....com]
>> Sent: 25 July 2023 22:40
>> To: kvm@...r.kernel.org; netdev@...r.kernel.org;
>> alex.williamson@...hat.com; jgg@...dia.com; yishaih@...dia.com;
>> Shameerali Kolothum Thodi <shameerali.kolothum.thodi@...wei.com>;
>> kevin.tian@...el.com
>> Cc: simon.horman@...igine.com; brett.creeley@....com;
>> shannon.nelson@....com
>> Subject: [PATCH v13 vfio 5/7] vfio/pds: Add support for dirty page tracking
>>
> [...]
>
>> +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
>> + struct rb_root_cached *ranges, u32 nnodes,
>> + u64 *page_size)
>> +{
>> + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
>> + struct device *pdsc_dev = &pci_physfn(pdev)->dev;
>> + struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
>> + u64 region_start, region_size, region_page_size;
>> + struct pds_lm_dirty_region_info *region_info;
>> + struct interval_tree_node *node = NULL;
>> + u8 max_regions = 0, num_regions;
>> + dma_addr_t regions_dma = 0;
>> + u32 num_ranges = nnodes;
>> + u32 page_count;
>> + u16 len;
>> + int err;
>> +
>> + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
>> + pds_vfio->vf_id);
>> +
>> + if (pds_vfio_dirty_is_enabled(pds_vfio))
>> + return -EINVAL;
>> +
>> + pds_vfio_dirty_set_enabled(pds_vfio);
>
> Any reason why this is set here? It looks to me you could set this at the
> end if everything goes well and avoid the goto out_set_disabled paths below.
> Not sure I am missing anything obvious here.
>
> Thanks,
> Shameer.
No, this is a good catch and simplifies this function a bit. I will
respin and fix this.
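Something like the below is what I have in mind for the respin (untested
sketch against this patch): drop the early pds_vfio_dirty_set_enabled()
and the out_set_disabled label, have the early error paths return
directly, and only mark tracking enabled once every setup step has
succeeded:

	err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
	if (err) {
		dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
			ERR_PTR(err));
		goto out_free_bitmaps;
	}

	dirty->region_start = region_start;
	dirty->region_size = region_size;
	dirty->region_page_size = region_page_size;

	/* only flip the flag once everything above has succeeded */
	pds_vfio_dirty_set_enabled(pds_vfio);

	pds_vfio_print_guest_region_info(pds_vfio, max_regions);

	kfree(region_info);

	return 0;

out_free_bitmaps:
	pds_vfio_dirty_free_bitmaps(dirty);
out_free_region_info:
	kfree(region_info);
	return err;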
Thanks for the review,
Brett
>> +
>> + /* find if dirty tracking is disabled, i.e. num_regions == 0 */
>> + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
>> + &num_regions);
>> + if (err < 0) {
>> + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
>> + ERR_PTR(err));
>> + goto out_set_disabled;
>> + } else if (num_regions) {
>> + dev_err(&pdev->dev,
>> + "Dirty tracking already enabled for %d regions\n",
>> + num_regions);
>> + err = -EEXIST;
>> + goto out_set_disabled;
>> + } else if (!max_regions) {
>> + dev_err(&pdev->dev,
>> + "Device doesn't support dirty tracking, max_regions %d\n",
>> + max_regions);
>> + err = -EOPNOTSUPP;
>> + goto out_set_disabled;
>> + }
>> +
>> + /*
>> + * Only support 1 region for now. If there are any large gaps in the
>> + * VM's address regions, then this would be a waste of memory as we are
>> + * generating 2 bitmaps (ack/seq) from the min address to the max
>> + * address of the VM's address regions. In the future, if we support
>> + * more than one region in the device/driver we can split the bitmaps
>> + * on the largest address region gaps. We can do this split up to the
>> + * max_regions times returned from the dirty_status command.
>> + */
>> + max_regions = 1;
>> + if (num_ranges > max_regions) {
>> + vfio_combine_iova_ranges(ranges, nnodes, max_regions);
>> + num_ranges = max_regions;
>> + }
>> +
>> + node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
>> + if (!node) {
>> + err = -EINVAL;
>> + goto out_set_disabled;
>> + }
>> +
>> + region_size = node->last - node->start + 1;
>> + region_start = node->start;
>> + region_page_size = *page_size;
>> +
>> + len = sizeof(*region_info);
>> + region_info = kzalloc(len, GFP_KERNEL);
>> + if (!region_info) {
>> + err = -ENOMEM;
>> + goto out_set_disabled;
>> + }
>> +
>> + page_count = DIV_ROUND_UP(region_size, region_page_size);
>> +
>> + region_info->dma_base = cpu_to_le64(region_start);
>> + region_info->page_count = cpu_to_le32(page_count);
>> + region_info->page_size_log2 = ilog2(region_page_size);
>> +
>> + regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
>> + DMA_BIDIRECTIONAL);
>> + if (dma_mapping_error(pdsc_dev, regions_dma)) {
>> + err = -ENOMEM;
>> + goto out_free_region_info;
>> + }
>> +
>> + err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
>> + dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
>> + if (err)
>> + goto out_free_region_info;
>> +
>> + /*
>> + * page_count might be adjusted by the device,
>> + * update it before freeing region_info DMA
>> + */
>> + page_count = le32_to_cpu(region_info->page_count);
>> +
>> + dev_dbg(&pdev->dev,
>> + "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u
>> page_size_log2 %u\n",
>> + regions_dma, region_start, page_count,
>> + (u8)ilog2(region_page_size));
>> +
>> + err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
>> + if (err) {
>> + dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
>> + ERR_PTR(err));
>> + goto out_free_region_info;
>> + }
>> +
>> + err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
>> + if (err) {
>> + dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
>> + ERR_PTR(err));
>> + goto out_free_bitmaps;
>> + }
>> +
>> + dirty->region_start = region_start;
>> + dirty->region_size = region_size;
>> + dirty->region_page_size = region_page_size;
>> +
>> + pds_vfio_print_guest_region_info(pds_vfio, max_regions);
>> +
>> + kfree(region_info);
>> +
>> + return 0;
>> +
>> +out_free_bitmaps:
>> + pds_vfio_dirty_free_bitmaps(dirty);
>> +out_free_region_info:
>> + kfree(region_info);
>> +out_set_disabled:
>> + pds_vfio_dirty_set_disabled(pds_vfio);
>> + return err;
>> +}
>> +
>> +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
>> +{
>> + if (pds_vfio_dirty_is_enabled(pds_vfio)) {
>> + pds_vfio_dirty_set_disabled(pds_vfio);
>> + if (send_cmd)
>> + pds_vfio_dirty_disable_cmd(pds_vfio);
>> + pds_vfio_dirty_free_sgl(pds_vfio);
>> + pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
>> + }
>> +
>> + if (send_cmd)
>> + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
>> +}
>> +
>> +static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
>> + struct pds_vfio_bmp_info *bmp_info,
>> + u32 offset, u32 bmp_bytes, bool read_seq)
>> +{
>> + const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
>> + u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
>> + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
>> + struct device *pdsc_dev = &pci_physfn(pdev)->dev;
>> + unsigned long long npages;
>> + struct sg_table sg_table;
>> + struct scatterlist *sg;
>> + struct page **pages;
>> + u32 page_offset;
>> + const void *bmp;
>> + size_t size;
>> + u16 num_sge;
>> + int err;
>> + int i;
>> +
>> + bmp = (void *)((u64)bmp_info->bmp + offset);
>> + page_offset = offset_in_page(bmp);
>> + bmp -= page_offset;
>> +
>> + /*
>> + * Start and end of bitmap section to seq/ack might not be page
>> + * aligned, so use the page_offset to account for that so there
>> + * will be enough pages to represent the bmp_bytes
>> + */
>> + npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
>> + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
>> + if (!pages)
>> + return -ENOMEM;
>> +
>> + for (unsigned long long i = 0; i < npages; i++) {
>> + struct page *page = vmalloc_to_page(bmp);
>> +
>> + if (!page) {
>> + err = -EFAULT;
>> + goto out_free_pages;
>> + }
>> +
>> + pages[i] = page;
>> + bmp += PAGE_SIZE;
>> + }
>> +
>> + err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
>> + bmp_bytes, GFP_KERNEL);
>> + if (err)
>> + goto out_free_pages;
>> +
>> + err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
>> + if (err)
>> + goto out_free_sg_table;
>> +
>> + for_each_sgtable_dma_sg(&sg_table, sg, i) {
>> + struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];
>> +
>> + sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
>> + sg_elem->len = cpu_to_le32(sg_dma_len(sg));
>> + }
>> +
>> + num_sge = sg_table.nents;
>> + size = num_sge * sizeof(struct pds_lm_sg_elem);
>> + dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
>> + err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
>> + offset, bmp_bytes, read_seq);
>> + if (err)
>> + dev_err(&pdev->dev,
>> + "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u
>> DMA 0x%llx: %pe\n",
>> + bmp_type_str, offset, bmp_bytes,
>> + num_sge, bmp_info->sgl_addr, ERR_PTR(err));
>> + dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
>> +
>> + dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
>> +out_free_sg_table:
>> + sg_free_table(&sg_table);
>> +out_free_pages:
>> + kfree(pages);
>> +
>> + return err;
>> +}
>> +
>> +static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
>> + u32 offset, u32 len)
>> +{
>> + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
>> + offset, len, WRITE_ACK);
>> +}
>> +
>> +static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
>> + u32 offset, u32 len)
>> +{
>> + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
>> + offset, len, READ_SEQ);
>> +}
>> +
>> +static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
>> + struct iova_bitmap *dirty_bitmap,
>> + u32 bmp_offset, u32 len_bytes)
>> +{
>> + u64 page_size = pds_vfio->dirty.region_page_size;
>> + u64 region_start = pds_vfio->dirty.region_start;
>> + u32 bmp_offset_bit;
>> + __le64 *seq, *ack;
>> + int dword_count;
>> +
>> + dword_count = len_bytes / sizeof(u64);
>> + seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
>> + ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
>> + bmp_offset_bit = bmp_offset * 8;
>> +
>> + for (int i = 0; i < dword_count; i++) {
>> + u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);
>> +
>> + /* prepare for next write_ack call */
>> + ack[i] = seq[i];
>> +
>> + for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
>> + if (xor & BIT(bit_i)) {
>> + u64 abs_bit_i = bmp_offset_bit +
>> + i * BITS_PER_TYPE(u64) + bit_i;
>> + u64 addr = abs_bit_i * page_size + region_start;
>> +
>> + iova_bitmap_set(dirty_bitmap, addr, page_size);
>> + }
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
>> + struct iova_bitmap *dirty_bitmap,
>> + unsigned long iova, unsigned long length)
>> +{
>> + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
>> + struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
>> + u64 bmp_offset, bmp_bytes;
>> + u64 bitmap_size, pages;
>> + int err;
>> +
>> + dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
>> +
>> + if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
>> + dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
>> + pds_vfio->vf_id);
>> + return -EINVAL;
>> + }
>> +
>> + pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
>> + bitmap_size =
>> + round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
>> +
>> + dev_dbg(dev,
>> + "vf%u: iova 0x%lx length %lu page_size %llu pages %llu
>> bitmap_size %llu\n",
>> + pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
>> + pages, bitmap_size);
>> +
>> + if (!length || ((dirty->region_start + iova + length) >
>> + (dirty->region_start + dirty->region_size))) {
>> + dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
>> + iova, length);
>> + return -EINVAL;
>> + }
>> +
>> + /* bitmap is modified in 64 bit chunks */
>> + bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
>> + sizeof(u64)),
>> + sizeof(u64));
>> + if (bmp_bytes != bitmap_size) {
>> + dev_err(dev,
>> + "Calculated bitmap bytes %llu not equal to bitmap
>> size %llu\n",
>> + bmp_bytes, bitmap_size);
>> + return -EINVAL;
>> + }
>> +
>> + bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64));
>> +
>> + dev_dbg(dev,
>> + "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu
>> bmp_bytes %llu\n",
>> + iova, length, bmp_offset, bmp_bytes);
>> +
>> + err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
>> + if (err)
>> + return err;
>> +
>> + err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
>> + bmp_bytes);
>> + if (err)
>> + return err;
>> +
>> + err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
>> + if (err)
>> + return err;
>> +
>> + return 0;
>> +}
>> +
>> +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
>> + unsigned long length, struct iova_bitmap *dirty)
>> +{
>> + struct pds_vfio_pci_device *pds_vfio =
>> + container_of(vdev, struct pds_vfio_pci_device,
>> + vfio_coredev.vdev);
>> + int err;
>> +
>> + mutex_lock(&pds_vfio->state_mutex);
>> + err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
>> + pds_vfio_state_mutex_unlock(pds_vfio);
>> +
>> + return err;
>> +}
>> +
>> +int pds_vfio_dma_logging_start(struct vfio_device *vdev,
>> + struct rb_root_cached *ranges, u32 nnodes,
>> + u64 *page_size)
>> +{
>> + struct pds_vfio_pci_device *pds_vfio =
>> + container_of(vdev, struct pds_vfio_pci_device,
>> + vfio_coredev.vdev);
>> + int err;
>> +
>> + mutex_lock(&pds_vfio->state_mutex);
>> + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
>> + err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
>> + pds_vfio_state_mutex_unlock(pds_vfio);
>> +
>> + return err;
>> +}
>> +
>> +int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
>> +{
>> + struct pds_vfio_pci_device *pds_vfio =
>> + container_of(vdev, struct pds_vfio_pci_device,
>> + vfio_coredev.vdev);
>> +
>> + mutex_lock(&pds_vfio->state_mutex);
>> + pds_vfio_dirty_disable(pds_vfio, true);
>> + pds_vfio_state_mutex_unlock(pds_vfio);
>> +
>> + return 0;
>> +}
>> diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
>> new file mode 100644
>> index 000000000000..f78da25d75ca
>> --- /dev/null
>> +++ b/drivers/vfio/pci/pds/dirty.h
>> @@ -0,0 +1,39 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
>> +
>> +#ifndef _DIRTY_H_
>> +#define _DIRTY_H_
>> +
>> +struct pds_vfio_bmp_info {
>> + unsigned long *bmp;
>> + u32 bmp_bytes;
>> + struct pds_lm_sg_elem *sgl;
>> + dma_addr_t sgl_addr;
>> + u16 num_sge;
>> +};
>> +
>> +struct pds_vfio_dirty {
>> + struct pds_vfio_bmp_info host_seq;
>> + struct pds_vfio_bmp_info host_ack;
>> + u64 region_size;
>> + u64 region_start;
>> + u64 region_page_size;
>> + bool is_enabled;
>> +};
>> +
>> +struct pds_vfio_pci_device;
>> +
>> +bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
>> +void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
>> +void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
>> +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
>> + bool send_cmd);
>> +
>> +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
>> + unsigned long length,
>> + struct iova_bitmap *dirty);
>> +int pds_vfio_dma_logging_start(struct vfio_device *vdev,
>> + struct rb_root_cached *ranges, u32 nnodes,
>> + u64 *page_size);
>> +int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
>> +#endif /* _DIRTY_H_ */
>> diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
>> index 7e319529cf74..aec75574cab3 100644
>> --- a/drivers/vfio/pci/pds/lm.c
>> +++ b/drivers/vfio/pci/pds/lm.c
>> @@ -371,7 +371,7 @@ pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
>>
>> if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
>> pds_vfio_put_save_file(pds_vfio);
>> - pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
>> + pds_vfio_dirty_disable(pds_vfio, true);
>> return NULL;
>> }
>>
>> diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
>> index b37ef96a7fd8..9e6a96b5db62 100644
>> --- a/drivers/vfio/pci/pds/vfio_dev.c
>> +++ b/drivers/vfio/pci/pds/vfio_dev.c
>> @@ -5,6 +5,7 @@
>> #include <linux/vfio_pci_core.h>
>>
>> #include "lm.h"
>> +#include "dirty.h"
>> #include "vfio_dev.h"
>>
>> struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
>> @@ -25,7 +26,7 @@ struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
>> vfio_coredev);
>> }
>>
>> -static void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
>> +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
>> {
>> again:
>> spin_lock(&pds_vfio->reset_lock);
>> @@ -35,6 +36,7 @@ static void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
>> pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
>> pds_vfio_put_restore_file(pds_vfio);
>> pds_vfio_put_save_file(pds_vfio);
>> + pds_vfio_dirty_disable(pds_vfio, false);
>> }
>> spin_unlock(&pds_vfio->reset_lock);
>> goto again;
>> @@ -117,6 +119,12 @@ static const struct vfio_migration_ops pds_vfio_lm_ops = {
>> .migration_get_data_size = pds_vfio_get_device_state_size
>> };
>>
>> +static const struct vfio_log_ops pds_vfio_log_ops = {
>> + .log_start = pds_vfio_dma_logging_start,
>> + .log_stop = pds_vfio_dma_logging_stop,
>> + .log_read_and_clear = pds_vfio_dma_logging_report,
>> +};
>> +
>> static int pds_vfio_init_device(struct vfio_device *vdev)
>> {
>> struct pds_vfio_pci_device *pds_vfio =
>> @@ -137,6 +145,7 @@ static int pds_vfio_init_device(struct vfio_device *vdev)
>>
>> vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
>> vdev->mig_ops = &pds_vfio_lm_ops;
>> + vdev->log_ops = &pds_vfio_log_ops;
>>
>> pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
>> dev_dbg(&pdev->dev,
>> @@ -175,6 +184,7 @@ static void pds_vfio_close_device(struct vfio_device *vdev)
>> mutex_lock(&pds_vfio->state_mutex);
>> pds_vfio_put_restore_file(pds_vfio);
>> pds_vfio_put_save_file(pds_vfio);
>> + pds_vfio_dirty_disable(pds_vfio, true);
>> mutex_unlock(&pds_vfio->state_mutex);
>> mutex_destroy(&pds_vfio->state_mutex);
>> vfio_pci_core_close_device(vdev);
>> diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
>> index 31bd14de0c91..8109fe101694 100644
>> --- a/drivers/vfio/pci/pds/vfio_dev.h
>> +++ b/drivers/vfio/pci/pds/vfio_dev.h
>> @@ -7,6 +7,7 @@
>> #include <linux/pci.h>
>> #include <linux/vfio_pci_core.h>
>>
>> +#include "dirty.h"
>> #include "lm.h"
>>
>> struct pdsc;
>> @@ -17,6 +18,7 @@ struct pds_vfio_pci_device {
>>
>> struct pds_vfio_lm_file *save_file;
>> struct pds_vfio_lm_file *restore_file;
>> + struct pds_vfio_dirty dirty;
>> struct mutex state_mutex; /* protect migration state */
>> enum vfio_device_mig_state state;
>> spinlock_t reset_lock; /* protect reset_done flow */
>> @@ -26,6 +28,8 @@ struct pds_vfio_pci_device {
>> u16 client_id;
>> };
>>
>> +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
>> +
>> const struct vfio_device_ops *pds_vfio_ops_info(void);
>> struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
>> void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
>> diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
>> index 9c79b3c8fc47..4b4e9a98b37b 100644
>> --- a/include/linux/pds/pds_adminq.h
>> +++ b/include/linux/pds/pds_adminq.h
>> @@ -835,6 +835,13 @@ enum pds_lm_cmd_opcode {
>> PDS_LM_CMD_RESUME = 20,
>> PDS_LM_CMD_SAVE = 21,
>> PDS_LM_CMD_RESTORE = 22,
>> +
>> + /* Dirty page tracking commands */
>> + PDS_LM_CMD_DIRTY_STATUS = 32,
>> + PDS_LM_CMD_DIRTY_ENABLE = 33,
>> + PDS_LM_CMD_DIRTY_DISABLE = 34,
>> + PDS_LM_CMD_DIRTY_READ_SEQ = 35,
>> + PDS_LM_CMD_DIRTY_WRITE_ACK = 36,
>> };
>>
>> /**
>> @@ -992,6 +999,172 @@ enum pds_lm_host_vf_status {
>> PDS_LM_STA_MAX,
>> };
>>
>> +/**
>> + * struct pds_lm_dirty_region_info - Memory region info for STATUS and ENABLE
>> + * @dma_base: Base address of the DMA-contiguous memory region
>> + * @page_count: Number of pages in the memory region
>> + * @page_size_log2: Log2 page size in the memory region
>> + * @rsvd: Word boundary padding
>> + */
>> +struct pds_lm_dirty_region_info {
>> + __le64 dma_base;
>> + __le32 page_count;
>> + u8 page_size_log2;
>> + u8 rsvd[3];
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_status_cmd - DIRTY_STATUS command
>> + * @opcode: Opcode PDS_LM_CMD_DIRTY_STATUS
>> + * @rsvd: Word boundary padding
>> + * @vf_id: VF id
>> + * @max_regions: Capacity of the region info buffer
>> + * @rsvd2: Word boundary padding
>> + * @regions_dma: DMA address of the region info buffer
>> + *
>> + * The minimum of max_regions (from the command) and num_regions (from the
>> + * completion) of struct pds_lm_dirty_region_info will be written to
>> + * regions_dma.
>> + *
>> + * The max_regions may be zero, in which case regions_dma is ignored. In that
>> + * case, the completion will only report the maximum number of regions
>> + * supported by the device, and the number of regions currently enabled.
>> + */
>> +struct pds_lm_dirty_status_cmd {
>> + u8 opcode;
>> + u8 rsvd;
>> + __le16 vf_id;
>> + u8 max_regions;
>> + u8 rsvd2[3];
>> + __le64 regions_dma;
>> +} __packed;
>> +
>> +/**
>> + * enum pds_lm_dirty_bmp_type - Type of dirty page bitmap
>> + * @PDS_LM_DIRTY_BMP_TYPE_NONE: No bitmap / disabled
>> + * @PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK: Seq/Ack bitmap representation
>> + */
>> +enum pds_lm_dirty_bmp_type {
>> + PDS_LM_DIRTY_BMP_TYPE_NONE = 0,
>> + PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK = 1,
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_status_comp - STATUS command completion
>> + * @status: Status of the command (enum pds_core_status_code)
>> + * @rsvd: Word boundary padding
>> + * @comp_index: Index in the desc ring for which this is the completion
>> + * @max_regions: Maximum number of regions supported by the device
>> + * @num_regions: Number of regions currently enabled
>> + * @bmp_type: Type of dirty bitmap representation
>> + * @rsvd2: Word boundary padding
>> + * @bmp_type_mask: Mask of supported bitmap types, bit index per type
>> + * @rsvd3: Word boundary padding
>> + * @color: Color bit
>> + *
>> + * This completion descriptor is used for STATUS, ENABLE, and DISABLE.
>> + */
>> +struct pds_lm_dirty_status_comp {
>> + u8 status;
>> + u8 rsvd;
>> + __le16 comp_index;
>> + u8 max_regions;
>> + u8 num_regions;
>> + u8 bmp_type;
>> + u8 rsvd2;
>> + __le32 bmp_type_mask;
>> + u8 rsvd3[3];
>> + u8 color;
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_enable_cmd - DIRTY_ENABLE command
>> + * @opcode: Opcode PDS_LM_CMD_DIRTY_ENABLE
>> + * @rsvd: Word boundary padding
>> + * @vf_id: VF id
>> + * @bmp_type: Type of dirty bitmap representation
>> + * @num_regions: Number of entries in the region info buffer
>> + * @rsvd2: Word boundary padding
>> + * @regions_dma: DMA address of the region info buffer
>> + *
>> + * The num_regions must be nonzero, and less than or equal to the maximum
>> + * number of regions supported by the device.
>> + *
>> + * The memory regions should not overlap.
>> + *
>> + * The information should be initialized by the driver. The device may modify
>> + * the information on successful completion, such as by size-aligning the
>> + * number of pages in a region.
>> + *
>> + * The modified number of pages will be greater than or equal to the page count
>> + * given in the enable command, and at least as coarsely aligned as the given
>> + * value. For example, the count might be aligned to a multiple of 64, but
>> + * if the value is already a multiple of 128 or higher, it will not change.
>> + * If the driver requires its own minimum alignment of the number of pages, the
>> + * driver should account for that already in the region info of this command.
>> + *
>> + * This command uses struct pds_lm_dirty_status_comp for its completion.
>> + */
>> +struct pds_lm_dirty_enable_cmd {
>> + u8 opcode;
>> + u8 rsvd;
>> + __le16 vf_id;
>> + u8 bmp_type;
>> + u8 num_regions;
>> + u8 rsvd2[2];
>> + __le64 regions_dma;
>> +} __packed;
>> +
>> +/**
>> + * struct pds_lm_dirty_disable_cmd - DIRTY_DISABLE command
>> + * @opcode: Opcode PDS_LM_CMD_DIRTY_DISABLE
>> + * @rsvd: Word boundary padding
>> + * @vf_id: VF id
>> + *
>> + * Dirty page tracking will be disabled. This may be called in any state, as
>> + * long as dirty page tracking is supported by the device, to ensure that dirty
>> + * page tracking is disabled.
>> + *
>> + * This command uses struct pds_lm_dirty_status_comp for its completion. On
>> + * success, num_regions will be zero.
>> + */
>> +struct pds_lm_dirty_disable_cmd {
>> + u8 opcode;
>> + u8 rsvd;
>> + __le16 vf_id;
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_seq_ack_cmd - DIRTY_READ_SEQ or _WRITE_ACK command
>> + * @opcode: Opcode PDS_LM_CMD_DIRTY_[READ_SEQ|WRITE_ACK]
>> + * @rsvd: Word boundary padding
>> + * @vf_id: VF id
>> + * @off_bytes: Byte offset in the bitmap
>> + * @len_bytes: Number of bytes to transfer
>> + * @num_sge: Number of DMA scatter gather elements
>> + * @rsvd2: Word boundary padding
>> + * @sgl_addr: DMA address of scatter gather list
>> + *
>> + * Read bytes from the SEQ bitmap, or write bytes into the ACK bitmap.
>> + *
>> + * This command treats the entire bitmap as a byte buffer. It does not
>> + * distinguish between guest memory regions. The driver should refer to the
>> + * number of pages in each region, according to PDS_LM_CMD_DIRTY_STATUS, to
>> + * determine the region boundaries in the bitmap. Each region will be
>> + * represented by exactly the number of bits as the page count for that region,
>> + * immediately following the last bit of the previous region.
>> + */
>> +struct pds_lm_dirty_seq_ack_cmd {
>> + u8 opcode;
>> + u8 rsvd;
>> + __le16 vf_id;
>> + __le32 off_bytes;
>> + __le32 len_bytes;
>> + __le16 num_sge;
>> + u8 rsvd2[2];
>> + __le64 sgl_addr;
>> +} __packed;
>> +
>> /**
>> * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
>> * @opcode: Opcode PDS_LM_CMD_HOST_VF_STATUS
>> @@ -1039,6 +1212,10 @@ union pds_core_adminq_cmd {
>> struct pds_lm_save_cmd lm_save;
>> struct pds_lm_restore_cmd lm_restore;
>> struct pds_lm_host_vf_status_cmd lm_host_vf_status;
>> + struct pds_lm_dirty_status_cmd lm_dirty_status;
>> + struct pds_lm_dirty_enable_cmd lm_dirty_enable;
>> + struct pds_lm_dirty_disable_cmd lm_dirty_disable;
>> + struct pds_lm_dirty_seq_ack_cmd lm_dirty_seq_ack;
>> };
>>
>> union pds_core_adminq_comp {
>> @@ -1065,6 +1242,7 @@ union pds_core_adminq_comp {
>> struct pds_vdpa_vq_reset_comp vdpa_vq_reset;
>>
>> struct pds_lm_state_size_comp lm_state_size;
>> + struct pds_lm_dirty_status_comp lm_dirty_status;
>> };
>>
>> #ifndef __CHECKER__
>> --
>> 2.17.1
>