lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Thu, 3 Aug 2023 12:53:09 -0700
From: Brett Creeley <bcreeley@....com>
To: Shameerali Kolothum Thodi <shameerali.kolothum.thodi@...wei.com>,
 Brett Creeley <brett.creeley@....com>,
 "kvm@...r.kernel.org" <kvm@...r.kernel.org>,
 "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
 "alex.williamson@...hat.com" <alex.williamson@...hat.com>,
 "jgg@...dia.com" <jgg@...dia.com>, "yishaih@...dia.com"
 <yishaih@...dia.com>, "kevin.tian@...el.com" <kevin.tian@...el.com>
Cc: "simon.horman@...igine.com" <simon.horman@...igine.com>,
 "shannon.nelson@....com" <shannon.nelson@....com>
Subject: Re: [PATCH v13 vfio 5/7] vfio/pds: Add support for dirty page
 tracking

On 8/3/2023 5:43 AM, Shameerali Kolothum Thodi wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
> 
> 
> Hi Brett,
> 
>> -----Original Message-----
>> From: Brett Creeley [mailto:brett.creeley@....com]
>> Sent: 25 July 2023 22:40
>> To: kvm@...r.kernel.org; netdev@...r.kernel.org;
>> alex.williamson@...hat.com; jgg@...dia.com; yishaih@...dia.com;
>> Shameerali Kolothum Thodi <shameerali.kolothum.thodi@...wei.com>;
>> kevin.tian@...el.com
>> Cc: simon.horman@...igine.com; brett.creeley@....com;
>> shannon.nelson@....com
>> Subject: [PATCH v13 vfio 5/7] vfio/pds: Add support for dirty page tracking
>>
> [...]
> 
>> +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
>> +                              struct rb_root_cached *ranges, u32 nnodes,
>> +                              u64 *page_size)
>> +{
>> +     struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
>> +     struct device *pdsc_dev = &pci_physfn(pdev)->dev;
>> +     struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
>> +     u64 region_start, region_size, region_page_size;
>> +     struct pds_lm_dirty_region_info *region_info;
>> +     struct interval_tree_node *node = NULL;
>> +     u8 max_regions = 0, num_regions;
>> +     dma_addr_t regions_dma = 0;
>> +     u32 num_ranges = nnodes;
>> +     u32 page_count;
>> +     u16 len;
>> +     int err;
>> +
>> +     dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
>> +             pds_vfio->vf_id);
>> +
>> +     if (pds_vfio_dirty_is_enabled(pds_vfio))
>> +             return -EINVAL;
>> +
>> +     pds_vfio_dirty_set_enabled(pds_vfio);
> 
> Any reason why this is set here? It looks to me like you could set this at
> the end, if everything goes well, and avoid the "goto out_set_disabled"
> paths below. Not sure if I am missing anything obvious here.
> 
> Thanks,
> Shameer.

No particular reason. This is a good catch and simplifies this function a bit.
I will respin and fix this.

Thanks for the review,

Brett
>> +
>> +     /* find if dirty tracking is disabled, i.e. num_regions == 0 */
>> +     err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
>> +                                     &num_regions);
>> +     if (err < 0) {
>> +             dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
>> +                     ERR_PTR(err));
>> +             goto out_set_disabled;
>> +     } else if (num_regions) {
>> +             dev_err(&pdev->dev,
>> +                     "Dirty tracking already enabled for %d regions\n",
>> +                     num_regions);
>> +             err = -EEXIST;
>> +             goto out_set_disabled;
>> +     } else if (!max_regions) {
>> +             dev_err(&pdev->dev,
>> +                     "Device doesn't support dirty tracking, max_regions %d\n",
>> +                     max_regions);
>> +             err = -EOPNOTSUPP;
>> +             goto out_set_disabled;
>> +     }
>> +
>> +     /*
>> +      * Only support 1 region for now. If there are any large gaps in the
>> +      * VM's address regions, then this would be a waste of memory as we are
>> +      * generating 2 bitmaps (ack/seq) from the min address to the max
>> +      * address of the VM's address regions. In the future, if we support
>> +      * more than one region in the device/driver we can split the bitmaps
>> +      * on the largest address region gaps. We can do this split up to the
>> +      * max_regions times returned from the dirty_status command.
>> +      */
>> +     max_regions = 1;
>> +     if (num_ranges > max_regions) {
>> +             vfio_combine_iova_ranges(ranges, nnodes, max_regions);
>> +             num_ranges = max_regions;
>> +     }
>> +
>> +     node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
>> +     if (!node) {
>> +             err = -EINVAL;
>> +             goto out_set_disabled;
>> +     }
>> +
>> +     region_size = node->last - node->start + 1;
>> +     region_start = node->start;
>> +     region_page_size = *page_size;
>> +
>> +     len = sizeof(*region_info);
>> +     region_info = kzalloc(len, GFP_KERNEL);
>> +     if (!region_info) {
>> +             err = -ENOMEM;
>> +             goto out_set_disabled;
>> +     }
>> +
>> +     page_count = DIV_ROUND_UP(region_size, region_page_size);
>> +
>> +     region_info->dma_base = cpu_to_le64(region_start);
>> +     region_info->page_count = cpu_to_le32(page_count);
>> +     region_info->page_size_log2 = ilog2(region_page_size);
>> +
>> +     regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
>> +                                  DMA_BIDIRECTIONAL);
>> +     if (dma_mapping_error(pdsc_dev, regions_dma)) {
>> +             err = -ENOMEM;
>> +             goto out_free_region_info;
>> +     }
>> +
>> +     err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
>> +     dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
>> +     if (err)
>> +             goto out_free_region_info;
>> +
>> +     /*
>> +      * page_count might be adjusted by the device,
>> +      * update it before freeing region_info DMA
>> +      */
>> +     page_count = le32_to_cpu(region_info->page_count);
>> +
>> +     dev_dbg(&pdev->dev,
>> +             "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u
>> page_size_log2 %u\n",
>> +             regions_dma, region_start, page_count,
>> +             (u8)ilog2(region_page_size));
>> +
>> +     err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
>> +     if (err) {
>> +             dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
>> +                     ERR_PTR(err));
>> +             goto out_free_region_info;
>> +     }
>> +
>> +     err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
>> +     if (err) {
>> +             dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
>> +                     ERR_PTR(err));
>> +             goto out_free_bitmaps;
>> +     }
>> +
>> +     dirty->region_start = region_start;
>> +     dirty->region_size = region_size;
>> +     dirty->region_page_size = region_page_size;
>> +
>> +     pds_vfio_print_guest_region_info(pds_vfio, max_regions);
>> +
>> +     kfree(region_info);
>> +
>> +     return 0;
>> +
>> +out_free_bitmaps:
>> +     pds_vfio_dirty_free_bitmaps(dirty);
>> +out_free_region_info:
>> +     kfree(region_info);
>> +out_set_disabled:
>> +     pds_vfio_dirty_set_disabled(pds_vfio);
>> +     return err;
>> +}
>> +
>> +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool
>> send_cmd)
>> +{
>> +     if (pds_vfio_dirty_is_enabled(pds_vfio)) {
>> +             pds_vfio_dirty_set_disabled(pds_vfio);
>> +             if (send_cmd)
>> +                     pds_vfio_dirty_disable_cmd(pds_vfio);
>> +             pds_vfio_dirty_free_sgl(pds_vfio);
>> +             pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
>> +     }
>> +
>> +     if (send_cmd)
>> +             pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
>> PDS_LM_STA_NONE);
>> +}
>> +
>> +static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
>> +                               struct pds_vfio_bmp_info *bmp_info,
>> +                               u32 offset, u32 bmp_bytes, bool read_seq)
>> +{
>> +     const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
>> +     u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
>> +     struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
>> +     struct device *pdsc_dev = &pci_physfn(pdev)->dev;
>> +     unsigned long long npages;
>> +     struct sg_table sg_table;
>> +     struct scatterlist *sg;
>> +     struct page **pages;
>> +     u32 page_offset;
>> +     const void *bmp;
>> +     size_t size;
>> +     u16 num_sge;
>> +     int err;
>> +     int i;
>> +
>> +     bmp = (void *)((u64)bmp_info->bmp + offset);
>> +     page_offset = offset_in_page(bmp);
>> +     bmp -= page_offset;
>> +
>> +     /*
>> +      * Start and end of bitmap section to seq/ack might not be page
>> +      * aligned, so use the page_offset to account for that so there
>> +      * will be enough pages to represent the bmp_bytes
>> +      */
>> +     npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
>> +     pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
>> +     if (!pages)
>> +             return -ENOMEM;
>> +
>> +     for (unsigned long long i = 0; i < npages; i++) {
>> +             struct page *page = vmalloc_to_page(bmp);
>> +
>> +             if (!page) {
>> +                     err = -EFAULT;
>> +                     goto out_free_pages;
>> +             }
>> +
>> +             pages[i] = page;
>> +             bmp += PAGE_SIZE;
>> +     }
>> +
>> +     err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
>> +                                     bmp_bytes, GFP_KERNEL);
>> +     if (err)
>> +             goto out_free_pages;
>> +
>> +     err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
>> +     if (err)
>> +             goto out_free_sg_table;
>> +
>> +     for_each_sgtable_dma_sg(&sg_table, sg, i) {
>> +             struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];
>> +
>> +             sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
>> +             sg_elem->len = cpu_to_le32(sg_dma_len(sg));
>> +     }
>> +
>> +     num_sge = sg_table.nents;
>> +     size = num_sge * sizeof(struct pds_lm_sg_elem);
>> +     dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size,
>> dma_dir);
>> +     err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr,
>> num_sge,
>> +                                      offset, bmp_bytes, read_seq);
>> +     if (err)
>> +             dev_err(&pdev->dev,
>> +                     "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u
>> DMA 0x%llx: %pe\n",
>> +                     bmp_type_str, offset, bmp_bytes,
>> +                     num_sge, bmp_info->sgl_addr, ERR_PTR(err));
>> +     dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size,
>> dma_dir);
>> +
>> +     dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
>> +out_free_sg_table:
>> +     sg_free_table(&sg_table);
>> +out_free_pages:
>> +     kfree(pages);
>> +
>> +     return err;
>> +}
>> +
>> +static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
>> +                                 u32 offset, u32 len)
>> +{
>> +     return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
>> +                                   offset, len, WRITE_ACK);
>> +}
>> +
>> +static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
>> +                                u32 offset, u32 len)
>> +{
>> +     return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
>> +                                   offset, len, READ_SEQ);
>> +}
>> +
>> +static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device
>> *pds_vfio,
>> +                                       struct iova_bitmap *dirty_bitmap,
>> +                                       u32 bmp_offset, u32 len_bytes)
>> +{
>> +     u64 page_size = pds_vfio->dirty.region_page_size;
>> +     u64 region_start = pds_vfio->dirty.region_start;
>> +     u32 bmp_offset_bit;
>> +     __le64 *seq, *ack;
>> +     int dword_count;
>> +
>> +     dword_count = len_bytes / sizeof(u64);
>> +     seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
>> +     ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
>> +     bmp_offset_bit = bmp_offset * 8;
>> +
>> +     for (int i = 0; i < dword_count; i++) {
>> +             u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);
>> +
>> +             /* prepare for next write_ack call */
>> +             ack[i] = seq[i];
>> +
>> +             for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
>> +                     if (xor & BIT(bit_i)) {
>> +                             u64 abs_bit_i = bmp_offset_bit +
>> +                                             i * BITS_PER_TYPE(u64) + bit_i;
>> +                             u64 addr = abs_bit_i * page_size + region_start;
>> +
>> +                             iova_bitmap_set(dirty_bitmap, addr, page_size);
>> +                     }
>> +             }
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>> +static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
>> +                            struct iova_bitmap *dirty_bitmap,
>> +                            unsigned long iova, unsigned long length)
>> +{
>> +     struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
>> +     struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
>> +     u64 bmp_offset, bmp_bytes;
>> +     u64 bitmap_size, pages;
>> +     int err;
>> +
>> +     dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
>> +
>> +     if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
>> +             dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
>> +                     pds_vfio->vf_id);
>> +             return -EINVAL;
>> +     }
>> +
>> +     pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
>> +     bitmap_size =
>> +             round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
>> +
>> +     dev_dbg(dev,
>> +             "vf%u: iova 0x%lx length %lu page_size %llu pages %llu
>> bitmap_size %llu\n",
>> +             pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
>> +             pages, bitmap_size);
>> +
>> +     if (!length || ((dirty->region_start + iova + length) >
>> +                     (dirty->region_start + dirty->region_size))) {
>> +             dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
>> +                     iova, length);
>> +             return -EINVAL;
>> +     }
>> +
>> +     /* bitmap is modified in 64 bit chunks */
>> +     bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
>> +                                    sizeof(u64)),
>> +                       sizeof(u64));
>> +     if (bmp_bytes != bitmap_size) {
>> +             dev_err(dev,
>> +                     "Calculated bitmap bytes %llu not equal to bitmap
>> size %llu\n",
>> +                     bmp_bytes, bitmap_size);
>> +             return -EINVAL;
>> +     }
>> +
>> +     bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size,
>> sizeof(u64));
>> +
>> +     dev_dbg(dev,
>> +             "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu
>> bmp_bytes %llu\n",
>> +             iova, length, bmp_offset, bmp_bytes);
>> +
>> +     err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
>> +     if (err)
>> +             return err;
>> +
>> +     err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap,
>> bmp_offset,
>> +                                          bmp_bytes);
>> +     if (err)
>> +             return err;
>> +
>> +     err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
>> +     if (err)
>> +             return err;
>> +
>> +     return 0;
>> +}
>> +
>> +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long
>> iova,
>> +                             unsigned long length, struct iova_bitmap *dirty)
>> +{
>> +     struct pds_vfio_pci_device *pds_vfio =
>> +             container_of(vdev, struct pds_vfio_pci_device,
>> +                          vfio_coredev.vdev);
>> +     int err;
>> +
>> +     mutex_lock(&pds_vfio->state_mutex);
>> +     err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
>> +     pds_vfio_state_mutex_unlock(pds_vfio);
>> +
>> +     return err;
>> +}
>> +
>> +int pds_vfio_dma_logging_start(struct vfio_device *vdev,
>> +                            struct rb_root_cached *ranges, u32 nnodes,
>> +                            u64 *page_size)
>> +{
>> +     struct pds_vfio_pci_device *pds_vfio =
>> +             container_of(vdev, struct pds_vfio_pci_device,
>> +                          vfio_coredev.vdev);
>> +     int err;
>> +
>> +     mutex_lock(&pds_vfio->state_mutex);
>> +     pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
>> PDS_LM_STA_IN_PROGRESS);
>> +     err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
>> +     pds_vfio_state_mutex_unlock(pds_vfio);
>> +
>> +     return err;
>> +}
>> +
>> +int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
>> +{
>> +     struct pds_vfio_pci_device *pds_vfio =
>> +             container_of(vdev, struct pds_vfio_pci_device,
>> +                          vfio_coredev.vdev);
>> +
>> +     mutex_lock(&pds_vfio->state_mutex);
>> +     pds_vfio_dirty_disable(pds_vfio, true);
>> +     pds_vfio_state_mutex_unlock(pds_vfio);
>> +
>> +     return 0;
>> +}
>> diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
>> new file mode 100644
>> index 000000000000..f78da25d75ca
>> --- /dev/null
>> +++ b/drivers/vfio/pci/pds/dirty.h
>> @@ -0,0 +1,39 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
>> +
>> +#ifndef _DIRTY_H_
>> +#define _DIRTY_H_
>> +
>> +struct pds_vfio_bmp_info {
>> +     unsigned long *bmp;
>> +     u32 bmp_bytes;
>> +     struct pds_lm_sg_elem *sgl;
>> +     dma_addr_t sgl_addr;
>> +     u16 num_sge;
>> +};
>> +
>> +struct pds_vfio_dirty {
>> +     struct pds_vfio_bmp_info host_seq;
>> +     struct pds_vfio_bmp_info host_ack;
>> +     u64 region_size;
>> +     u64 region_start;
>> +     u64 region_page_size;
>> +     bool is_enabled;
>> +};
>> +
>> +struct pds_vfio_pci_device;
>> +
>> +bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
>> +void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
>> +void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
>> +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
>> +                         bool send_cmd);
>> +
>> +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long
>> iova,
>> +                             unsigned long length,
>> +                             struct iova_bitmap *dirty);
>> +int pds_vfio_dma_logging_start(struct vfio_device *vdev,
>> +                            struct rb_root_cached *ranges, u32 nnodes,
>> +                            u64 *page_size);
>> +int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
>> +#endif /* _DIRTY_H_ */
>> diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
>> index 7e319529cf74..aec75574cab3 100644
>> --- a/drivers/vfio/pci/pds/lm.c
>> +++ b/drivers/vfio/pci/pds/lm.c
>> @@ -371,7 +371,7 @@ pds_vfio_step_device_state_locked(struct
>> pds_vfio_pci_device *pds_vfio,
>>
>>        if (cur == VFIO_DEVICE_STATE_STOP_COPY && next ==
>> VFIO_DEVICE_STATE_STOP) {
>>                pds_vfio_put_save_file(pds_vfio);
>> -             pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
>> PDS_LM_STA_NONE);
>> +             pds_vfio_dirty_disable(pds_vfio, true);
>>                return NULL;
>>        }
>>
>> diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
>> index b37ef96a7fd8..9e6a96b5db62 100644
>> --- a/drivers/vfio/pci/pds/vfio_dev.c
>> +++ b/drivers/vfio/pci/pds/vfio_dev.c
>> @@ -5,6 +5,7 @@
>>   #include <linux/vfio_pci_core.h>
>>
>>   #include "lm.h"
>> +#include "dirty.h"
>>   #include "vfio_dev.h"
>>
>>   struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
>> @@ -25,7 +26,7 @@ struct pds_vfio_pci_device
>> *pds_vfio_pci_drvdata(struct pci_dev *pdev)
>>                            vfio_coredev);
>>   }
>>
>> -static void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device
>> *pds_vfio)
>> +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
>>   {
>>   again:
>>        spin_lock(&pds_vfio->reset_lock);
>> @@ -35,6 +36,7 @@ static void pds_vfio_state_mutex_unlock(struct
>> pds_vfio_pci_device *pds_vfio)
>>                        pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
>>                        pds_vfio_put_restore_file(pds_vfio);
>>                        pds_vfio_put_save_file(pds_vfio);
>> +                     pds_vfio_dirty_disable(pds_vfio, false);
>>                }
>>                spin_unlock(&pds_vfio->reset_lock);
>>                goto again;
>> @@ -117,6 +119,12 @@ static const struct vfio_migration_ops
>> pds_vfio_lm_ops = {
>>        .migration_get_data_size = pds_vfio_get_device_state_size
>>   };
>>
>> +static const struct vfio_log_ops pds_vfio_log_ops = {
>> +     .log_start = pds_vfio_dma_logging_start,
>> +     .log_stop = pds_vfio_dma_logging_stop,
>> +     .log_read_and_clear = pds_vfio_dma_logging_report,
>> +};
>> +
>>   static int pds_vfio_init_device(struct vfio_device *vdev)
>>   {
>>        struct pds_vfio_pci_device *pds_vfio =
>> @@ -137,6 +145,7 @@ static int pds_vfio_init_device(struct vfio_device
>> *vdev)
>>
>>        vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
>> VFIO_MIGRATION_P2P;
>>        vdev->mig_ops = &pds_vfio_lm_ops;
>> +     vdev->log_ops = &pds_vfio_log_ops;
>>
>>        pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
>>        dev_dbg(&pdev->dev,
>> @@ -175,6 +184,7 @@ static void pds_vfio_close_device(struct vfio_device
>> *vdev)
>>        mutex_lock(&pds_vfio->state_mutex);
>>        pds_vfio_put_restore_file(pds_vfio);
>>        pds_vfio_put_save_file(pds_vfio);
>> +     pds_vfio_dirty_disable(pds_vfio, true);
>>        mutex_unlock(&pds_vfio->state_mutex);
>>        mutex_destroy(&pds_vfio->state_mutex);
>>        vfio_pci_core_close_device(vdev);
>> diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
>> index 31bd14de0c91..8109fe101694 100644
>> --- a/drivers/vfio/pci/pds/vfio_dev.h
>> +++ b/drivers/vfio/pci/pds/vfio_dev.h
>> @@ -7,6 +7,7 @@
>>   #include <linux/pci.h>
>>   #include <linux/vfio_pci_core.h>
>>
>> +#include "dirty.h"
>>   #include "lm.h"
>>
>>   struct pdsc;
>> @@ -17,6 +18,7 @@ struct pds_vfio_pci_device {
>>
>>        struct pds_vfio_lm_file *save_file;
>>        struct pds_vfio_lm_file *restore_file;
>> +     struct pds_vfio_dirty dirty;
>>        struct mutex state_mutex; /* protect migration state */
>>        enum vfio_device_mig_state state;
>>        spinlock_t reset_lock; /* protect reset_done flow */
>> @@ -26,6 +28,8 @@ struct pds_vfio_pci_device {
>>        u16 client_id;
>>   };
>>
>> +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
>> +
>>   const struct vfio_device_ops *pds_vfio_ops_info(void);
>>   struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
>>   void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
>> diff --git a/include/linux/pds/pds_adminq.h
>> b/include/linux/pds/pds_adminq.h
>> index 9c79b3c8fc47..4b4e9a98b37b 100644
>> --- a/include/linux/pds/pds_adminq.h
>> +++ b/include/linux/pds/pds_adminq.h
>> @@ -835,6 +835,13 @@ enum pds_lm_cmd_opcode {
>>        PDS_LM_CMD_RESUME          = 20,
>>        PDS_LM_CMD_SAVE            = 21,
>>        PDS_LM_CMD_RESTORE         = 22,
>> +
>> +     /* Dirty page tracking commands */
>> +     PDS_LM_CMD_DIRTY_STATUS    = 32,
>> +     PDS_LM_CMD_DIRTY_ENABLE    = 33,
>> +     PDS_LM_CMD_DIRTY_DISABLE   = 34,
>> +     PDS_LM_CMD_DIRTY_READ_SEQ  = 35,
>> +     PDS_LM_CMD_DIRTY_WRITE_ACK = 36,
>>   };
>>
>>   /**
>> @@ -992,6 +999,172 @@ enum pds_lm_host_vf_status {
>>        PDS_LM_STA_MAX,
>>   };
>>
>> +/**
>> + * struct pds_lm_dirty_region_info - Memory region info for STATUS and ENABLE
>> + * @dma_base:                Base address of the DMA-contiguous memory region
>> + * @page_count:              Number of pages in the memory region
>> + * @page_size_log2:  Log2 page size in the memory region
>> + * @rsvd:            Word boundary padding
>> + */
>> +struct pds_lm_dirty_region_info {
>> +     __le64 dma_base;
>> +     __le32 page_count;
>> +     u8     page_size_log2;
>> +     u8     rsvd[3];
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_status_cmd - DIRTY_STATUS command
>> + * @opcode:          Opcode PDS_LM_CMD_DIRTY_STATUS
>> + * @rsvd:            Word boundary padding
>> + * @vf_id:           VF id
>> + * @max_regions:     Capacity of the region info buffer
>> + * @rsvd2:           Word boundary padding
>> + * @regions_dma:     DMA address of the region info buffer
>> + *
>> + * The minimum of max_regions (from the command) and num_regions
>> (from the
>> + * completion) of struct pds_lm_dirty_region_info will be written to
>> + * regions_dma.
>> + *
>> + * The max_regions may be zero, in which case regions_dma is ignored.  In
>> that
>> + * case, the completion will only report the maximum number of regions
>> + * supported by the device, and the number of regions currently enabled.
>> + */
>> +struct pds_lm_dirty_status_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +     u8     max_regions;
>> +     u8     rsvd2[3];
>> +     __le64 regions_dma;
>> +} __packed;
>> +
>> +/**
>> + * enum pds_lm_dirty_bmp_type - Type of dirty page bitmap
>> + * @PDS_LM_DIRTY_BMP_TYPE_NONE: No bitmap / disabled
>> + * @PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK: Seq/Ack bitmap representation
>> + */
>> +enum pds_lm_dirty_bmp_type {
>> +     PDS_LM_DIRTY_BMP_TYPE_NONE     = 0,
>> +     PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK  = 1,
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_status_comp - STATUS command completion
>> + * @status:          Status of the command (enum pds_core_status_code)
>> + * @rsvd:            Word boundary padding
>> + * @comp_index:              Index in the desc ring for which this is the
>> completion
>> + * @max_regions:     Maximum number of regions supported by the
>> device
>> + * @num_regions:     Number of regions currently enabled
>> + * @bmp_type:                Type of dirty bitmap representation
>> + * @rsvd2:           Word boundary padding
>> + * @bmp_type_mask:   Mask of supported bitmap types, bit index per type
>> + * @rsvd3:           Word boundary padding
>> + * @color:           Color bit
>> + *
>> + * This completion descriptor is used for STATUS, ENABLE, and DISABLE.
>> + */
>> +struct pds_lm_dirty_status_comp {
>> +     u8     status;
>> +     u8     rsvd;
>> +     __le16 comp_index;
>> +     u8     max_regions;
>> +     u8     num_regions;
>> +     u8     bmp_type;
>> +     u8     rsvd2;
>> +     __le32 bmp_type_mask;
>> +     u8     rsvd3[3];
>> +     u8     color;
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_enable_cmd - DIRTY_ENABLE command
>> + * @opcode:          Opcode PDS_LM_CMD_DIRTY_ENABLE
>> + * @rsvd:            Word boundary padding
>> + * @vf_id:           VF id
>> + * @bmp_type:                Type of dirty bitmap representation
>> + * @num_regions:     Number of entries in the region info buffer
>> + * @rsvd2:           Word boundary padding
>> + * @regions_dma:     DMA address of the region info buffer
>> + *
>> + * The num_regions must be nonzero, and less than or equal to the
>> maximum
>> + * number of regions supported by the device.
>> + *
>> + * The memory regions should not overlap.
>> + *
>> + * The information should be initialized by the driver.  The device may
>> modify
>> + * the information on successful completion, such as by size-aligning the
>> + * number of pages in a region.
>> + *
>> + * The modified number of pages will be greater than or equal to the page count
>> + * given in the enable command, and at least as coarsely aligned as the given
>> + * value.  For example, the count might be aligned to a multiple of 64, but
>> + * if the value is already a multiple of 128 or higher, it will not change.
>> + * If the driver requires its own minimum alignment of the number of pages,
>> the
>> + * driver should account for that already in the region info of this command.
>> + *
>> + * This command uses struct pds_lm_dirty_status_comp for its completion.
>> + */
>> +struct pds_lm_dirty_enable_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +     u8     bmp_type;
>> +     u8     num_regions;
>> +     u8     rsvd2[2];
>> +     __le64 regions_dma;
>> +} __packed;
>> +
>> +/**
>> + * struct pds_lm_dirty_disable_cmd - DIRTY_DISABLE command
>> + * @opcode:  Opcode PDS_LM_CMD_DIRTY_DISABLE
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + *
>> + * Dirty page tracking will be disabled.  This may be called in any state, as
>> + * long as dirty page tracking is supported by the device, to ensure that dirty
>> + * page tracking is disabled.
>> + *
>> + * This command uses struct pds_lm_dirty_status_comp for its completion.
>> On
>> + * success, num_regions will be zero.
>> + */
>> +struct pds_lm_dirty_disable_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +};
>> +
>> +/**
>> + * struct pds_lm_dirty_seq_ack_cmd - DIRTY_READ_SEQ or _WRITE_ACK
>> command
>> + * @opcode:  Opcode PDS_LM_CMD_DIRTY_[READ_SEQ|WRITE_ACK]
>> + * @rsvd:    Word boundary padding
>> + * @vf_id:   VF id
>> + * @off_bytes:       Byte offset in the bitmap
>> + * @len_bytes:       Number of bytes to transfer
>> + * @num_sge: Number of DMA scatter gather elements
>> + * @rsvd2:   Word boundary padding
>> + * @sgl_addr:        DMA address of scatter gather list
>> + *
>> + * Read bytes from the SEQ bitmap, or write bytes into the ACK bitmap.
>> + *
>> + * This command treats the entire bitmap as a byte buffer.  It does not
>> + * distinguish between guest memory regions.  The driver should refer to
>> the
>> + * number of pages in each region, according to
>> PDS_LM_CMD_DIRTY_STATUS, to
>> + * determine the region boundaries in the bitmap.  Each region will be
>> + * represented by exactly the number of bits as the page count for that
>> region,
>> + * immediately following the last bit of the previous region.
>> + */
>> +struct pds_lm_dirty_seq_ack_cmd {
>> +     u8     opcode;
>> +     u8     rsvd;
>> +     __le16 vf_id;
>> +     __le32 off_bytes;
>> +     __le32 len_bytes;
>> +     __le16 num_sge;
>> +     u8     rsvd2[2];
>> +     __le64 sgl_addr;
>> +} __packed;
>> +
>>   /**
>>    * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
>>    * @opcode:  Opcode PDS_LM_CMD_HOST_VF_STATUS
>> @@ -1039,6 +1212,10 @@ union pds_core_adminq_cmd {
>>        struct pds_lm_save_cmd            lm_save;
>>        struct pds_lm_restore_cmd         lm_restore;
>>        struct pds_lm_host_vf_status_cmd  lm_host_vf_status;
>> +     struct pds_lm_dirty_status_cmd    lm_dirty_status;
>> +     struct pds_lm_dirty_enable_cmd    lm_dirty_enable;
>> +     struct pds_lm_dirty_disable_cmd   lm_dirty_disable;
>> +     struct pds_lm_dirty_seq_ack_cmd   lm_dirty_seq_ack;
>>   };
>>
>>   union pds_core_adminq_comp {
>> @@ -1065,6 +1242,7 @@ union pds_core_adminq_comp {
>>        struct pds_vdpa_vq_reset_comp     vdpa_vq_reset;
>>
>>        struct pds_lm_state_size_comp     lm_state_size;
>> +     struct pds_lm_dirty_status_comp   lm_dirty_status;
>>   };
>>
>>   #ifndef __CHECKER__
>> --
>> 2.17.1
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ