Message-ID: <265e3448-2e8e-c38b-e625-1546ae3d408b@linux.ibm.com>
Date: Wed, 19 Jan 2022 10:29:02 +0100
From: Pierre Morel <pmorel@...ux.ibm.com>
To: Matthew Rosato <mjrosato@...ux.ibm.com>, linux-s390@...r.kernel.org
Cc: alex.williamson@...hat.com, cohuck@...hat.com,
schnelle@...ux.ibm.com, farman@...ux.ibm.com,
borntraeger@...ux.ibm.com, hca@...ux.ibm.com, gor@...ux.ibm.com,
gerald.schaefer@...ux.ibm.com, agordeev@...ux.ibm.com,
frankja@...ux.ibm.com, david@...hat.com, imbrenda@...ux.ibm.com,
vneethv@...ux.ibm.com, oberpar@...ux.ibm.com, freude@...ux.ibm.com,
thuth@...hat.com, pasic@...ux.ibm.com, kvm@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 21/30] KVM: s390: pci: handle refresh of PCI
translations
On 1/14/22 21:31, Matthew Rosato wrote:
> Add a routine that will perform a shadow operation between a guest
> and host IOAT. A subsequent patch will invoke this in response to
> an 04 RPCIT instruction intercept.
>
> Signed-off-by: Matthew Rosato <mjrosato@...ux.ibm.com>
> ---
> arch/s390/include/asm/kvm_pci.h | 1 +
> arch/s390/include/asm/pci_dma.h | 1 +
> arch/s390/kvm/pci.c | 208 +++++++++++++++++++++++++++++++-
> arch/s390/kvm/pci.h | 8 +-
> 4 files changed, 216 insertions(+), 2 deletions(-)
>
> diff --git a/arch/s390/include/asm/kvm_pci.h b/arch/s390/include/asm/kvm_pci.h
> index 770849f13a70..fa90729a35cf 100644
> --- a/arch/s390/include/asm/kvm_pci.h
> +++ b/arch/s390/include/asm/kvm_pci.h
> @@ -30,6 +30,7 @@ struct kvm_zdev_ioat {
> struct kvm_zdev {
> struct zpci_dev *zdev;
> struct kvm *kvm;
> + u64 rpcit_count;
> struct kvm_zdev_ioat ioat;
> struct zpci_fib fib;
> };
> diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
> index 69e616d0712c..38004e0a4383 100644
> --- a/arch/s390/include/asm/pci_dma.h
> +++ b/arch/s390/include/asm/pci_dma.h
> @@ -52,6 +52,7 @@ enum zpci_ioat_dtype {
> #define ZPCI_TABLE_ENTRIES (ZPCI_TABLE_SIZE / ZPCI_TABLE_ENTRY_SIZE)
> #define ZPCI_TABLE_PAGES (ZPCI_TABLE_SIZE >> PAGE_SHIFT)
> #define ZPCI_TABLE_ENTRIES_PAGES (ZPCI_TABLE_ENTRIES * ZPCI_TABLE_PAGES)
> +#define ZPCI_TABLE_ENTRIES_PER_PAGE (ZPCI_TABLE_ENTRIES / ZPCI_TABLE_PAGES)
>
> #define ZPCI_TABLE_BITS 11
> #define ZPCI_PT_BITS 8
> diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
> index 39c13c25a700..38d2b77ec565 100644
> --- a/arch/s390/kvm/pci.c
> +++ b/arch/s390/kvm/pci.c
> @@ -149,6 +149,208 @@ int kvm_s390_pci_aen_init(u8 nisc)
> return rc;
> }
>
> +static int dma_shadow_cpu_trans(struct kvm_vcpu *vcpu, unsigned long *entry,
> + unsigned long *gentry)
> +{
> + phys_addr_t gaddr = 0;
> + unsigned long idx;
> + struct page *page;
> + kvm_pfn_t pfn;
> + gpa_t addr;
> + int rc = 0;
> +
> + if (pt_entry_isvalid(*gentry)) {
> + /* pin and validate */
> + addr = *gentry & ZPCI_PTE_ADDR_MASK;
> + idx = srcu_read_lock(&vcpu->kvm->srcu);
> + page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + if (is_error_page(page))
> + return -EIO;
> + gaddr = page_to_phys(page) + (addr & ~PAGE_MASK);
> + }
> +
> + if (pt_entry_isvalid(*entry)) {
> + /* Either we are invalidating, replacing or no-op */
> + if (gaddr != 0) {
> + if ((*entry & ZPCI_PTE_ADDR_MASK) == gaddr) {
> + /* Duplicate */
> + kvm_release_pfn_dirty(*entry >> PAGE_SHIFT);
> + } else {
> + /* Replace */
> + pfn = (*entry >> PAGE_SHIFT);
> + invalidate_pt_entry(entry);
> + set_pt_pfaa(entry, gaddr);
> + validate_pt_entry(entry);
> + kvm_release_pfn_dirty(pfn);
> + rc = 1;
> + }
> + } else {
> + /* Invalidate */
> + pfn = (*entry >> PAGE_SHIFT);
> + invalidate_pt_entry(entry);
> + kvm_release_pfn_dirty(pfn);
> + rc = 1;
> + }
> + } else if (gaddr != 0) {
> + /* New Entry */
> + set_pt_pfaa(entry, gaddr);
> + validate_pt_entry(entry);
> + }
> +
> + return rc;
> +}
> +
> +static unsigned long *dma_walk_guest_cpu_trans(struct kvm_vcpu *vcpu,
> + struct kvm_zdev_ioat *ioat,
> + dma_addr_t dma_addr)
> +{
> + unsigned long *rto, *sto, *pto;
> + unsigned int rtx, rts, sx, px, idx;
> + struct page *page;
> + gpa_t addr;
> + int i;
> +
> + /* Pin guest segment table if needed */
> + rtx = calc_rtx(dma_addr);
> + rto = ioat->head[(rtx / ZPCI_TABLE_ENTRIES_PER_PAGE)];
> + rts = rtx * ZPCI_TABLE_PAGES;
> + if (!ioat->seg[rts]) {
> + if (!reg_entry_isvalid(rto[rtx % ZPCI_TABLE_ENTRIES_PER_PAGE]))
> + return NULL;
> + sto = get_rt_sto(rto[rtx % ZPCI_TABLE_ENTRIES_PER_PAGE]);
> + addr = ((u64)sto & ZPCI_RTE_ADDR_MASK);
> + idx = srcu_read_lock(&vcpu->kvm->srcu);
> + for (i = 0; i < ZPCI_TABLE_PAGES; i++) {
> + page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
> + if (is_error_page(page)) {
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + return NULL;
> + }
> + ioat->seg[rts + i] = page_to_virt(page) +
> + (addr & ~PAGE_MASK);
> + addr += PAGE_SIZE;
> + }
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + }
> +
> + /* Allocate pin pointers for another segment table if needed */
> + if (!ioat->pt[rtx]) {
> + ioat->pt[rtx] = kcalloc(ZPCI_TABLE_ENTRIES,
> + (sizeof(unsigned long *)), GFP_KERNEL);
> + if (!ioat->pt[rtx])
> + return NULL;
> + }
> + /* Pin guest page table if needed */
> + sx = calc_sx(dma_addr);
> + sto = ioat->seg[(rts + (sx / ZPCI_TABLE_ENTRIES_PER_PAGE))];
> + if (!ioat->pt[rtx][sx]) {
> + if (!reg_entry_isvalid(sto[sx % ZPCI_TABLE_ENTRIES_PER_PAGE]))
> + return NULL;
> + pto = get_st_pto(sto[sx % ZPCI_TABLE_ENTRIES_PER_PAGE]);
> + if (!pto)
> + return NULL;
> + addr = ((u64)pto & ZPCI_STE_ADDR_MASK);
> + idx = srcu_read_lock(&vcpu->kvm->srcu);
> + page = gfn_to_page(vcpu->kvm, gpa_to_gfn(addr));
> + srcu_read_unlock(&vcpu->kvm->srcu, idx);
> + if (is_error_page(page))
> + return NULL;
> + ioat->pt[rtx][sx] = page_to_virt(page) + (addr & ~PAGE_MASK);
> + }
> + pto = ioat->pt[rtx][sx];
> +
> + /* Return guest PTE */
> + px = calc_px(dma_addr);
> + return &pto[px];
> +}
> +
> +
> +static int dma_table_shadow(struct kvm_vcpu *vcpu, struct zpci_dev *zdev,
> + dma_addr_t dma_addr, size_t size)
> +{
> + unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
> + struct kvm_zdev *kzdev = zdev->kzdev;
> + unsigned long *entry, *gentry;
> + int i, rc = 0, rc2;
> +
> + if (!nr_pages || !kzdev)
> + return -EINVAL;
> +
> + mutex_lock(&kzdev->ioat.lock);
> + if (!zdev->dma_table || !kzdev->ioat.head[0]) {
> + rc = -EINVAL;
> + goto out_unlock;
> + }
> +
> + for (i = 0; i < nr_pages; i++) {
> + gentry = dma_walk_guest_cpu_trans(vcpu, &kzdev->ioat, dma_addr);
> + if (!gentry)
> + continue;
> + entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
> +
> + if (!entry) {
> + rc = -ENOMEM;
> + goto out_unlock;
> + }
> +
> + rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
> + if (rc2 < 0) {
> + rc = -EIO;
> + goto out_unlock;
> + }
> + dma_addr += PAGE_SIZE;
> + rc += rc2;
> + }
> +
In case of error, shouldn't we invalidate the shadow table entries we
already validated before hitting the error?
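Something along these lines maybe - only a rough sketch, the out_unwind
label and the local I use to remember the start of the range are
invented, the loop just reuses the helpers already used in this patch:

	/* in dma_table_shadow(), remember where the range started */
	dma_addr_t start = dma_addr;
	...
		rc2 = dma_shadow_cpu_trans(vcpu, entry, gentry);
		if (rc2 < 0) {
			rc = -EIO;
			goto out_unwind;
		}
	...
out_unwind:
	/* drop the shadow entries we already validated for this range */
	while (dma_addr > start) {
		dma_addr -= PAGE_SIZE;
		entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
		if (entry && pt_entry_isvalid(*entry)) {
			kvm_release_pfn_dirty(*entry >> PAGE_SHIFT);
			invalidate_pt_entry(entry);
		}
	}
out_unlock:
	mutex_unlock(&kzdev->ioat.lock);
	return rc;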
> +out_unlock:
> + mutex_unlock(&kzdev->ioat.lock);
> + return rc;
> +}
> +
> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
> + unsigned long start, unsigned long size,
> + u8 *status)
> +{
> + struct zpci_dev *zdev;
> + u32 fh = req >> 32;
> + int rc;
> +
> + /* Make sure this is a valid device associated with this guest */
> + zdev = get_zdev_by_fh(fh);
> + if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
> + *status = 0;
Wouldn't it be interesting to add some debug information here?
When would this case appear?
Also, if we hit this error it looks like we have a VM problem;
shouldn't we handle this in QEMU and return -EOPNOTSUPP?
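Something like this maybe - just a sketch to illustrate the idea,
whether zpci_dbg is the right tracing facility to use from here and the
exact message text are open:

	zdev = get_zdev_by_fh(fh);
	if (!zdev || !zdev->kzdev || zdev->kzdev->kvm != vcpu->kvm) {
		/* guest issued RPCIT for a device it does not own */
		zpci_dbg(3, "RPCIT for unknown/foreign fh:%x\n", fh);
		*status = 0;
		return -EOPNOTSUPP;
	}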
> + return -EINVAL;
> + }
> +
> + /* Only proceed if the device is using the assist */
> + if (zdev->kzdev->ioat.head[0] == 0)
> + return -EOPNOTSUPP;
> +
> + rc = dma_table_shadow(vcpu, zdev, start, size);
> + if (rc < 0) {
> + /*
> + * If errors encountered during shadow operations, we must
> + * fabricate status to present to the guest
> + */
> + switch (rc) {
> + case -ENOMEM:
> + *status = KVM_S390_RPCIT_INS_RES;
> + break;
> + default:
> + *status = KVM_S390_RPCIT_ERR;
> + break;
> + }
> + } else if (rc > 0) {
> + /* Host RPCIT must be issued */
> + rc = zpci_refresh_trans((u64) zdev->fh << 32, start, size,
> + status);
> + }
> + zdev->kzdev->rpcit_count++;
> +
> + return rc;
> +}
> +
> /* Modify PCI: Register floating adapter interruption forwarding */
> static int kvm_zpci_set_airq(struct zpci_dev *zdev)
> {
> @@ -620,6 +822,8 @@ EXPORT_SYMBOL_GPL(kvm_s390_pci_attach_kvm);
>
> int kvm_s390_pci_init(void)
> {
> + int rc;
> +
> aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL);
> if (!aift)
> return -ENOMEM;
> @@ -627,5 +831,7 @@ int kvm_s390_pci_init(void)
> spin_lock_init(&aift->gait_lock);
> mutex_init(&aift->lock);
>
> - return 0;
> + rc = zpci_get_mdd(&aift->mdd);
> +
> + return rc;
> }
> diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
> index 54355634df82..bb2be7fc3934 100644
> --- a/arch/s390/kvm/pci.h
> +++ b/arch/s390/kvm/pci.h
> @@ -18,6 +18,9 @@
>
> #define KVM_S390_PCI_DTSM_MASK 0x40
>
> +#define KVM_S390_RPCIT_INS_RES 0x10
> +#define KVM_S390_RPCIT_ERR 0x28
> +
> struct zpci_gaite {
> u32 gisa;
> u8 gisc;
> @@ -33,6 +36,7 @@ struct zpci_aift {
> struct kvm_zdev **kzdev;
> spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
> struct mutex lock; /* Protects the other structures in aift */
> + u32 mdd;
> };
>
> extern struct zpci_aift *aift;
> @@ -47,7 +51,9 @@ static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
>
> int kvm_s390_pci_aen_init(u8 nisc);
> void kvm_s390_pci_aen_exit(void);
> -
> +int kvm_s390_pci_refresh_trans(struct kvm_vcpu *vcpu, unsigned long req,
> + unsigned long start, unsigned long end,
> + u8 *status);
> int kvm_s390_pci_init(void);
>
> #endif /* __KVM_S390_PCI_H */
>
--
Pierre Morel
IBM Lab Boeblingen