Message-ID: <20251120171544.96841-20-imbrenda@linux.ibm.com>
Date: Thu, 20 Nov 2025 18:15:40 +0100
From: Claudio Imbrenda <imbrenda@...ux.ibm.com>
To: kvm@...r.kernel.org
Cc: linux-kernel@...r.kernel.org, linux-s390@...r.kernel.org,
borntraeger@...ibm.com, frankja@...ux.ibm.com, nsg@...ux.ibm.com,
nrb@...ux.ibm.com, seiden@...ux.ibm.com, gra@...ux.ibm.com,
schlameuss@...ux.ibm.com, hca@...ux.ibm.com, svens@...ux.ibm.com,
agordeev@...ux.ibm.com, gor@...ux.ibm.com, david@...hat.com,
gerald.schaefer@...ux.ibm.com
Subject: [PATCH v4 19/23] KVM: s390: Switch to new gmap
Switch KVM/s390 to use the new gmap code.
Remove includes of <asm/gmap.h> and include "gmap.h" instead; convert all
existing users of the old gmap functions to the new ones.
Fix guest storage key access functions to work with the new gmap.
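For illustration only (not part of the diff), a minimal sketch of the new
storage key lookup pattern used throughout this patch; the example function
name is made up, while dat_get_storage_key() and union skey come from the
new dat/gmap code introduced earlier in this series:

	static int example_read_acc(struct kvm *kvm, gpa_t gpa, u8 *acc)
	{
		union skey skey;
		int rc;

		/* Walk the gmap DAT tables under mmu_lock; no hva lookup needed. */
		scoped_guard(read_lock, &kvm->mmu_lock)
			rc = dat_get_storage_key(kvm->arch.gmap->asce,
						 gpa_to_gfn(gpa), &skey);
		if (rc)
			return rc;
		*acc = skey.acc;
		return 0;
	}

This replaces the old pattern of resolving an hva and calling
get_guest_storage_key() on current->mm under the mmap lock.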
Signed-off-by: Claudio Imbrenda <imbrenda@...ux.ibm.com>
---
arch/s390/Kconfig | 2 +-
arch/s390/include/asm/kvm_host.h | 5 +-
arch/s390/include/asm/mmu_context.h | 4 -
arch/s390/include/asm/tlb.h | 3 -
arch/s390/include/asm/uaccess.h | 70 +--
arch/s390/kvm/Makefile | 2 +-
arch/s390/kvm/diag.c | 2 +-
arch/s390/kvm/gaccess.c | 866 +++++++++++++++++-----------
arch/s390/kvm/gaccess.h | 18 +-
arch/s390/kvm/gmap-vsie.c | 141 -----
arch/s390/kvm/gmap.c | 6 +-
arch/s390/kvm/intercept.c | 15 +-
arch/s390/kvm/interrupt.c | 2 +-
arch/s390/kvm/kvm-s390.c | 757 +++++++-----------------
arch/s390/kvm/kvm-s390.h | 20 +-
arch/s390/kvm/priv.c | 211 +++----
arch/s390/kvm/pv.c | 64 +-
arch/s390/kvm/vsie.c | 153 +++--
arch/s390/lib/uaccess.c | 184 +-----
arch/s390/mm/gmap_helpers.c | 29 -
20 files changed, 991 insertions(+), 1563 deletions(-)
delete mode 100644 arch/s390/kvm/gmap-vsie.c
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index df22b10d9141..3b4ba19a3611 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -33,7 +33,7 @@ config GENERIC_LOCKBREAK
def_bool y if PREEMPTION
config PGSTE
- def_bool y if KVM
+ def_bool n
config AUDIT_ARCH
def_bool y
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 958a3b8c32d1..9abaa23bbb76 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -441,7 +441,7 @@ struct kvm_vcpu_arch {
bool acrs_loaded;
struct kvm_s390_pv_vcpu pv;
union diag318_info diag318_info;
- void *mc; /* Placeholder */
+ struct kvm_s390_mmu_cache *mc;
};
struct kvm_vm_stat {
@@ -633,6 +633,8 @@ struct kvm_s390_pv {
struct mmu_notifier mmu_notifier;
};
+struct kvm_s390_mmu_cache;
+
struct kvm_arch{
void *sca;
int use_esca;
@@ -673,6 +675,7 @@ struct kvm_arch{
struct kvm_s390_pv pv;
struct list_head kzdev_list;
spinlock_t kzdev_list_lock;
+ struct kvm_s390_mmu_cache *mc;
};
#define KVM_HVA_ERR_BAD (-1UL)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 48e548c01daa..bd1ef5e2d2eb 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -30,11 +30,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.gmap_asce = 0;
mm->context.flush_mm = 0;
#if IS_ENABLED(CONFIG_KVM)
- mm->context.has_pgste = 0;
- mm->context.uses_skeys = 0;
- mm->context.uses_cmm = 0;
mm->context.allow_cow_sharing = 1;
- mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
default:
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 1e50f6f1ad9d..7354b42ee994 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -36,7 +36,6 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
#include <asm/tlbflush.h>
#include <asm-generic/tlb.h>
-#include <asm/gmap.h>
/*
* Release the page cache reference for a pte removed by
@@ -85,8 +84,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
tlb->cleared_pmds = 1;
- if (mm_has_pgste(tlb->mm))
- gmap_unlink(tlb->mm, (unsigned long *)pte, address);
tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte));
}
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 3e5b8b677057..6380e03cfb62 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -471,65 +471,15 @@ do { \
#define __get_kernel_nofault __mvc_kernel_nofault
#define __put_kernel_nofault __mvc_kernel_nofault
-void __cmpxchg_user_key_called_with_bad_pointer(void);
-
-int __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
- unsigned char old, unsigned char new, unsigned long key);
-int __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
- unsigned short old, unsigned short new, unsigned long key);
-int __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
- unsigned int old, unsigned int new, unsigned long key);
-int __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
- unsigned long old, unsigned long new, unsigned long key);
-int __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
- __uint128_t old, __uint128_t new, unsigned long key);
-
-static __always_inline int _cmpxchg_user_key(unsigned long address, void *uval,
- __uint128_t old, __uint128_t new,
- unsigned long key, int size)
-{
- switch (size) {
- case 1: return __cmpxchg_user_key1(address, uval, old, new, key);
- case 2: return __cmpxchg_user_key2(address, uval, old, new, key);
- case 4: return __cmpxchg_user_key4(address, uval, old, new, key);
- case 8: return __cmpxchg_user_key8(address, uval, old, new, key);
- case 16: return __cmpxchg_user_key16(address, uval, old, new, key);
- default: __cmpxchg_user_key_called_with_bad_pointer();
- }
- return 0;
-}
-
-/**
- * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys
- * @ptr: User space address of value to compare to @old and exchange with
- * @new. Must be aligned to sizeof(*@ptr).
- * @uval: Address where the old value of *@ptr is written to.
- * @old: Old value. Compared to the content pointed to by @ptr in order to
- * determine if the exchange occurs. The old value read from *@ptr is
- * written to *@uval.
- * @new: New value to place at *@ptr.
- * @key: Access key to use for checking storage key protection.
- *
- * Perform a cmpxchg on a user space target, honoring storage key protection.
- * @key alone determines how key checking is performed, neither
- * storage-protection-override nor fetch-protection-override apply.
- * The caller must compare *@uval and @old to determine if values have been
- * exchanged. In case of an exception *@uval is set to zero.
- *
- * Return: 0: cmpxchg executed
- * -EFAULT: an exception happened when trying to access *@ptr
- * -EAGAIN: maxed out number of retries (byte and short only)
- */
-#define cmpxchg_user_key(ptr, uval, old, new, key) \
-({ \
- __typeof__(ptr) __ptr = (ptr); \
- __typeof__(uval) __uval = (uval); \
- \
- BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \
- might_fault(); \
- __chk_user_ptr(__ptr); \
- _cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \
- (old), (new), (key), sizeof(*(__ptr))); \
-})
+int __cmpxchg_key1(void *address, unsigned char *uval, unsigned char old,
+ unsigned char new, unsigned long key);
+int __cmpxchg_key2(void *address, unsigned short *uval, unsigned short old,
+ unsigned short new, unsigned long key);
+int __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old,
+ unsigned int new, unsigned long key);
+int __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old,
+ unsigned long new, unsigned long key);
+int __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old,
+ __uint128_t new, unsigned long key);
#endif /* __S390_UACCESS_H */
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 1e2dcd3e2436..dac9d53b23d8 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
kvm-y += dat.o gmap.o faultin.o
kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 53233dec8cad..d89d1c381522 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -10,13 +10,13 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
-#include <asm/gmap.h>
#include <asm/gmap_helpers.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
#include "trace.h"
#include "trace-s390.h"
#include "gaccess.h"
+#include "gmap.h"
static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end)
{
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index a054de80a5cc..0c70f46ae323 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -11,15 +11,43 @@
#include <linux/err.h>
#include <linux/pgtable.h>
#include <linux/bitfield.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <asm/diag.h>
#include <asm/access-regs.h>
#include <asm/fault.h>
-#include <asm/gmap.h>
#include <asm/dat-bits.h>
#include "kvm-s390.h"
+#include "dat.h"
+#include "gmap.h"
#include "gaccess.h"
+#include "faultin.h"
#define GMAP_SHADOW_FAKE_TABLE 1ULL
+union dat_table_entry {
+ unsigned long val;
+ union region1_table_entry pgd;
+ union region2_table_entry p4d;
+ union region3_table_entry pud;
+ union segment_table_entry pmd;
+ union page_table_entry pte;
+};
+
+#define WALK_N_ENTRIES 7
+#define LEVEL_MEM -2
+struct pgtwalk {
+ struct guest_fault raw_entries[WALK_N_ENTRIES];
+ gpa_t last_addr;
+ int level;
+ bool p;
+};
+
+static inline struct guest_fault *get_entries(struct pgtwalk *w)
+{
+ return w->raw_entries - LEVEL_MEM;
+}
+
/*
* raddress union which will contain the result (real or absolute address)
* after a page table walk. The rfaa, sfaa and pfra members are used to
@@ -81,6 +109,28 @@ struct aste {
/* .. more fields there */
};
+union oac {
+ unsigned int val;
+ struct {
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac1;
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac2;
+ };
+};
+
int ipte_lock_held(struct kvm *kvm)
{
if (sclp.has_siif) {
@@ -618,28 +668,16 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key,
enum gacc_mode mode, gpa_t gpa)
{
- u8 storage_key, access_control;
- bool fetch_protected;
- unsigned long hva;
+ union skey storage_key;
int r;
- if (access_key == 0)
- return 0;
-
- hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
- if (kvm_is_error_hva(hva))
- return PGM_ADDRESSING;
-
- mmap_read_lock(current->mm);
- r = get_guest_storage_key(current->mm, hva, &storage_key);
- mmap_read_unlock(current->mm);
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key);
if (r)
return r;
- access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
- if (access_control == access_key)
+ if (access_key == 0 || storage_key.acc == access_key)
return 0;
- fetch_protected = storage_key & _PAGE_FP_BIT;
- if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected)
+ if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp)
return 0;
return PGM_PROTECTION;
}
@@ -682,8 +720,7 @@ static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key,
enum gacc_mode mode, union asce asce, gpa_t gpa,
unsigned long ga, unsigned int len)
{
- u8 storage_key, access_control;
- unsigned long hva;
+ union skey storage_key;
int r;
/* access key 0 matches any storage key -> allow */
@@ -693,26 +730,23 @@ static int vcpu_check_access_key_gpa(struct kvm_vcpu *vcpu, u8 access_key,
* caller needs to ensure that gfn is accessible, so we can
* assume that this cannot fail
*/
- hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa));
- mmap_read_lock(current->mm);
- r = get_guest_storage_key(current->mm, hva, &storage_key);
- mmap_read_unlock(current->mm);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key);
if (r)
return r;
- access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key);
/* access key matches storage key -> allow */
- if (access_control == access_key)
+ if (storage_key.acc == access_key)
return 0;
if (mode == GACC_FETCH || mode == GACC_IFETCH) {
/* it is a fetch and fetch protection is off -> allow */
- if (!(storage_key & _PAGE_FP_BIT))
+ if (!storage_key.fp)
return 0;
if (fetch_prot_override_applicable(vcpu, mode, asce) &&
fetch_prot_override_applies(ga, len))
return 0;
}
if (storage_prot_override_applicable(vcpu) &&
- storage_prot_override_applies(access_control))
+ storage_prot_override_applies(storage_key.acc))
return 0;
return PGM_PROTECTION;
}
@@ -812,37 +846,79 @@ static int access_guest_page_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa
return rc;
}
+static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key)
+{
+ union oac spec = {
+ .oac1.key = dst_key,
+ .oac1.k = !!dst_key,
+ .oac2.key = src_key,
+ .oac2.k = !!src_key,
+ };
+ int exception = PGM_PROTECTION;
+
+ asm_inline volatile(
+ " lr %%r0,%[spec]\n"
+ "0: mvcos %[to],%[from],%[size]\n"
+ "1: lhi %[exc],0\n"
+ "2:\n"
+ EX_TABLE(0b, 2b)
+ EX_TABLE(1b, 2b)
+ : [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception)
+ : [spec] "d" (spec.val), [from] "Q" (*(const char *)from)
+ : "memory", "cc", "0");
+ return exception;
+}
+
+struct acc_page_key_context {
+ void *data;
+ int exception;
+ unsigned short offset;
+ unsigned short len;
+ bool store;
+ u8 access_key;
+};
+
+static void _access_guest_page_with_key_gpa(struct guest_fault *f)
+{
+ struct acc_page_key_context *context = f->priv;
+ void *ptr;
+ int r;
+
+ ptr = __va(PFN_PHYS(f->pfn) | context->offset);
+
+ if (context->store)
+ r = mvcos_key(ptr, context->data, context->len, context->access_key, 0);
+ else
+ r = mvcos_key(context->data, ptr, context->len, 0, context->access_key);
+
+ context->exception = r;
+}
+
static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
- void *data, unsigned int len, u8 access_key)
+ void *data, unsigned int len, u8 acc)
{
- struct kvm_memory_slot *slot;
- bool writable;
- gfn_t gfn;
- hva_t hva;
+ struct acc_page_key_context context = {
+ .offset = offset_in_page(gpa),
+ .len = len,
+ .data = data,
+ .access_key = acc,
+ .store = mode == GACC_STORE,
+ };
+ struct guest_fault fault = {
+ .gfn = gpa_to_gfn(gpa),
+ .priv = &context,
+ .write_attempt = mode == GACC_STORE,
+ .callback = _access_guest_page_with_key_gpa,
+ };
int rc;
- gfn = gpa_to_gfn(gpa);
- slot = gfn_to_memslot(kvm, gfn);
- hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm))
+ return -EINVAL;
- if (kvm_is_error_hva(hva))
- return PGM_ADDRESSING;
- /*
- * Check if it's a ro memslot, even tho that can't occur (they're unsupported).
- * Don't try to actually handle that case.
- */
- if (!writable && mode == GACC_STORE)
- return -EOPNOTSUPP;
- hva += offset_in_page(gpa);
- if (mode == GACC_STORE)
- rc = copy_to_user_key((void __user *)hva, data, len, access_key);
- else
- rc = copy_from_user_key(data, (void __user *)hva, len, access_key);
+ rc = kvm_s390_faultin_gfn(NULL, kvm, &fault);
if (rc)
- return PGM_PROTECTION;
- if (mode == GACC_STORE)
- mark_page_dirty_in_slot(kvm, slot, gfn);
- return 0;
+ return rc;
+ return context.exception;
}
int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data,
@@ -965,18 +1041,101 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
return rc;
}
+/**
+ * __cmpxchg_with_key() - cmpxchg memory, honoring storage keys
+ * @ptr: Address of value to compare to *@old and exchange with
+ * @new. Must be aligned to @size.
+ * @old: Old value. Compared to the content pointed to by @ptr in order to
+ * determine if the exchange occurs. The old value read from *@ptr is
+ * written back to *@old.
+ * @new: New value to place at *@ptr.
+ * @size: Operand length of the cmpxchg, required: 1, 2, 4, 8 or 16.
+ * @access_key: Access key to use for checking storage key protection.
+ *
+ * Perform a cmpxchg on guest memory, honoring storage key protection.
+ * @access_key alone determines how key checking is performed, neither
+ * storage-protection-override nor fetch-protection-override apply.
+ * In case of an exception *@old is set to zero.
+ *
+ * Return:
+ * * 0: cmpxchg executed successfully
+ * * 1: cmpxchg executed unsuccessfully
+ * * PGM_PROTECTION: an exception happened when trying to access *@ptr
+ * * -EAGAIN: maxed out number of retries (byte and short only)
+ */
+static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old,
+ union kvm_s390_quad new, int size, u8 access_key)
+{
+ union kvm_s390_quad tmp = { .sixteen = 0 };
+ int rc;
+
+ /*
+ * The cmpxchg_key macro depends on the type of "old", so we need
+ * a case for each valid length and get some code duplication as long
+ * as we don't introduce a new macro.
+ */
+ switch (size) {
+ case 1:
+ rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key);
+ break;
+ case 2:
+ rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key);
+ break;
+ case 4:
+ rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key);
+ break;
+ case 8:
+ rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key);
+ break;
+ case 16:
+ rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen,
+ access_key);
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (!rc && memcmp(&tmp, old, size))
+ rc = 1;
+ *old = tmp;
+ /*
+ * Assume that the fault is caused by protection, either key protection
+ * or user page write protection.
+ */
+ if (rc == -EFAULT)
+ rc = PGM_PROTECTION;
+ return rc;
+}
+
+struct cmpxchg_key_context {
+ union kvm_s390_quad new;
+ union kvm_s390_quad *old;
+ int exception;
+ unsigned short offset;
+ u8 access_key;
+ u8 len;
+};
+
+static void _cmpxchg_guest_abs_with_key(struct guest_fault *f)
+{
+ struct cmpxchg_key_context *context = f->priv;
+
+ context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset),
+ context->old, context->new, context->len,
+ context->access_key);
+}
+
/**
* cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
* @kvm: Virtual machine instance.
* @gpa: Absolute guest address of the location to be changed.
* @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
* non power of two will result in failure.
- * @old_addr: Pointer to old value. If the location at @gpa contains this value,
- * the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
- * *@old_addr contains the value at @gpa before the attempt to
- * exchange the value.
+ * @old: Pointer to old value. If the location at @gpa contains this value,
+ * the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
+ * *@old contains the value at @gpa before the attempt to
+ * exchange the value.
* @new: The value to place at @gpa.
- * @access_key: The access key to use for the guest access.
+ * @acc: The access key to use for the guest access.
* @success: output value indicating if an exchange occurred.
*
* Atomically exchange the value at @gpa by @new, if it contains *@old.
@@ -989,89 +1148,36 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* * -EAGAIN: transient failure (len 1 or 2)
* * -EOPNOTSUPP: read-only memslot (should never occur)
*/
-int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr,
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old,
union kvm_s390_quad new, u8 acc, bool *success)
{
- gfn_t gfn = gpa_to_gfn(gpa);
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
- bool writable;
- hva_t hva;
- int ret;
-
- if (!IS_ALIGNED(gpa, len))
- return -EINVAL;
-
- hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
- if (kvm_is_error_hva(hva))
- return PGM_ADDRESSING;
- /*
- * Check if it's a read-only memslot, even though that cannot occur
- * since those are unsupported.
- * Don't try to actually handle that case.
- */
- if (!writable)
- return -EOPNOTSUPP;
-
- hva += offset_in_page(gpa);
- /*
- * The cmpxchg_user_key macro depends on the type of "old", so we need
- * a case for each valid length and get some code duplication as long
- * as we don't introduce a new macro.
- */
- switch (len) {
- case 1: {
- u8 old;
-
- ret = cmpxchg_user_key((u8 __user *)hva, &old, old_addr->one, new.one, acc);
- *success = !ret && old == old_addr->one;
- old_addr->one = old;
- break;
- }
- case 2: {
- u16 old;
-
- ret = cmpxchg_user_key((u16 __user *)hva, &old, old_addr->two, new.two, acc);
- *success = !ret && old == old_addr->two;
- old_addr->two = old;
- break;
- }
- case 4: {
- u32 old;
+ struct cmpxchg_key_context context = {
+ .old = old,
+ .new = new,
+ .offset = offset_in_page(gpa),
+ .len = len,
+ .access_key = acc,
+ };
+ struct guest_fault fault = {
+ .gfn = gpa_to_gfn(gpa),
+ .priv = &context,
+ .write_attempt = true,
+ .callback = _cmpxchg_guest_abs_with_key,
+ };
+ int rc;
- ret = cmpxchg_user_key((u32 __user *)hva, &old, old_addr->four, new.four, acc);
- *success = !ret && old == old_addr->four;
- old_addr->four = old;
- break;
- }
- case 8: {
- u64 old;
+ lockdep_assert_held(&kvm->srcu);
- ret = cmpxchg_user_key((u64 __user *)hva, &old, old_addr->eight, new.eight, acc);
- *success = !ret && old == old_addr->eight;
- old_addr->eight = old;
- break;
- }
- case 16: {
- __uint128_t old;
-
- ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, old_addr->sixteen,
- new.sixteen, acc);
- *success = !ret && old == old_addr->sixteen;
- old_addr->sixteen = old;
- break;
- }
- default:
+ if (len > 16 || !IS_ALIGNED(gpa, len))
return -EINVAL;
- }
- if (*success)
- mark_page_dirty_in_slot(kvm, slot, gfn);
- /*
- * Assume that the fault is caused by protection, either key protection
- * or user page write protection.
- */
- if (ret == -EFAULT)
- ret = PGM_PROTECTION;
- return ret;
+
+ rc = kvm_s390_faultin_gfn(NULL, kvm, &fault);
+ if (rc)
+ return rc;
+ *success = !context.exception;
+ if (context.exception == 1)
+ return 0;
+ return context.exception;
}
/**
@@ -1173,304 +1279,362 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
}
/**
- * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * walk_guest_tables() - walk the guest page table and pin the dat tables
* @sg: pointer to the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
- * @pgt: pointer to the beginning of the page table for the given address if
- * successful (return value 0), or to the first invalid DAT entry in
- * case of exceptions (return value > 0)
- * @dat_protection: referenced memory is write protected
- * @fake: pgt references contiguous guest memory block, not a pgtable
+ * @w: will be filled with information on the pinned pages
+ * @wr: indicates a write access if true
+ *
+ * Return:
+ * * 0 in case of success,
+ * * a PIC code > 0 in case the address translation fails
+ * * an error code < 0 if other errors happen in the host
*/
-static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
- unsigned long *pgt, int *dat_protection,
- int *fake)
+static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr)
{
- struct kvm *kvm;
- struct gmap *parent;
- union asce asce;
+ struct gmap *parent = sg->parent;
+ struct guest_fault *entries;
+ union dat_table_entry table;
union vaddress vaddr;
unsigned long ptr;
+ struct kvm *kvm;
+ union asce asce;
int rc;
- *fake = 0;
- *dat_protection = 0;
- kvm = sg->private;
- parent = sg->parent;
+ kvm = parent->kvm;
+ asce = sg->guest_asce;
+ entries = get_entries(w);
+
+ w->level = LEVEL_MEM;
+ w->last_addr = saddr;
+ if (asce.r)
+ return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false);
+
vaddr.addr = saddr;
- asce.val = sg->orig_asce;
ptr = asce.rsto * PAGE_SIZE;
- if (asce.r) {
- *fake = 1;
- ptr = 0;
- asce.dt = ASCE_TYPE_REGION1;
- }
+
+ if (!asce_contains_gfn(asce, gpa_to_gfn(saddr)))
+ return PGM_ASCE_TYPE;
switch (asce.dt) {
case ASCE_TYPE_REGION1:
- if (vaddr.rfx01 > asce.tl && !*fake)
+ if (vaddr.rfx01 > asce.tl)
return PGM_REGION_FIRST_TRANS;
break;
case ASCE_TYPE_REGION2:
- if (vaddr.rfx)
- return PGM_ASCE_TYPE;
if (vaddr.rsx01 > asce.tl)
return PGM_REGION_SECOND_TRANS;
break;
case ASCE_TYPE_REGION3:
- if (vaddr.rfx || vaddr.rsx)
- return PGM_ASCE_TYPE;
if (vaddr.rtx01 > asce.tl)
return PGM_REGION_THIRD_TRANS;
break;
case ASCE_TYPE_SEGMENT:
- if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
- return PGM_ASCE_TYPE;
if (vaddr.sx01 > asce.tl)
return PGM_SEGMENT_TRANSLATION;
break;
}
+ w->level = asce.dt;
switch (asce.dt) {
- case ASCE_TYPE_REGION1: {
- union region1_table_entry rfte;
-
- if (*fake) {
- ptr += vaddr.rfx * _REGION1_SIZE;
- rfte.val = ptr;
- goto shadow_r2t;
- }
- *pgt = ptr + vaddr.rfx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+ case ASCE_TYPE_REGION1:
+ w->last_addr = ptr + vaddr.rfx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (rfte.i)
+ if (table.pgd.i)
return PGM_REGION_FIRST_TRANS;
- if (rfte.tt != TABLE_TYPE_REGION1)
+ if (table.pgd.tt != TABLE_TYPE_REGION1)
return PGM_TRANSLATION_SPEC;
- if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+ if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl)
return PGM_REGION_SECOND_TRANS;
if (sg->edat_level >= 1)
- *dat_protection |= rfte.p;
- ptr = rfte.rto * PAGE_SIZE;
-shadow_r2t:
- rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
- if (rc)
- return rc;
- kvm->stat.gmap_shadow_r1_entry++;
- }
+ w->p |= table.pgd.p;
+ ptr = table.pgd.rto * PAGE_SIZE;
+ w->level--;
fallthrough;
- case ASCE_TYPE_REGION2: {
- union region2_table_entry rste;
-
- if (*fake) {
- ptr += vaddr.rsx * _REGION2_SIZE;
- rste.val = ptr;
- goto shadow_r3t;
- }
- *pgt = ptr + vaddr.rsx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+ case ASCE_TYPE_REGION2:
+ w->last_addr = ptr + vaddr.rsx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (rste.i)
+ if (table.p4d.i)
return PGM_REGION_SECOND_TRANS;
- if (rste.tt != TABLE_TYPE_REGION2)
+ if (table.p4d.tt != TABLE_TYPE_REGION2)
return PGM_TRANSLATION_SPEC;
- if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+ if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl)
return PGM_REGION_THIRD_TRANS;
if (sg->edat_level >= 1)
- *dat_protection |= rste.p;
- ptr = rste.rto * PAGE_SIZE;
-shadow_r3t:
- rste.p |= *dat_protection;
- rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
- if (rc)
- return rc;
- kvm->stat.gmap_shadow_r2_entry++;
- }
+ w->p |= table.p4d.p;
+ ptr = table.p4d.rto * PAGE_SIZE;
+ w->level--;
fallthrough;
- case ASCE_TYPE_REGION3: {
- union region3_table_entry rtte;
-
- if (*fake) {
- ptr += vaddr.rtx * _REGION3_SIZE;
- rtte.val = ptr;
- goto shadow_sgt;
- }
- *pgt = ptr + vaddr.rtx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+ case ASCE_TYPE_REGION3:
+ w->last_addr = ptr + vaddr.rtx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (rtte.i)
+ if (table.pud.i)
return PGM_REGION_THIRD_TRANS;
- if (rtte.tt != TABLE_TYPE_REGION3)
+ if (table.pud.tt != TABLE_TYPE_REGION3)
return PGM_TRANSLATION_SPEC;
- if (rtte.cr && asce.p && sg->edat_level >= 2)
+ if (table.pud.cr && asce.p && sg->edat_level >= 2)
return PGM_TRANSLATION_SPEC;
- if (rtte.fc && sg->edat_level >= 2) {
- *dat_protection |= rtte.fc0.p;
- *fake = 1;
- ptr = rtte.fc1.rfaa * _REGION3_SIZE;
- rtte.val = ptr;
- goto shadow_sgt;
+ if (sg->edat_level >= 1)
+ w->p |= table.pud.p;
+ if (table.pud.fc && sg->edat_level >= 2) {
+ table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK);
+ goto edat_applies;
}
- if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+ if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl)
return PGM_SEGMENT_TRANSLATION;
- if (sg->edat_level >= 1)
- *dat_protection |= rtte.fc0.p;
- ptr = rtte.fc0.sto * PAGE_SIZE;
-shadow_sgt:
- rtte.fc0.p |= *dat_protection;
- rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
- if (rc)
- return rc;
- kvm->stat.gmap_shadow_r3_entry++;
- }
+ ptr = table.pud.fc0.sto * PAGE_SIZE;
+ w->level--;
fallthrough;
- case ASCE_TYPE_SEGMENT: {
- union segment_table_entry ste;
-
- if (*fake) {
- ptr += vaddr.sx * _SEGMENT_SIZE;
- ste.val = ptr;
- goto shadow_pgt;
- }
- *pgt = ptr + vaddr.sx * 8;
- rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+ case ASCE_TYPE_SEGMENT:
+ w->last_addr = ptr + vaddr.sx * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
if (rc)
return rc;
- if (ste.i)
+ if (table.pmd.i)
return PGM_SEGMENT_TRANSLATION;
- if (ste.tt != TABLE_TYPE_SEGMENT)
+ if (table.pmd.tt != TABLE_TYPE_SEGMENT)
return PGM_TRANSLATION_SPEC;
- if (ste.cs && asce.p)
+ if (table.pmd.cs && asce.p)
return PGM_TRANSLATION_SPEC;
- *dat_protection |= ste.fc0.p;
- if (ste.fc && sg->edat_level >= 1) {
- *fake = 1;
- ptr = ste.fc1.sfaa * _SEGMENT_SIZE;
- ste.val = ptr;
- goto shadow_pgt;
+ w->p |= table.pmd.p;
+ if (table.pmd.fc && sg->edat_level >= 1) {
+ table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK);
+ goto edat_applies;
}
- ptr = ste.fc0.pto * (PAGE_SIZE / 2);
-shadow_pgt:
- ste.fc0.p |= *dat_protection;
- rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+ ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2);
+ w->level--;
+ }
+ w->last_addr = ptr + vaddr.px * 8;
+ rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level,
+ w->last_addr, &table.val);
+ if (rc)
+ return rc;
+ if (table.pte.i)
+ return PGM_PAGE_TRANSLATION;
+ if (table.pte.z)
+ return PGM_TRANSLATION_SPEC;
+ w->p |= table.pte.p;
+edat_applies:
+ if (wr && w->p)
+ return PGM_PROTECTION;
+
+ return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr);
+}
+
+static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep,
+ struct guest_fault *f, bool p)
+{
+ union pgste pgste;
+ union pte newpte;
+ int rc;
+
+ scoped_guard(spinlock, &sg->host_to_rmap_lock)
+ rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE);
+ if (rc)
+ return rc;
+
+ pgste = pgste_get_lock(ptep_h);
+ newpte = _pte(f->pfn, f->writable, !p, 0);
+ newpte.s.d |= ptep->s.d;
+ newpte.s.sd |= ptep->s.sd;
+ newpte.h.p &= ptep->h.p;
+ pgste = gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn);
+ pgste.vsie_notif = 1;
+ pgste_set_unlock(ptep_h, pgste);
+
+ newpte = _pte(f->pfn, 0, !p, 0);
+ pgste = pgste_get_lock(ptep);
+ pgste = __dat_ptep_xchg(ptep, pgste, newpte, gpa_to_gfn(raddr), sg->asce, sg->uses_skeys);
+ pgste_set_unlock(ptep, pgste);
+
+ return 0;
+}
+
+static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
+ struct guest_fault *f, bool p)
+{
+ union crste newcrste;
+ gfn_t gfn;
+ int rc;
+
+ lockdep_assert_held_write(&sg->kvm->mmu_lock);
+
+ gfn = f->gfn & gpa_to_gfn(is_pmd(*table) ? _SEGMENT_MASK : _REGION3_MASK);
+ scoped_guard(spinlock, &sg->host_to_rmap_lock)
+ rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt);
+ if (rc)
+ return rc;
+
+ newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
+ newcrste.s.fc1.d |= host->s.fc1.d;
+ newcrste.s.fc1.sd |= host->s.fc1.sd;
+ newcrste.h.p &= host->h.p;
+ newcrste.s.fc1.vsie_notif = 1;
+ newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
+ gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn);
+
+ newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
+ dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
+ return 0;
+}
+
+static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
+ unsigned long saddr, struct pgtwalk *w)
+{
+ struct guest_fault *entries;
+ int flags, i, hl, gl, l, rc;
+ union crste *table, *host;
+ union pte *ptep, *ptep_h;
+
+ lockdep_assert_held(&sg->kvm->mmu_lock);
+ entries = get_entries(w);
+ ptep_h = NULL;
+ ptep = NULL;
+
+ rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE,
+ &table, &ptep);
+ if (rc)
+ return rc;
+
+ /* A race occurred. The shadow mapping is already valid, nothing to do */
+ if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
+ return 0;
+
+ gl = get_level(table, ptep);
+
+ /*
+ * Skip levels that are already protected. For each level, protect
+ * only the page containing the entry, not the whole table.
+ */
+ for (i = gl; i > w->level; i--) {
+ rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
+ entries[i - 1].pfn, i, entries[i - 1].writable);
if (rc)
return rc;
- kvm->stat.gmap_shadow_sg_entry++;
}
+
+ rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
+ TABLE_TYPE_PAGE_TABLE, &host, &ptep_h);
+ if (rc)
+ return rc;
+
+ hl = get_level(host, ptep_h);
+ /* Get the smallest granularity */
+ l = min3(gl, hl, w->level);
+
+ flags = DAT_WALK_SPLIT_ALLOC | (sg->parent->uses_skeys ? DAT_WALK_USES_SKEYS : 0);
+ /* If necessary, create the shadow mapping */
+ if (l < gl) {
+ rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep);
+ if (rc)
+ return rc;
}
- /* Return the parent address of the page table */
- *pgt = ptr;
- return 0;
+ if (l < hl) {
+ rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce,
+ flags, l, &host, &ptep_h);
+ if (rc)
+ return rc;
+ }
+
+ if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm))
+ return -EFAULT;
+ if (l == TABLE_TYPE_PAGE_TABLE)
+ return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p);
+ return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p);
}
-/**
- * shadow_pgt_lookup() - find a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: the address in the shadow aguest address space
- * @pgt: parent gmap address of the page table to get shadowed
- * @dat_protection: if the pgtable is marked as protected by dat
- * @fake: pgt references contiguous guest memory block, not a pgtable
- *
- * Returns 0 if the shadow page table was found and -EAGAIN if the page
- * table was not found.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt,
- int *dat_protection, int *fake)
+static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ unsigned long seq, struct pgtwalk *walk)
{
- unsigned long pt_index;
- unsigned long *table;
- struct page *page;
int rc;
- spin_lock(&sg->guest_table_lock);
- table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
- if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
- /* Shadow page tables are full pages (pte+pgste) */
- page = pfn_to_page(*table >> PAGE_SHIFT);
- pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page));
- *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE;
- *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
- *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE);
- rc = 0;
- } else {
- rc = -EAGAIN;
+ if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries))
+ return -EAGAIN;
+again:
+ rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc);
+ if (rc)
+ return rc;
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+ if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries))
+ return -EAGAIN;
+ scoped_guard(spinlock, &sg->parent->children_lock) {
+ if (sg->removed)
+ return -EAGAIN;
+ rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk);
+ }
+ if (rc == -ENOMEM)
+ goto again;
+ if (!rc)
+ kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false);
}
- spin_unlock(&sg->guest_table_lock);
return rc;
}
/**
- * kvm_s390_shadow_fault - handle fault on a shadow page table
- * @vcpu: virtual cpu
- * @sg: pointer to the shadow guest address space structure
+ * __kvm_s390_shadow_fault() - handle fault on a shadow page table
+ * @vcpu: virtual cpu that triggered the action
+ * @sg: the shadow guest address space structure
* @saddr: faulting address in the shadow gmap
* @datptr: will contain the address of the faulting DAT table entry, or of
* the valid leaf, plus some flags
+ * @wr: whether this is a write access
*
- * Returns: - 0 if the shadow fault was successfully resolved
- * - > 0 (pgm exception code) on exceptions while faulting
- * - -EAGAIN if the caller can retry immediately
- * - -EFAULT when accessing invalid guest addresses
- * - -ENOMEM if out of memory
+ * Return:
+ * * 0 if the shadow fault was successfully resolved
+ * * > 0 (pgm exception code) on exceptions while faulting
+ * * -EAGAIN if the caller can retry immediately
+ * * -EFAULT when accessing invalid guest addresses
+ * * -ENOMEM if out of memory
*/
-int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
- unsigned long saddr, unsigned long *datptr)
+static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ union mvpg_pei *datptr, bool wr)
{
- union vaddress vaddr;
- union page_table_entry pte;
- unsigned long pgt = 0;
- int dat_protection, fake;
+ struct pgtwalk walk = { .p = false, };
+ unsigned long seq;
int rc;
- if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm))
- return -EFAULT;
+ seq = vcpu->kvm->mmu_invalidate_seq;
+ /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+ smp_rmb();
- mmap_read_lock(sg->mm);
- /*
- * We don't want any guest-2 tables to change - so the parent
- * tables/pointers we read stay valid - unshadowing is however
- * always possible - only guest_table_lock protects us.
- */
- ipte_lock(vcpu->kvm);
-
- rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ rc = walk_guest_tables(sg, saddr, &walk, wr);
+ if (datptr) {
+ datptr->val = walk.last_addr;
+ datptr->dat_prot = wr && walk.p;
+ datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE;
+ datptr->real = sg->guest_asce.r;
+ }
+ if (!rc)
+ rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk);
if (rc)
- rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
- &fake);
+ kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true);
+ return rc;
+}
- vaddr.addr = saddr;
- if (fake) {
- pte.val = pgt + vaddr.px * PAGE_SIZE;
- goto shadow_page;
- }
+int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ union mvpg_pei *datptr, bool wr)
+{
+ int rc;
- switch (rc) {
- case PGM_SEGMENT_TRANSLATION:
- case PGM_REGION_THIRD_TRANS:
- case PGM_REGION_SECOND_TRANS:
- case PGM_REGION_FIRST_TRANS:
- pgt |= PEI_NOT_PTE;
- break;
- case 0:
- pgt += vaddr.px * 8;
- rc = gmap_read_table(sg->parent, pgt, &pte.val);
- }
- if (datptr)
- *datptr = pgt | dat_protection * PEI_DAT_PROT;
- if (!rc && pte.i)
- rc = PGM_PAGE_TRANSLATION;
- if (!rc && pte.z)
- rc = PGM_TRANSLATION_SPEC;
-shadow_page:
- pte.p |= dat_protection;
- if (!rc)
- rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
- vcpu->kvm->stat.gmap_shadow_pg_entry++;
+ if (KVM_BUG_ON(!sg->is_shadow, vcpu->kvm))
+ return -EFAULT;
+
+ rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc);
+ if (rc)
+ return rc;
+
+ ipte_lock(vcpu->kvm);
+ rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r);
ipte_unlock(vcpu->kvm);
- mmap_read_unlock(sg->mm);
+
return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 774cdf19998f..b5385cec60f4 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -206,7 +206,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
-int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr,
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old,
union kvm_s390_quad new, u8 access_key, bool *success);
/**
@@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm);
int ipte_lock_held(struct kvm *kvm);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
-/* MVPG PEI indication bits */
-#define PEI_DAT_PROT 2
-#define PEI_NOT_PTE 4
+union mvpg_pei {
+ unsigned long val;
+ struct {
+ unsigned long addr : 61;
+ unsigned long not_pte : 1;
+ unsigned long dat_prot: 1;
+ unsigned long real : 1;
+ };
+};
-int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
- unsigned long saddr, unsigned long *datptr);
+int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
+ union mvpg_pei *datptr, bool wr);
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c
deleted file mode 100644
index 56ef153eb8fe..000000000000
--- a/arch/s390/kvm/gmap-vsie.c
+++ /dev/null
@@ -1,141 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Guest memory management for KVM/s390 nested VMs.
- *
- * Copyright IBM Corp. 2008, 2020, 2024
- *
- * Author(s): Claudio Imbrenda <imbrenda@...ux.ibm.com>
- * Martin Schwidefsky <schwidefsky@...ibm.com>
- * David Hildenbrand <david@...hat.com>
- * Janosch Frank <frankja@...ux.vnet.ibm.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/pgtable.h>
-#include <linux/pagemap.h>
-#include <linux/mman.h>
-
-#include <asm/lowcore.h>
-#include <asm/gmap.h>
-#include <asm/uv.h>
-
-#include "kvm-s390.h"
-
-/**
- * gmap_find_shadow - find a specific asce in the list of shadow tables
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns the pointer to a gmap if a shadow table with the given asce is
- * already available, ERR_PTR(-EAGAIN) if another one is just being created,
- * otherwise NULL
- *
- * Context: Called with parent->shadow_lock held
- */
-static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level)
-{
- struct gmap *sg;
-
- lockdep_assert_held(&parent->shadow_lock);
- list_for_each_entry(sg, &parent->children, list) {
- if (!gmap_shadow_valid(sg, asce, edat_level))
- continue;
- if (!sg->initialized)
- return ERR_PTR(-EAGAIN);
- refcount_inc(&sg->ref_count);
- return sg;
- }
- return NULL;
-}
-
-/**
- * gmap_shadow - create/find a shadow guest address space
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * The pages of the top level page table referred by the asce parameter
- * will be set to read-only and marked in the PGSTEs of the kvm process.
- * The shadow table will be removed automatically on any change to the
- * PTE mapping for the source table.
- *
- * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
- * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
- * parent gmap table could not be protected.
- */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level)
-{
- struct gmap *sg, *new;
- unsigned long limit;
- int rc;
-
- if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) ||
- KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private))
- return ERR_PTR(-EFAULT);
- spin_lock(&parent->shadow_lock);
- sg = gmap_find_shadow(parent, asce, edat_level);
- spin_unlock(&parent->shadow_lock);
- if (sg)
- return sg;
- /* Create a new shadow gmap */
- limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
- if (asce & _ASCE_REAL_SPACE)
- limit = -1UL;
- new = gmap_alloc(limit);
- if (!new)
- return ERR_PTR(-ENOMEM);
- new->mm = parent->mm;
- new->parent = gmap_get(parent);
- new->private = parent->private;
- new->orig_asce = asce;
- new->edat_level = edat_level;
- new->initialized = false;
- spin_lock(&parent->shadow_lock);
- /* Recheck if another CPU created the same shadow */
- sg = gmap_find_shadow(parent, asce, edat_level);
- if (sg) {
- spin_unlock(&parent->shadow_lock);
- gmap_free(new);
- return sg;
- }
- if (asce & _ASCE_REAL_SPACE) {
- /* only allow one real-space gmap shadow */
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce & _ASCE_REAL_SPACE) {
- spin_lock(&sg->guest_table_lock);
- gmap_unshadow(sg);
- spin_unlock(&sg->guest_table_lock);
- list_del(&sg->list);
- gmap_put(sg);
- break;
- }
- }
- }
- refcount_set(&new->ref_count, 2);
- list_add(&new->list, &parent->children);
- if (asce & _ASCE_REAL_SPACE) {
- /* nothing to protect, return right away */
- new->initialized = true;
- spin_unlock(&parent->shadow_lock);
- return new;
- }
- spin_unlock(&parent->shadow_lock);
- /* protect after insertion, so it will get properly invalidated */
- mmap_read_lock(parent->mm);
- rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN,
- ((asce & _ASCE_TABLE_LENGTH) + 1),
- PROT_READ, GMAP_NOTIFY_SHADOW);
- mmap_read_unlock(parent->mm);
- spin_lock(&parent->shadow_lock);
- new->initialized = true;
- if (rc) {
- list_del(&new->list);
- gmap_free(new);
- new = ERR_PTR(rc);
- }
- spin_unlock(&parent->shadow_lock);
- return new;
-}
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index cbb777e940d1..502012c0dfad 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -730,13 +730,13 @@ static int _gmap_enable_skeys(struct gmap *gmap)
gfn_t start = 0;
int rc;
- if (mm_uses_skeys(gmap->kvm->mm))
+ if (gmap->uses_skeys)
return 0;
- gmap->kvm->mm->context.uses_skeys = 1;
+ WRITE_ONCE(gmap->uses_skeys, 1);
rc = gmap_helper_disable_cow_sharing();
if (rc) {
- gmap->kvm->mm->context.uses_skeys = 0;
+ WRITE_ONCE(gmap->uses_skeys, 0);
return rc;
}
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index c7908950c1f4..ecc41587efeb 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -21,6 +21,7 @@
#include "gaccess.h"
#include "trace.h"
#include "trace-s390.h"
+#include "faultin.h"
u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
{
@@ -367,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg2, &srcaddr, GACC_FETCH, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0);
- if (rc != 0)
+
+ do {
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false);
+ } while (rc == -EAGAIN);
+ if (rc)
return rc;
/* Ensure that the source is paged-in, no actual access -> no key checking */
@@ -376,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg1, &dstaddr, GACC_STORE, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE);
- if (rc != 0)
+
+ do {
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true);
+ } while (rc == -EAGAIN);
+ if (rc)
return rc;
kvm_s390_retry_instr(vcpu);
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index c62a868cf2b6..aae0bc8bf038 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -27,7 +27,6 @@
#include <linux/uaccess.h>
#include <asm/sclp.h>
#include <asm/isc.h>
-#include <asm/gmap.h>
#include <asm/nmi.h>
#include <asm/airq.h>
#include <asm/tpi.h>
@@ -35,6 +34,7 @@
#include "gaccess.h"
#include "trace-s390.h"
#include "pci.h"
+#include "gmap.h"
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ab69c9fd7926..c8662177c63c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -40,7 +40,6 @@
#include <asm/lowcore.h>
#include <asm/machine.h>
#include <asm/stp.h>
-#include <asm/gmap.h>
#include <asm/gmap_helpers.h>
#include <asm/nmi.h>
#include <asm/isc.h>
@@ -53,6 +52,8 @@
#include <asm/uv.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "gmap.h"
+#include "faultin.h"
#include "pci.h"
#define CREATE_TRACE_POINTS
@@ -263,15 +264,11 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS)
/* available subfunctions indicated via query / "test bit" */
static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
-static struct gmap_notifier gmap_notifier;
-static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
debug_info_t *kvm_s390_dbf_uv;
/* Section: not file related */
/* forward declarations */
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end);
static int sca_switch_to_extended(struct kvm *kvm);
static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
@@ -529,10 +526,6 @@ static int __init __kvm_s390_init(void)
if (rc)
goto err_gib;
- gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_pte_notifier(&gmap_notifier);
- vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
- gmap_register_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_register(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
@@ -552,8 +545,6 @@ static int __init __kvm_s390_init(void)
static void __kvm_s390_exit(void)
{
- gmap_unregister_pte_notifier(&gmap_notifier);
- gmap_unregister_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
@@ -569,7 +560,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
if (ioctl == KVM_S390_ENABLE_SIE)
- return s390_enable_sie();
+ return 0;
return -EINVAL;
}
@@ -695,32 +686,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
- int i;
- gfn_t cur_gfn, last_gfn;
- unsigned long gaddr, vmaddr;
- struct gmap *gmap = kvm->arch.gmap;
- DECLARE_BITMAP(bitmap, _PAGE_ENTRIES);
-
- /* Loop over all guest segments */
- cur_gfn = memslot->base_gfn;
- last_gfn = memslot->base_gfn + memslot->npages;
- for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) {
- gaddr = gfn_to_gpa(cur_gfn);
- vmaddr = gfn_to_hva_memslot(memslot, cur_gfn);
- if (kvm_is_error_hva(vmaddr))
- continue;
-
- bitmap_zero(bitmap, _PAGE_ENTRIES);
- gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr);
- for (i = 0; i < _PAGE_ENTRIES; i++) {
- if (test_bit(i, bitmap))
- mark_page_dirty(kvm, cur_gfn + i);
- }
+ gfn_t last_gfn = memslot->base_gfn + memslot->npages;
- if (fatal_signal_pending(current))
- return;
- cond_resched();
- }
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn);
}
/* Section: vm related */
@@ -880,9 +849,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
r = -EINVAL;
else {
r = 0;
- mmap_write_lock(kvm->mm);
- kvm->mm->context.allow_gmap_hpage_1m = 1;
- mmap_write_unlock(kvm->mm);
/*
* We might have to create fake 4k page
* tables. To avoid that the hardware works on
@@ -949,7 +915,7 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att
static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret;
- unsigned int idx;
+
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
ret = -ENXIO;
@@ -960,8 +926,6 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
mutex_lock(&kvm->lock);
if (kvm->created_vcpus)
ret = -EBUSY;
- else if (kvm->mm->context.allow_gmap_hpage_1m)
- ret = -EINVAL;
else {
kvm->arch.use_cmma = 1;
/* Not compatible with cmma. */
@@ -970,7 +934,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
}
mutex_unlock(&kvm->lock);
break;
- case KVM_S390_VM_MEM_CLR_CMMA:
+ case KVM_S390_VM_MEM_CLR_CMMA: {
+ gfn_t start_gfn = 0;
+
ret = -ENXIO;
if (!sclp.has_cmma)
break;
@@ -979,13 +945,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
break;
VM_EVENT(kvm, 3, "%s", "RESET: CMMA states");
- mutex_lock(&kvm->lock);
- idx = srcu_read_lock(&kvm->srcu);
- s390_reset_cmma(kvm->arch.gmap->mm);
- srcu_read_unlock(&kvm->srcu, idx);
- mutex_unlock(&kvm->lock);
+ do {
+ start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn);
+ cond_resched();
+ } while (start_gfn);
ret = 0;
break;
+ }
case KVM_S390_VM_MEM_LIMIT_SIZE: {
unsigned long new_limit;
@@ -1002,29 +968,12 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!new_limit)
return -EINVAL;
- /* gmap_create takes last usable address */
- if (new_limit != KVM_S390_NO_MEM_LIMIT)
- new_limit -= 1;
-
ret = -EBUSY;
- mutex_lock(&kvm->lock);
- if (!kvm->created_vcpus) {
- /* gmap_create will round the limit up */
- struct gmap *new = gmap_create(current->mm, new_limit);
-
- if (!new) {
- ret = -ENOMEM;
- } else {
- gmap_remove(kvm->arch.gmap);
- new->private = kvm;
- kvm->arch.gmap = new;
- ret = 0;
- }
- }
- mutex_unlock(&kvm->lock);
+ if (!kvm->created_vcpus)
+ ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit));
VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit);
VM_EVENT(kvm, 3, "New guest asce: 0x%p",
- (void *) kvm->arch.gmap->asce);
+ (void *)kvm->arch.gmap->asce.val);
break;
}
default:
@@ -1189,19 +1138,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
kvm->arch.migration_mode = 1;
return 0;
}
- /* mark all the pages in active slots as dirty */
kvm_for_each_memslot(ms, bkt, slots) {
if (!ms->dirty_bitmap)
return -EINVAL;
- /*
- * The second half of the bitmap is only used on x86,
- * and would be wasted otherwise, so we put it to good
- * use here to keep track of the state of the storage
- * attributes.
- */
- memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms));
ram_pages += ms->npages;
}
+ /* mark all the pages as dirty */
+ gmap_set_cmma_all_dirty(kvm->arch.gmap);
atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
kvm->arch.migration_mode = 1;
kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
@@ -2113,40 +2056,32 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
- uint8_t *keys;
- uint64_t hva;
- int srcu_idx, i, r = 0;
+ union skey *keys;
+ int i, r = 0;
if (args->flags != 0)
return -EINVAL;
/* Is this guest using storage keys? */
- if (!mm_uses_skeys(current->mm))
+ if (!kvm->arch.gmap->uses_skeys)
return KVM_S390_GET_SKEYS_NONE;
/* Enforce sane limit on memory allocation */
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
+ keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
- mmap_read_lock(current->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < args->count; i++) {
- hva = gfn_to_hva(kvm, args->start_gfn + i);
- if (kvm_is_error_hva(hva)) {
- r = -EFAULT;
- break;
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ for (i = 0; i < args->count; i++) {
+ r = dat_get_storage_key(kvm->arch.gmap->asce,
+ args->start_gfn + i, keys + i);
+ if (r)
+ break;
}
-
- r = get_guest_storage_key(current->mm, hva, &keys[i]);
- if (r)
- break;
}
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(current->mm);
if (!r) {
r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
@@ -2161,10 +2096,9 @@ static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
- uint8_t *keys;
- uint64_t hva;
- int srcu_idx, i, r = 0;
- bool unlocked;
+ struct kvm_s390_mmu_cache *mc;
+ union skey *keys;
+ int i, r = 0;
if (args->flags != 0)
return -EINVAL;
@@ -2173,7 +2107,7 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX)
return -EINVAL;
- keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT);
+ keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT);
if (!keys)
return -ENOMEM;
@@ -2185,159 +2119,41 @@ static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
}
/* Enable storage key handling for the guest */
- r = s390_enable_skey();
+ r = gmap_enable_skeys(kvm->arch.gmap);
if (r)
goto out;
- i = 0;
- mmap_read_lock(current->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- while (i < args->count) {
- unlocked = false;
- hva = gfn_to_hva(kvm, args->start_gfn + i);
- if (kvm_is_error_hva(hva)) {
- r = -EFAULT;
- break;
- }
-
+ r = -EINVAL;
+ for (i = 0; i < args->count; i++) {
/* Lowest order bit is reserved */
- if (keys[i] & 0x01) {
- r = -EINVAL;
- break;
- }
-
- r = set_guest_storage_key(current->mm, hva, keys[i], 0);
- if (r) {
- r = fixup_user_fault(current->mm, hva,
- FAULT_FLAG_WRITE, &unlocked);
- if (r)
- break;
- }
- if (!r)
- i++;
- }
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(current->mm);
-out:
- kvfree(keys);
- return r;
-}
-
-/*
- * Base address and length must be sent at the start of each block, therefore
- * it's cheaper to send some clean data, as long as it's less than the size of
- * two longs.
- */
-#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
-/* for consistency */
-#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
-
-static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
- u8 *res, unsigned long bufsize)
-{
- unsigned long pgstev, hva, cur_gfn = args->start_gfn;
-
- args->count = 0;
- while (args->count < bufsize) {
- hva = gfn_to_hva(kvm, cur_gfn);
- /*
- * We return an error if the first value was invalid, but we
- * return successfully if at least one value was copied.
- */
- if (kvm_is_error_hva(hva))
- return args->count ? 0 : -EFAULT;
- if (get_pgste(kvm->mm, hva, &pgstev) < 0)
- pgstev = 0;
- res[args->count++] = (pgstev >> 24) & 0x43;
- cur_gfn++;
+ if (keys[i].zero)
+ goto out;
}
- return 0;
-}
-
-static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots,
- gfn_t gfn)
-{
- return ____gfn_to_memslot(slots, gfn, true);
-}
-
-static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
- unsigned long cur_gfn)
-{
- struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn);
- unsigned long ofs = cur_gfn - ms->base_gfn;
- struct rb_node *mnode = &ms->gfn_node[slots->node_idx];
-
- if (ms->base_gfn + ms->npages <= cur_gfn) {
- mnode = rb_next(mnode);
- /* If we are above the highest slot, wrap around */
- if (!mnode)
- mnode = rb_first(&slots->gfn_tree);
-
- ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
- ofs = 0;
- }
-
- if (cur_gfn < ms->base_gfn)
- ofs = 0;
-
- ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
- while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
- ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
- ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages);
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc) {
+ r = -ENOMEM;
+ goto out;
}
- return ms->base_gfn + ofs;
-}
-static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
- u8 *res, unsigned long bufsize)
-{
- unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
- struct kvm_memslots *slots = kvm_memslots(kvm);
- struct kvm_memory_slot *ms;
-
- if (unlikely(kvm_memslots_empty(slots)))
- return 0;
-
- cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
- ms = gfn_to_memslot(kvm, cur_gfn);
- args->count = 0;
- args->start_gfn = cur_gfn;
- if (!ms)
- return 0;
- next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
- mem_end = kvm_s390_get_gfn_end(slots);
-
- while (args->count < bufsize) {
- hva = gfn_to_hva(kvm, cur_gfn);
- if (kvm_is_error_hva(hva))
- return 0;
- /* Decrement only if we actually flipped the bit to 0 */
- if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
- atomic64_dec(&kvm->arch.cmma_dirty_pages);
- if (get_pgste(kvm->mm, hva, &pgstev) < 0)
- pgstev = 0;
- /* Save the value */
- res[args->count++] = (pgstev >> 24) & 0x43;
- /* If the next bit is too far away, stop. */
- if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
- return 0;
- /* If we reached the previous "next", find the next one */
- if (cur_gfn == next_gfn)
- next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
- /* Reached the end of memory or of the buffer, stop */
- if ((next_gfn >= mem_end) ||
- (next_gfn - args->start_gfn >= bufsize))
- return 0;
- cur_gfn++;
- /* Reached the end of the current memslot, take the next one. */
- if (cur_gfn - ms->base_gfn >= ms->npages) {
- ms = gfn_to_memslot(kvm, cur_gfn);
- if (!ms)
- return 0;
+ r = 0;
+ do {
+ r = kvm_s390_mmu_cache_topup(mc);
+ if (r == -ENOMEM)
+ break;
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ for (i = 0; i < args->count; i++) {
+ r = dat_set_storage_key(mc, kvm->arch.gmap->asce,
+ args->start_gfn + i, keys[i], 0);
+ if (r)
+ break;
+ }
}
- }
- return 0;
+ } while (r == -ENOMEM);
+ kvm_s390_free_mmu_cache(mc);
+out:
+ kvfree(keys);
+ return r;
}
/*
@@ -2351,8 +2167,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
static int kvm_s390_get_cmma_bits(struct kvm *kvm,
struct kvm_s390_cmma_log *args)
{
- unsigned long bufsize;
- int srcu_idx, peek, ret;
+ int peek, ret;
u8 *values;
if (!kvm->arch.use_cmma)
@@ -2365,8 +2180,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
if (!peek && !kvm->arch.migration_mode)
return -EINVAL;
/* CMMA is disabled or was not used, or the buffer has length zero */
- bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
- if (!bufsize || !kvm->mm->context.uses_cmm) {
+ args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX);
+ if (!args->count || !kvm->arch.gmap->uses_cmm) {
memset(args, 0, sizeof(*args));
return 0;
}
@@ -2376,18 +2191,18 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
return 0;
}
- values = vmalloc(bufsize);
+ values = vmalloc(args->count);
if (!values)
return -ENOMEM;
- mmap_read_lock(kvm->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- if (peek)
- ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
- else
- ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(kvm->mm);
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ if (peek)
+ ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count,
+ values);
+ else
+ ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count,
+ values, &kvm->arch.cmma_dirty_pages);
+ }
if (kvm->arch.migration_mode)
args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
@@ -2409,11 +2224,9 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
static int kvm_s390_set_cmma_bits(struct kvm *kvm,
const struct kvm_s390_cmma_log *args)
{
- unsigned long hva, mask, pgstev, i;
- uint8_t *bits;
- int srcu_idx, r = 0;
-
- mask = args->mask;
+ struct kvm_s390_mmu_cache *mc;
+ u8 *bits = NULL;
+ int r = 0;
if (!kvm->arch.use_cmma)
return -ENXIO;
@@ -2427,9 +2240,12 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
if (args->count == 0)
return 0;
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc)
+ return -ENOMEM;
bits = vmalloc(array_size(sizeof(*bits), args->count));
- if (!bits)
- return -ENOMEM;
+ if (!bits) {
+ r = -ENOMEM;
+ goto out;
+ }
r = copy_from_user(bits, (void __user *)args->values, args->count);
if (r) {
@@ -2437,29 +2253,19 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm,
goto out;
}
- mmap_read_lock(kvm->mm);
- srcu_idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < args->count; i++) {
- hva = gfn_to_hva(kvm, args->start_gfn + i);
- if (kvm_is_error_hva(hva)) {
- r = -EFAULT;
+ do {
+ r = kvm_s390_mmu_cache_topup(mc);
+ if (r)
break;
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn,
+ args->count, args->mask, bits);
}
+ } while (r == -ENOMEM);
- pgstev = bits[i];
- pgstev = pgstev << 24;
- mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT;
- set_pgste_bits(kvm->mm, hva, mask, pgstev);
- }
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- mmap_read_unlock(kvm->mm);
-
- if (!kvm->mm->context.uses_cmm) {
- mmap_write_lock(kvm->mm);
- kvm->mm->context.uses_cmm = 1;
- mmap_write_unlock(kvm->mm);
- }
+ WRITE_ONCE(kvm->arch.gmap->uses_cmm, 1);
out:
+ kvm_s390_free_mmu_cache(mc);
vfree(bits);
return r;
}
@@ -2923,9 +2729,6 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
scoped_guard(srcu, &kvm->srcu) {
- if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr))
- return PGM_ADDRESSING;
-
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)
return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
@@ -2938,7 +2741,6 @@ static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size))
return -EFAULT;
}
-
return 0;
}
@@ -2967,9 +2769,6 @@ static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *m
return -EFAULT;
scoped_guard(srcu, &kvm->srcu) {
- if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr))
- return PGM_ADDRESSING;
-
r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new,
mop->key, &success);
@@ -3329,11 +3128,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (type)
goto out_err;
#endif
-
- rc = s390_enable_sie();
- if (rc)
- goto out_err;
-
rc = -ENOMEM;
if (!sclp.has_64bscao)
@@ -3413,6 +3207,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
VM_EVENT(kvm, 3, "vm created with type %lu", type);
+ kvm->arch.mem_limit = type & KVM_VM_S390_UCONTROL ? KVM_S390_NO_MEM_LIMIT : sclp.hamax + 1;
+ kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit));
+ if (!kvm->arch.gmap)
+ goto out_err;
+ kvm->arch.gmap->pfault_enabled = 0;
+
if (type & KVM_VM_S390_UCONTROL) {
struct kvm_userspace_memory_region2 fake_memslot = {
.slot = KVM_S390_UCONTROL_MEMSLOT,
@@ -3422,23 +3222,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
.flags = 0,
};
- kvm->arch.gmap = NULL;
- kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT;
/* one flat fake memslot covering the whole address-space */
mutex_lock(&kvm->slots_lock);
KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm);
mutex_unlock(&kvm->slots_lock);
+ kvm->arch.gmap->is_ucontrol = 1;
} else {
- if (sclp.hamax == U64_MAX)
- kvm->arch.mem_limit = TASK_SIZE_MAX;
- else
- kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX,
- sclp.hamax + 1);
- kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
- if (!kvm->arch.gmap)
- goto out_err;
- kvm->arch.gmap->private = kvm;
- kvm->arch.gmap->pfault_enabled = 0;
+ struct crst_table *table = dereference_asce(kvm->arch.gmap->asce);
+
+ crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val);
}
kvm->arch.use_pfmfi = sclp.has_pfmfi;
@@ -3472,8 +3264,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
sca_del_vcpu(vcpu);
kvm_s390_update_topology_change_report(vcpu->kvm, 1);
- if (kvm_is_ucontrol(vcpu->kvm))
- gmap_remove(vcpu->arch.gmap);
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock)
+ gmap_remove_child(vcpu->arch.gmap);
+ gmap_dispose(vcpu->arch.gmap);
+ }
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -3481,6 +3276,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (kvm_s390_pv_cpu_get_handle(vcpu))
kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc);
free_page((unsigned long)(vcpu->arch.sie_block));
+ kvm_s390_free_mmu_cache(vcpu->arch.mc);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -3507,25 +3303,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
- if (!kvm_is_ucontrol(kvm))
- gmap_remove(kvm->arch.gmap);
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
kvm_s390_vsie_destroy(kvm);
+ gmap_dispose(kvm->arch.gmap);
KVM_EVENT(3, "vm 0x%p destroyed", kvm);
}
-/* Section: vcpu related */
-static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.gmap = gmap_create(current->mm, -1UL);
- if (!vcpu->arch.gmap)
- return -ENOMEM;
- vcpu->arch.gmap->private = vcpu->kvm;
-
- return 0;
-}
-
static void sca_del_vcpu(struct kvm_vcpu *vcpu)
{
if (!kvm_s390_use_sca_entries())
@@ -3961,9 +3745,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
int rc;
BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
+ vcpu->arch.mc = kvm_s390_new_mmu_cache();
+ if (!vcpu->arch.mc)
+ return -ENOMEM;
sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!sie_page)
+ if (!sie_page) {
+ kvm_s390_free_mmu_cache(vcpu->arch.mc);
+ vcpu->arch.mc = NULL;
return -ENOMEM;
+ }
vcpu->arch.sie_block = &sie_page->sie_block;
vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb);
@@ -4005,8 +3795,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
if (kvm_is_ucontrol(vcpu->kvm)) {
- rc = __kvm_ucontrol_vcpu_init(vcpu);
- if (rc)
+ rc = -ENOMEM;
+ vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL);
+ if (!vcpu->arch.gmap)
goto out_free_sie_block;
}
@@ -4022,8 +3813,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
return 0;
out_ucontrol_uninit:
- if (kvm_is_ucontrol(vcpu->kvm))
- gmap_remove(vcpu->arch.gmap);
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ gmap_remove_child(vcpu->arch.gmap);
+ gmap_dispose(vcpu->arch.gmap);
+ }
out_free_sie_block:
free_page((unsigned long)(vcpu->arch.sie_block));
return rc;
@@ -4087,32 +3880,6 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
kvm_s390_vcpu_request(vcpu);
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end)
-{
- struct kvm *kvm = gmap->private;
- struct kvm_vcpu *vcpu;
- unsigned long prefix;
- unsigned long i;
-
- trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap));
-
- if (gmap_is_shadow(gmap))
- return;
- if (start >= 1UL << 31)
- /* We are only interested in prefix pages */
- return;
- kvm_for_each_vcpu(i, vcpu, kvm) {
- /* match against both prefix pages */
- prefix = kvm_s390_get_prefix(vcpu);
- if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
- VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
- start, end);
- kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
- }
- }
-}
-
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
/* do not poll with more than halt_poll_max_steal percent of steal time */
@@ -4496,72 +4263,53 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu)
return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS);
}
-static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags)
+static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr)
{
- struct kvm *kvm = gmap->private;
- gfn_t gfn = gpa_to_gfn(gaddr);
- bool unlocked;
- hva_t vmaddr;
- gpa_t tmp;
+ union crste *crstep;
+ union pte *ptep;
int rc;
- if (kvm_is_ucontrol(kvm)) {
- tmp = __gmap_translate(gmap, gaddr);
- gfn = gpa_to_gfn(tmp);
- }
-
- vmaddr = gfn_to_hva(kvm, gfn);
- rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
- if (!rc)
- rc = __gmap_link(gmap, gaddr, vmaddr);
- return rc;
-}
-
-/**
- * __kvm_s390_mprotect_many() - Apply specified protection to guest pages
- * @gmap: the gmap of the guest
- * @gpa: the starting guest address
- * @npages: how many pages to protect
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- * @bits: pgste notification bits to set
- *
- * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one()
- *
- * Context: kvm->srcu and gmap->mm need to be held in read mode
- */
-int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
- unsigned long bits)
-{
- unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
- gpa_t end = gpa + npages * PAGE_SIZE;
- int rc;
-
- for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) {
- rc = gmap_protect_one(gmap, gpa, prot, bits);
- if (rc == -EAGAIN) {
- __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag);
- rc = gmap_protect_one(gmap, gpa, prot, bits);
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ /*
+ * This translates the per-vCPU guest address into a
+ * fake guest address, which can then be used with the
+ * fake memslots that are identity mapping userspace.
+ * This allows ucontrol VMs to use the normal fault
+ * resolution path, like normal VMs.
+ */
+ rc = dat_entry_walk(NULL, gpa_to_gfn(*gaddr), vcpu->arch.gmap->asce,
+ 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep);
+ if (rc) {
+ vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
+ vcpu->run->s390_ucontrol.trans_exc_code = *gaddr;
+ vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION;
+ return -EREMOTE;
}
- if (rc < 0)
- return rc;
+ *gaddr &= ~_SEGMENT_MASK;
+ *gaddr |= dat_get_ptval(pte_table_start(ptep), PTVAL_VMADDR) << _SEGMENT_SHIFT;
}
-
return 0;
}
-static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu)
+static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu)
{
gpa_t gaddr = kvm_s390_get_prefix(vcpu);
- int idx, rc;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- mmap_read_lock(vcpu->arch.gmap->mm);
+ gfn_t gfn;
+ int rc;
- rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT);
+ if (vcpu_ucontrol_translate(vcpu, &gaddr))
+ return -EREMOTE;
+ gfn = gpa_to_gfn(gaddr);
- mmap_read_unlock(vcpu->arch.gmap->mm);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true);
+ if (rc)
+ return rc;
+ rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true);
+ if (rc)
+ return rc;
+ scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
+ rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn);
return rc;
}
@@ -4581,7 +4329,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) {
int rc;
- rc = kvm_s390_mprotect_notify_prefix(vcpu);
+ rc = kvm_s390_fixup_prefix(vcpu);
if (rc) {
kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
return rc;
@@ -4631,7 +4379,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
* CMM has been used.
*/
if ((vcpu->kvm->arch.use_cmma) &&
- (vcpu->kvm->mm->context.uses_cmm))
+ (vcpu->arch.gmap->uses_cmm))
vcpu->arch.sie_block->ecb2 |= ECB2_CMMA;
goto retry;
}
@@ -4839,98 +4587,25 @@ static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu)
current->thread.gmap_int_code, current->thread.gmap_teid.val);
}
-/*
- * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu
- * @vcpu: the vCPU whose gmap is to be fixed up
- * @gfn: the guest frame number used for memslots (including fake memslots)
- * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps
- * @foll: FOLL_* flags
- *
- * Return: 0 on success, < 0 in case of error.
- * Context: The mm lock must not be held before calling. May sleep.
- */
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll)
-{
- struct kvm_memory_slot *slot;
- unsigned int fault_flags;
- bool writable, unlocked;
- unsigned long vmaddr;
- struct page *page;
- kvm_pfn_t pfn;
+static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr)
+{
+ struct guest_fault f = {
+ .write_attempt = wr,
+ .attempt_pfault = vcpu->arch.gmap->pfault_enabled,
+ };
int rc;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return vcpu_post_run_addressing_exception(vcpu);
-
- fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
- if (vcpu->arch.gmap->pfault_enabled)
- foll |= FOLL_NOWAIT;
- vmaddr = __gfn_to_hva_memslot(slot, gfn);
-
-try_again:
- pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page);
+ if (vcpu_ucontrol_translate(vcpu, &gaddr))
+ return -EREMOTE;
+ f.gfn = gpa_to_gfn(gaddr);
- /* Access outside memory, inject addressing exception */
- if (is_noslot_pfn(pfn))
+ rc = kvm_s390_faultin_gfn(vcpu, NULL, &f);
+ if (rc <= 0)
+ return rc;
+ if (rc == PGM_ADDRESSING)
return vcpu_post_run_addressing_exception(vcpu);
- /* Signal pending: try again */
- if (pfn == KVM_PFN_ERR_SIGPENDING)
- return -EAGAIN;
-
- /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */
- if (pfn == KVM_PFN_ERR_NEEDS_IO) {
- trace_kvm_s390_major_guest_pfault(vcpu);
- if (kvm_arch_setup_async_pf(vcpu))
- return 0;
- vcpu->stat.pfault_sync++;
- /* Could not setup async pfault, try again synchronously */
- foll &= ~FOLL_NOWAIT;
- goto try_again;
- }
- /* Any other error */
- if (is_error_pfn(pfn))
- return -EFAULT;
-
- /* Success */
- mmap_read_lock(vcpu->arch.gmap->mm);
- /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */
- rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked);
- if (!rc)
- rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr);
- scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
- kvm_release_faultin_page(vcpu->kvm, page, false, writable);
- }
- mmap_read_unlock(vcpu->arch.gmap->mm);
- return rc;
-}
-
-static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll)
-{
- unsigned long gaddr_tmp;
- gfn_t gfn;
-
- gfn = gpa_to_gfn(gaddr);
- if (kvm_is_ucontrol(vcpu->kvm)) {
- /*
- * This translates the per-vCPU guest address into a
- * fake guest address, which can then be used with the
- * fake memslots that are identity mapping userspace.
- * This allows ucontrol VMs to use the normal fault
- * resolution path, like normal VMs.
- */
- mmap_read_lock(vcpu->arch.gmap->mm);
- gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr);
- mmap_read_unlock(vcpu->arch.gmap->mm);
- if (gaddr_tmp == -EFAULT) {
- vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
- vcpu->run->s390_ucontrol.trans_exc_code = gaddr;
- vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION;
- return -EREMOTE;
- }
- gfn = gpa_to_gfn(gaddr_tmp);
- }
- return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll);
+ KVM_BUG_ON(rc, vcpu->kvm);
+ return -EINVAL;
}
static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
@@ -5102,7 +4777,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs,
- vcpu->arch.gmap->asce);
+ vcpu->arch.gmap->asce.val);
__enable_cpu_timer_accounting(vcpu);
guest_timing_exit_irqoff();
@@ -5633,8 +5308,8 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ void *tmpbuf __free(kvfree) = NULL;
enum gacc_mode acc_mode;
- void *tmpbuf = NULL;
int r;
r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION |
@@ -5656,32 +5331,21 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
acc_mode, mop->key);
- goto out_inject;
- }
- if (acc_mode == GACC_FETCH) {
+ } else if (acc_mode == GACC_FETCH) {
r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
- if (r)
- goto out_inject;
- if (copy_to_user(uaddr, tmpbuf, mop->size)) {
- r = -EFAULT;
- goto out_free;
- }
+ if (!r && copy_to_user(uaddr, tmpbuf, mop->size))
+ return -EFAULT;
} else {
- if (copy_from_user(tmpbuf, uaddr, mop->size)) {
- r = -EFAULT;
- goto out_free;
- }
+ if (copy_from_user(tmpbuf, uaddr, mop->size))
+ return -EFAULT;
r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
}
-out_inject:
if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
-out_free:
- vfree(tmpbuf);
return r;
}
@@ -5871,37 +5535,39 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#ifdef CONFIG_KVM_S390_UCONTROL
case KVM_S390_UCAS_MAP: {
- struct kvm_s390_ucas_mapping ucasmap;
+ struct kvm_s390_ucas_mapping ucas;
- if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
- r = -EFAULT;
+ r = -EFAULT;
+ if (copy_from_user(&ucas, argp, sizeof(ucas)))
break;
- }
- if (!kvm_is_ucontrol(vcpu->kvm)) {
- r = -EINVAL;
+ r = -EINVAL;
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ break;
+ if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE))
break;
- }
- r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr,
- ucasmap.vcpu_addr, ucasmap.length);
+ r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr),
+ gpa_to_gfn(ucas.vcpu_addr),
+ ucas.length >> _SEGMENT_SHIFT);
break;
}
case KVM_S390_UCAS_UNMAP: {
- struct kvm_s390_ucas_mapping ucasmap;
+ struct kvm_s390_ucas_mapping ucas;
- if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) {
- r = -EFAULT;
+ r = -EFAULT;
+ if (copy_from_user(&ucas, argp, sizeof(ucas)))
break;
- }
- if (!kvm_is_ucontrol(vcpu->kvm)) {
- r = -EINVAL;
+ r = -EINVAL;
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ break;
+ if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE))
break;
- }
- r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr,
- ucasmap.length);
+ gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr),
+ ucas.length >> _SEGMENT_SHIFT);
+ r = 0;
break;
}
#endif
@@ -6074,34 +5740,39 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ struct kvm_s390_mmu_cache *mc = NULL;
int rc = 0;
- if (kvm_is_ucontrol(kvm))
+ if (change == KVM_MR_FLAGS_ONLY)
return;
+ mc = kvm_s390_new_mmu_cache();
+ if (!mc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
switch (change) {
case KVM_MR_DELETE:
- rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
- old->npages * PAGE_SIZE);
+ rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
break;
case KVM_MR_MOVE:
- rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
- old->npages * PAGE_SIZE);
+ rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages);
if (rc)
break;
fallthrough;
case KVM_MR_CREATE:
- rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr,
- new->base_gfn * PAGE_SIZE,
- new->npages * PAGE_SIZE);
+ rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages);
break;
case KVM_MR_FLAGS_ONLY:
break;
default:
WARN(1, "Unknown KVM MR CHANGE: %d\n", change);
}
+out:
if (rc)
pr_warn("failed to commit memory region\n");
+ kvm_s390_free_mmu_cache(mc);
return;
}
@@ -6115,7 +5786,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
*/
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return false;
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end);
}
/**
@@ -6128,7 +5800,8 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
*/
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return false;
+ scoped_guard(read_lock, &kvm->mmu_lock)
+ return gmap_age_gfn(kvm->arch.gmap, range->start, range->end);
}
/**
@@ -6145,7 +5818,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
*/
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- return false;
+ return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end);
}
static inline unsigned long nonhyp_mask(int i)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 495ee9caaa30..8a979b1f1a7b 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -19,6 +19,8 @@
#include <asm/facility.h>
#include <asm/processor.h>
#include <asm/sclp.h>
+#include "dat.h"
+#include "gmap.h"
#define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
@@ -114,9 +116,7 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
static inline int kvm_is_ucontrol(struct kvm *kvm)
{
#ifdef CONFIG_KVM_S390_UCONTROL
- if (kvm->arch.gmap)
- return 0;
- return 1;
+ return kvm->arch.gmap->is_ucontrol;
#else
return 0;
#endif
@@ -440,14 +440,10 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu);
/* implemented in vsie.c */
int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
-void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end);
void kvm_s390_vsie_init(struct kvm *kvm);
void kvm_s390_vsie_destroy(struct kvm *kvm);
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
-
-/* implemented in gmap-vsie.c */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
+int gmap_shadow_valid(struct gmap *sg, union asce asce, int edat_level);
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
@@ -469,15 +465,9 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
-int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags);
int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
unsigned long bits);
-static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags)
-{
- return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags);
-}
-
bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu);
/* implemented in diag.c */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 9a71b6e00948..4ecc20688db6 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -21,13 +21,14 @@
#include <asm/ebcdic.h>
#include <asm/sysinfo.h>
#include <asm/page-states.h>
-#include <asm/gmap.h>
#include <asm/ptrace.h>
#include <asm/sclp.h>
#include <asm/ap.h>
+#include <asm/gmap_helpers.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
+#include "gmap.h"
static int handle_ri(struct kvm_vcpu *vcpu)
{
@@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
if (vcpu->arch.skey_enabled)
return 0;
- rc = s390_enable_skey();
+ rc = gmap_enable_skeys(vcpu->arch.gmap);
VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
if (rc)
return rc;
@@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
static int handle_iske(struct kvm_vcpu *vcpu)
{
- unsigned long gaddr, vmaddr;
- unsigned char key;
+ unsigned long gaddr;
int reg1, reg2;
- bool unlocked;
+ union skey key;
int rc;
vcpu->stat.instruction_iske++;
@@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu)
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
- vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-retry:
- unlocked = false;
- mmap_read_lock(current->mm);
- rc = get_guest_storage_key(current->mm, vmaddr, &key);
-
- if (rc) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- if (!rc) {
- mmap_read_unlock(current->mm);
- goto retry;
- }
- }
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key);
+ if (rc > 0)
+ return kvm_s390_inject_program_int(vcpu, rc);
if (rc < 0)
return rc;
vcpu->run->s.regs.gprs[reg1] &= ~0xff;
- vcpu->run->s.regs.gprs[reg1] |= key;
+ vcpu->run->s.regs.gprs[reg1] |= key.skey;
return 0;
}
static int handle_rrbe(struct kvm_vcpu *vcpu)
{
- unsigned long vmaddr, gaddr;
+ unsigned long gaddr;
int reg1, reg2;
- bool unlocked;
int rc;
vcpu->stat.instruction_rrbe++;
@@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
- vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-retry:
- unlocked = false;
- mmap_read_lock(current->mm);
- rc = reset_guest_reference_bit(current->mm, vmaddr);
- if (rc < 0) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- if (!rc) {
- mmap_read_unlock(current->mm);
- goto retry;
- }
- }
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr));
+ if (rc > 0)
+ return kvm_s390_inject_program_int(vcpu, rc);
if (rc < 0)
return rc;
kvm_s390_set_psw_cc(vcpu, rc);
@@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu)
{
unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
unsigned long start, end;
- unsigned char key, oldkey;
+ union skey key, oldkey;
int reg1, reg2;
- bool unlocked;
int rc;
vcpu->stat.instruction_sske++;
@@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
- key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+ key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
if (m3 & SSKE_MB) {
@@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu)
}
while (start != end) {
- unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
- unlocked = false;
-
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
- mmap_read_lock(current->mm);
- rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
- m3 & SSKE_NQ, m3 & SSKE_MR,
- m3 & SSKE_MC);
-
- if (rc < 0) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- rc = !rc ? -EAGAIN : rc;
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+ rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
+ gpa_to_gfn(start), key, &oldkey,
+ m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC);
}
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
+ if (rc > 1)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- if (rc == -EAGAIN)
+ if (rc == -ENOMEM) {
+ kvm_s390_mmu_cache_topup(vcpu->arch.mc);
continue;
+ }
if (rc < 0)
return rc;
start += PAGE_SIZE;
@@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
} else {
kvm_s390_set_psw_cc(vcpu, rc);
vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
- vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+ vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8;
}
}
if (m3 & SSKE_MB) {
@@ -1082,7 +1041,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
bool mr = false, mc = false, nq;
int reg1, reg2;
unsigned long start, end;
- unsigned char key;
+ union skey key;
vcpu->stat.instruction_pfmf++;
@@ -1110,7 +1069,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
- key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
+ key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
@@ -1141,14 +1100,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
while (start != end) {
- unsigned long vmaddr;
- bool unlocked = false;
-
- /* Translate guest address to host address */
- vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
- if (kvm_is_error_hva(vmaddr))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -1159,19 +1110,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- mmap_read_lock(current->mm);
- rc = cond_set_guest_storage_key(current->mm, vmaddr,
- key, NULL, nq, mr, mc);
- if (rc < 0) {
- rc = fixup_user_fault(current->mm, vmaddr,
- FAULT_FLAG_WRITE, &unlocked);
- rc = !rc ? -EAGAIN : rc;
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+ rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
+ gpa_to_gfn(start), key,
+ NULL, nq, mr, mc);
}
- mmap_read_unlock(current->mm);
- if (rc == -EFAULT)
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- if (rc == -EAGAIN)
+ if (rc > 1)
+ return kvm_s390_inject_program_int(vcpu, rc);
+ if (rc == -ENOMEM) {
+ kvm_s390_mmu_cache_topup(vcpu->arch.mc);
continue;
+ }
if (rc < 0)
return rc;
}
@@ -1195,8 +1144,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
{
int r1, r2, nappended, entries;
- unsigned long gfn, hva, res, pgstev, ptev;
+ union essa_state state;
unsigned long *cbrlo;
+ unsigned long gfn;
+ bool dirtied;
/*
* We don't need to set SD.FPF.SK to 1 here, because if we have a
@@ -1205,33 +1156,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
kvm_s390_get_regs_rre(vcpu, &r1, &r2);
gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT;
- hva = gfn_to_hva(vcpu->kvm, gfn);
entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
- if (kvm_is_error_hva(hva))
- return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-
- nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev);
- if (nappended < 0) {
- res = orc ? 0x10 : 0;
- vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */
+ nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied);
+ vcpu->run->s.regs.gprs[r1] = state.val;
+ if (nappended < 0)
return 0;
- }
- res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22;
- /*
- * Set the block-content state part of the result. 0 means resident, so
- * nothing to do if the page is valid. 2 is for preserved pages
- * (non-present and non-zero), and 3 for zero pages (non-present and
- * zero).
- */
- if (ptev & _PAGE_INVALID) {
- res |= 2;
- if (pgstev & _PGSTE_GPS_ZERO)
- res |= 1;
- }
- if (pgstev & _PGSTE_GPS_NODAT)
- res |= 0x20;
- vcpu->run->s.regs.gprs[r1] = res;
/*
* It is possible that all the normal 511 slots were full, in which case
* we will now write in the 512th slot, which is reserved for host use.
@@ -1243,17 +1173,34 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
cbrlo[entries] = gfn << PAGE_SHIFT;
}
- if (orc) {
- struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn);
-
- /* Increment only if we are really flipping the bit */
- if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
- atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
- }
+ if (dirtied)
+ atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
return nappended;
}
+static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len)
+{
+ union crste *crstep;
+ union pgste pgste;
+ union pte *ptep;
+ int i;
+
+ lockdep_assert_held(&vcpu->kvm->mmu_lock);
+
+ for (i = 0; i < len; i++) {
+ if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce,
+ 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep))
+ continue;
+ if (!ptep || ptep->s.pr)
+ continue;
+ pgste = pgste_get_lock(ptep);
+ if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero)
+ gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]);
+ pgste_set_unlock(ptep, pgste);
+ }
+}
+
static int handle_essa(struct kvm_vcpu *vcpu)
{
lockdep_assert_held(&vcpu->kvm->srcu);
@@ -1289,11 +1236,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
* value really needs to be written to; if the value is
* already correct, we do nothing and avoid the lock.
*/
- if (vcpu->kvm->mm->context.uses_cmm == 0) {
- mmap_write_lock(vcpu->kvm->mm);
- vcpu->kvm->mm->context.uses_cmm = 1;
- mmap_write_unlock(vcpu->kvm->mm);
- }
+ WRITE_ONCE(vcpu->arch.gmap->uses_cmm, 1);
/*
* If we are here, we are supposed to have CMMA enabled in
* the SIE block. Enabling CMMA works on a per-CPU basis,
@@ -1307,20 +1250,22 @@ static int handle_essa(struct kvm_vcpu *vcpu)
/* Retry the ESSA instruction */
kvm_s390_retry_instr(vcpu);
} else {
- mmap_read_lock(vcpu->kvm->mm);
- i = __do_essa(vcpu, orc);
- mmap_read_unlock(vcpu->kvm->mm);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ i = __do_essa(vcpu, orc);
if (i < 0)
return i;
/* Account for the possible extra cbrl entry */
entries += i;
}
- vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
+ /* reset nceo */
+ vcpu->arch.sie_block->cbrlo &= PAGE_MASK;
cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
- mmap_read_lock(gmap->mm);
- for (i = 0; i < entries; ++i)
- __gmap_zap(gmap, cbrlo[i]);
- mmap_read_unlock(gmap->mm);
+
+ mmap_read_lock(vcpu->kvm->mm);
+ scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
+ _essa_clear_cbrl(vcpu, cbrlo, entries);
+ mmap_read_unlock(vcpu->kvm->mm);
+
return 0;
}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 6ba5a0305e25..d8a5c7b91148 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -12,13 +12,16 @@
#include <linux/minmax.h>
#include <linux/pagemap.h>
#include <linux/sched/signal.h>
-#include <asm/gmap.h>
#include <asm/uv.h>
#include <asm/mman.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
+#include "dat.h"
+#include "gaccess.h"
+#include "gmap.h"
+#include "faultin.h"
bool kvm_s390_pv_is_protected(struct kvm *kvm)
{
@@ -299,35 +302,6 @@ static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm,
return 0;
}
-/**
- * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
- * @kvm: the VM whose memory is to be cleared.
- *
- * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
- * The CPUs of the protected VM need to be destroyed beforehand.
- */
-static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
-{
- const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
- struct kvm_memory_slot *slot;
- unsigned long len;
- int srcu_idx;
-
- srcu_idx = srcu_read_lock(&kvm->srcu);
-
- /* Take the memslot containing guest absolute address 0 */
- slot = gfn_to_memslot(kvm, 0);
- /* Clear all slots or parts thereof that are below 2GB */
- while (slot && slot->base_gfn < pages_2g) {
- len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
- s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
- /* Take the next memslot */
- slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
- }
-
- srcu_read_unlock(&kvm->srcu, srcu_idx);
-}
-
static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
{
struct uv_cb_destroy_fast uvcb = {
@@ -342,7 +316,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
*rc = uvcb.header.rc;
if (rrc)
*rrc = uvcb.header.rrc;
- WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
uvcb.header.rc, uvcb.header.rrc);
WARN_ONCE(cc && uvcb.header.rc != 0x104,
@@ -391,7 +364,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EINVAL;
/* Guest with segment type ASCE, refuse to destroy asynchronously */
- if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+ if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT)
return -EINVAL;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@@ -404,8 +377,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
priv->stor_var = kvm->arch.pv.stor_var;
priv->stor_base = kvm->arch.pv.stor_base;
priv->handle = kvm_s390_pv_get_handle(kvm);
- priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
- WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
+ priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce);
if (s390_replace_asce(kvm->arch.gmap))
res = -ENOMEM;
}
@@ -415,7 +387,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
return res;
}
- kvm_s390_destroy_lower_2g(kvm);
+ gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false);
kvm_s390_clear_pv_state(kvm);
kvm->arch.pv.set_aside = priv;
@@ -449,7 +421,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
- WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
if (!cc) {
atomic_dec(&kvm->mm->context.protected_count);
kvm_s390_pv_dealloc_vm(kvm);
@@ -532,7 +503,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
* cleanup has been performed.
*/
if (need_zap && mmget_not_zero(kvm->mm)) {
- s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
+ gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false);
mmput(kvm->mm);
}
@@ -570,7 +541,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EINVAL;
/* When a fatal signal is received, stop immediately */
- if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+ if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true))
goto done;
if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
ret = -EIO;
@@ -642,7 +613,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
/* Inputs */
uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
uvcb.guest_stor_len = kvm->arch.pv.guest_len;
- uvcb.guest_asce = kvm->arch.gmap->asce;
+ uvcb.guest_asce = kvm->arch.gmap->asce.val;
uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
uvcb.conf_base_stor_origin =
virt_to_phys((void *)kvm->arch.pv.stor_base);
@@ -669,7 +640,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
}
return -EIO;
}
- kvm->arch.gmap->guest_handle = uvcb.guest_handle;
return 0;
}
@@ -704,26 +674,14 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
.tweak[1] = offset,
};
int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb);
- unsigned long vmaddr;
- bool unlocked;
*rc = uvcb.header.rc;
*rrc = uvcb.header.rrc;
if (ret == -ENXIO) {
- mmap_read_lock(kvm->mm);
- vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr));
- if (kvm_is_error_hva(vmaddr)) {
- ret = -EFAULT;
- } else {
- ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
- if (!ret)
- ret = __gmap_link(kvm->arch.gmap, addr, vmaddr);
- }
- mmap_read_unlock(kvm->mm);
+ ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true);
if (!ret)
return -EAGAIN;
- return ret;
}
if (ret && ret != -EAGAIN)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 347268f89f2f..775c6d3b33d7 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -15,7 +15,6 @@
#include <linux/io.h>
#include <linux/mman.h>
-#include <asm/gmap.h>
#include <asm/mmu_context.h>
#include <asm/sclp.h>
#include <asm/nmi.h>
@@ -23,9 +22,11 @@
#include <asm/facility.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "gmap.h"
enum vsie_page_flags {
VSIE_PAGE_IN_USE = 0,
+ VSIE_PAGE_RUNNING,
};
struct vsie_page {
@@ -62,11 +63,20 @@ struct vsie_page {
* looked up by other CPUs.
*/
unsigned long flags; /* 0x0260 */
- __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */
+ /* Per-gmap list of vsie_pages that use that gmap */
+ struct list_head list; /* 0x0268 */
+ __u8 reserved[0x0700 - 0x0278]; /* 0x0278 */
struct kvm_s390_crypto_cb crycb; /* 0x0700 */
__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
};
+static_assert(sizeof(struct vsie_page) == PAGE_SIZE);
+
+static inline bool is_vsie_page_running(struct vsie_page *vsie_page)
+{
+ return test_bit(VSIE_PAGE_RUNNING, &vsie_page->flags);
+}
+
/**
* gmap_shadow_valid() - check if a shadow guest address space matches the
* given properties and is still valid
@@ -78,11 +88,11 @@ struct vsie_page {
* properties, the caller can continue using it. Returns 0 otherwise; the
* caller has to request a new shadow gmap in this case.
*/
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+int gmap_shadow_valid(struct gmap *sg, union asce asce, int edat_level)
{
if (sg->removed)
return 0;
- return sg->orig_asce == asce && sg->edat_level == edat_level;
+ return sg->guest_asce.val == asce.val && sg->edat_level == edat_level;
}
/* trigger a validity icpt for the given scb */
@@ -612,31 +622,29 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return rc;
}
-void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
- unsigned long end)
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end)
{
- struct kvm *kvm = gmap->private;
- struct vsie_page *cur;
+ struct vsie_page *cur, *next;
unsigned long prefix;
- int i;
- if (!gmap_is_shadow(gmap))
- return;
+ KVM_BUG_ON(!gmap->is_shadow, gmap->kvm);
+ KVM_BUG_ON(!gmap->parent, gmap->kvm);
+ lockdep_assert_held(&gmap->parent->children_lock);
/*
* Only new shadow blocks are added to the list during runtime,
* therefore we can safely reference them all the time.
*/
- for (i = 0; i < kvm->arch.vsie.page_count; i++) {
- cur = READ_ONCE(kvm->arch.vsie.pages[i]);
- if (!cur)
- continue;
- if (READ_ONCE(cur->gmap) != gmap)
- continue;
+ list_for_each_entry_safe(cur, next, &gmap->scb_users, list) {
prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
/* with mso/msl, the prefix lies at an offset */
prefix += cur->scb_s.mso;
- if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+ if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1) {
prefix_unmapped_sync(cur);
+ if (gmap->removed && !is_vsie_page_running(cur)) {
+ list_del(&cur->list);
+ cur->gmap = NULL;
+ }
+ }
}
}
@@ -667,10 +675,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
+ rc = gaccess_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL, true);
if (!rc && (scb_s->ecb & ECB_TE))
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- prefix + PAGE_SIZE, NULL);
+ rc = gaccess_shadow_fault(vcpu, vsie_page->gmap,
+ prefix + PAGE_SIZE, NULL, true);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@@ -953,6 +961,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
*/
static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
+ bool wr = kvm_s390_cur_gmap_fault_is_write();
int rc;
if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION)
@@ -960,12 +969,11 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return inject_fault(vcpu, PGM_PROTECTION,
current->thread.gmap_teid.addr * PAGE_SIZE, 1);
- rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_teid.addr * PAGE_SIZE, NULL);
+ rc = gaccess_shadow_fault(vcpu, vsie_page->gmap,
+ current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
- current->thread.gmap_teid.addr * PAGE_SIZE,
- kvm_s390_cur_gmap_fault_is_write());
+ current->thread.gmap_teid.addr * PAGE_SIZE, wr);
if (rc >= 0)
vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE;
}
@@ -982,8 +990,8 @@ static void handle_last_fault(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
if (vsie_page->fault_addr)
- kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- vsie_page->fault_addr, NULL);
+ gaccess_shadow_fault(vcpu, vsie_page->gmap,
+ vsie_page->fault_addr, NULL, true);
vsie_page->fault_addr = 0;
}
@@ -1068,8 +1076,9 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
- unsigned long pei_dest, pei_src, src, dest, mask, prefix;
+ unsigned long src, dest, mask, prefix;
u64 *pei_block = &vsie_page->scb_o->mcic;
+ union mvpg_pei pei_dest, pei_src;
int edat, rc_dest, rc_src;
union ctlreg0 cr0;
@@ -1083,8 +1092,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
- rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
- rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
+ rc_dest = gaccess_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest, true);
+ rc_src = gaccess_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src, false);
/*
* Either everything went well, or something non-critical went wrong
* e.g. because of a race. In either case, simply retry.
@@ -1119,8 +1128,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
}
if (!rc_dest && !rc_src) {
- pei_block[0] = pei_dest;
- pei_block[1] = pei_src;
+ pei_block[0] = pei_dest.val;
+ pei_block[1] = pei_src.val;
return 1;
}
@@ -1182,7 +1191,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (!kvm_s390_vcpu_sie_inhibited(vcpu)) {
local_irq_disable();
guest_timing_enter_irqoff();
- rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+ rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs,
+ vsie_page->gmap->asce.val);
guest_timing_exit_irqoff();
local_irq_enable();
}
@@ -1230,42 +1240,62 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
static void release_gmap_shadow(struct vsie_page *vsie_page)
{
- if (vsie_page->gmap)
- gmap_put(vsie_page->gmap);
- WRITE_ONCE(vsie_page->gmap, NULL);
+ struct gmap *gmap = vsie_page->gmap;
+
+ KVM_BUG_ON(!gmap->parent, gmap->kvm);
+ lockdep_assert_held(&gmap->parent->children_lock);
+
+ vsie_page->gmap = NULL;
+ list_del(&vsie_page->list);
+
+ if (list_empty(&gmap->scb_users)) {
+ gmap_remove_child(gmap);
+ gmap_dispose(gmap);
+ }
prefix_unmapped(vsie_page);
}
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
{
- unsigned long asce;
union ctlreg0 cr0;
struct gmap *gmap;
+ union asce asce;
int edat;
- asce = vcpu->arch.sie_block->gcr[1];
+ asce.val = vcpu->arch.sie_block->gcr[1];
cr0.val = vcpu->arch.sie_block->gcr[0];
edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
edat += edat && test_kvm_facility(vcpu->kvm, 78);
- /*
- * ASCE or EDAT could have changed since last icpt, or the gmap
- * we're holding has been unshadowed. If the gmap is still valid,
- * we can safely reuse it.
- */
- if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
- vcpu->kvm->stat.gmap_shadow_reuse++;
- return 0;
+ scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+ if (vsie_page->gmap) {
+ /*
+ * ASCE or EDAT could have changed since last icpt, or the gmap
+ * we're holding has been unshadowed. If the gmap is still valid,
+ * we can safely reuse it.
+ */
+ if (gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
+ vcpu->kvm->stat.gmap_shadow_reuse++;
+ return 0;
+ }
+ /* release the old shadow - if any, and mark the prefix as unmapped */
+ if (vsie_page->gmap)
+ release_gmap_shadow(vsie_page);
+ }
}
-
- /* release the old shadow - if any, and mark the prefix as unmapped */
- release_gmap_shadow(vsie_page);
- gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+ gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- vcpu->kvm->stat.gmap_shadow_create++;
- WRITE_ONCE(vsie_page->gmap, gmap);
+ scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+ /* unlikely race condition, remove the previous shadow */
+ if (vsie_page->gmap)
+ release_gmap_shadow(vsie_page);
+ vcpu->kvm->stat.gmap_shadow_create++;
+ list_add(&vsie_page->list, &gmap->scb_users);
+ vsie_page->gmap = gmap;
+ prefix_unmapped(vsie_page);
+ }
return 0;
}
@@ -1321,6 +1351,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
int rc = 0;
+ set_bit(VSIE_PAGE_RUNNING, &vsie_page->flags);
while (1) {
rc = acquire_gmap_shadow(vcpu, vsie_page);
if (!rc)
@@ -1353,6 +1384,11 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
cond_resched();
}
+ scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
+ if (vsie_page->gmap && vsie_page->gmap->removed)
+ release_gmap_shadow(vsie_page);
+ clear_bit(VSIE_PAGE_RUNNING, &vsie_page->flags);
+ }
if (rc == -EFAULT) {
/*
@@ -1448,8 +1484,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
vsie_page->scb_gpa = ULONG_MAX;
/* Double use of the same address or allocation failure. */
- if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9,
- vsie_page)) {
+ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) {
put_vsie_page(vsie_page);
mutex_unlock(&kvm->arch.vsie.mutex);
return NULL;
@@ -1458,7 +1493,11 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
mutex_unlock(&kvm->arch.vsie.mutex);
memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
- release_gmap_shadow(vsie_page);
+ if (vsie_page->gmap) {
+ scoped_guard(spinlock, &vsie_page->gmap->parent->children_lock)
+ release_gmap_shadow(vsie_page);
+ }
+ prefix_unmapped(vsie_page);
vsie_page->fault_addr = 0;
vsie_page->scb_s.ihcpu = 0xffffU;
return vsie_page;
@@ -1535,8 +1574,10 @@ void kvm_s390_vsie_destroy(struct kvm *kvm)
mutex_lock(&kvm->arch.vsie.mutex);
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
vsie_page = kvm->arch.vsie.pages[i];
+ scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
+ if (vsie_page->gmap)
+ release_gmap_shadow(vsie_page);
kvm->arch.vsie.pages[i] = NULL;
- release_gmap_shadow(vsie_page);
/* free the radix tree entry */
if (vsie_page->scb_gpa != ULONG_MAX)
radix_tree_delete(&kvm->arch.vsie.addr_to_page,
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 1a6ba105e071..0ac2f3998b14 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -34,136 +34,19 @@ void debug_user_asce(int exit)
}
#endif /*CONFIG_DEBUG_ENTRY */
-union oac {
- unsigned int val;
- struct {
- struct {
- unsigned short key : 4;
- unsigned short : 4;
- unsigned short as : 2;
- unsigned short : 4;
- unsigned short k : 1;
- unsigned short a : 1;
- } oac1;
- struct {
- unsigned short key : 4;
- unsigned short : 4;
- unsigned short as : 2;
- unsigned short : 4;
- unsigned short k : 1;
- unsigned short a : 1;
- } oac2;
- };
-};
-
-static uaccess_kmsan_or_inline __must_check unsigned long
-raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key)
-{
- unsigned long osize;
- union oac spec = {
- .oac2.key = key,
- .oac2.as = PSW_BITS_AS_SECONDARY,
- .oac2.k = 1,
- .oac2.a = 1,
- };
- int cc;
-
- while (1) {
- osize = size;
- asm_inline volatile(
- " lr %%r0,%[spec]\n"
- "0: mvcos %[to],%[from],%[size]\n"
- "1: nopr %%r7\n"
- CC_IPM(cc)
- EX_TABLE_UA_MVCOS_FROM(0b, 0b)
- EX_TABLE_UA_MVCOS_FROM(1b, 0b)
- : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to)
- : [spec] "d" (spec.val), [from] "Q" (*(const char __user *)from)
- : CC_CLOBBER_LIST("memory", "0"));
- if (CC_TRANSFORM(cc) == 0)
- return osize - size;
- size -= 4096;
- to += 4096;
- from += 4096;
- }
-}
-
-unsigned long _copy_from_user_key(void *to, const void __user *from,
- unsigned long n, unsigned long key)
-{
- unsigned long res = n;
-
- might_fault();
- if (!should_fail_usercopy()) {
- instrument_copy_from_user_before(to, from, n);
- res = raw_copy_from_user_key(to, from, n, key);
- instrument_copy_from_user_after(to, from, n, res);
- }
- if (unlikely(res))
- memset(to + (n - res), 0, res);
- return res;
-}
-EXPORT_SYMBOL(_copy_from_user_key);
-
-static uaccess_kmsan_or_inline __must_check unsigned long
-raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key)
-{
- unsigned long osize;
- union oac spec = {
- .oac1.key = key,
- .oac1.as = PSW_BITS_AS_SECONDARY,
- .oac1.k = 1,
- .oac1.a = 1,
- };
- int cc;
-
- while (1) {
- osize = size;
- asm_inline volatile(
- " lr %%r0,%[spec]\n"
- "0: mvcos %[to],%[from],%[size]\n"
- "1: nopr %%r7\n"
- CC_IPM(cc)
- EX_TABLE_UA_MVCOS_TO(0b, 0b)
- EX_TABLE_UA_MVCOS_TO(1b, 0b)
- : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to)
- : [spec] "d" (spec.val), [from] "Q" (*(const char *)from)
- : CC_CLOBBER_LIST("memory", "0"));
- if (CC_TRANSFORM(cc) == 0)
- return osize - size;
- size -= 4096;
- to += 4096;
- from += 4096;
- }
-}
-
-unsigned long _copy_to_user_key(void __user *to, const void *from,
- unsigned long n, unsigned long key)
-{
- might_fault();
- if (should_fail_usercopy())
- return n;
- instrument_copy_to_user(to, from, n);
- return raw_copy_to_user_key(to, from, n, key);
-}
-EXPORT_SYMBOL(_copy_to_user_key);
-
#define CMPXCHG_USER_KEY_MAX_LOOPS 128
-static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval,
- unsigned int old, unsigned int new,
- unsigned int mask, unsigned long key)
+static nokprobe_inline int __cmpxchg_key_small(void *address, unsigned int *uval,
+ unsigned int old, unsigned int new,
+ unsigned int mask, unsigned long key)
{
unsigned long count;
unsigned int prev;
- bool sacf_flag;
int rc = 0;
skey_regions_initialize();
- sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
- " sacf 256\n"
" llill %[count],%[max_loops]\n"
"0: l %[prev],%[address]\n"
"1: nr %[prev],%[mask]\n"
@@ -178,8 +61,7 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig
" nr %[tmp],%[mask]\n"
" jnz 5f\n"
" brct %[count],2b\n"
- "5: sacf 768\n"
- " spka %[default_key]\n"
+ "5: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
@@ -197,16 +79,16 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig
[default_key] "J" (PAGE_DEFAULT_KEY),
[max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
: "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
*uval = prev;
if (!count)
rc = -EAGAIN;
return rc;
}
-int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
- unsigned char old, unsigned char new, unsigned long key)
+int __kprobes __cmpxchg_key1(void *addr, unsigned char *uval, unsigned char old,
+ unsigned char new, unsigned long key)
{
+ unsigned long address = (unsigned long)addr;
unsigned int prev, shift, mask, _old, _new;
int rc;
@@ -215,15 +97,16 @@ int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
_old = (unsigned int)old << shift;
_new = (unsigned int)new << shift;
mask = ~(0xff << shift);
- rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+ rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key);
*uval = prev >> shift;
return rc;
}
-EXPORT_SYMBOL(__cmpxchg_user_key1);
+EXPORT_SYMBOL(__cmpxchg_key1);
-int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
- unsigned short old, unsigned short new, unsigned long key)
+int __kprobes __cmpxchg_key2(void *addr, unsigned short *uval, unsigned short old,
+ unsigned short new, unsigned long key)
{
+ unsigned long address = (unsigned long)addr;
unsigned int prev, shift, mask, _old, _new;
int rc;
@@ -232,27 +115,23 @@ int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
_old = (unsigned int)old << shift;
_new = (unsigned int)new << shift;
mask = ~(0xffff << shift);
- rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
+ rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key);
*uval = prev >> shift;
return rc;
}
-EXPORT_SYMBOL(__cmpxchg_user_key2);
+EXPORT_SYMBOL(__cmpxchg_key2);
-int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
- unsigned int old, unsigned int new, unsigned long key)
+int __kprobes __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old,
+ unsigned int new, unsigned long key)
{
unsigned int prev = old;
- bool sacf_flag;
int rc = 0;
skey_regions_initialize();
- sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
- " sacf 256\n"
"0: cs %[prev],%[new],%[address]\n"
- "1: sacf 768\n"
- " spka %[default_key]\n"
+ "1: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
@@ -264,27 +143,22 @@ int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
[key] "a" (key << 4),
[default_key] "J" (PAGE_DEFAULT_KEY)
: "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
*uval = prev;
return rc;
}
-EXPORT_SYMBOL(__cmpxchg_user_key4);
+EXPORT_SYMBOL(__cmpxchg_key4);
-int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
- unsigned long old, unsigned long new, unsigned long key)
+int __kprobes __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old,
+ unsigned long new, unsigned long key)
{
unsigned long prev = old;
- bool sacf_flag;
int rc = 0;
skey_regions_initialize();
- sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
- " sacf 256\n"
"0: csg %[prev],%[new],%[address]\n"
- "1: sacf 768\n"
- " spka %[default_key]\n"
+ "1: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
@@ -296,27 +170,22 @@ int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
[key] "a" (key << 4),
[default_key] "J" (PAGE_DEFAULT_KEY)
: "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
*uval = prev;
return rc;
}
-EXPORT_SYMBOL(__cmpxchg_user_key8);
+EXPORT_SYMBOL(__cmpxchg_key8);
-int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
- __uint128_t old, __uint128_t new, unsigned long key)
+int __kprobes __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old,
+ __uint128_t new, unsigned long key)
{
__uint128_t prev = old;
- bool sacf_flag;
int rc = 0;
skey_regions_initialize();
- sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
- " sacf 256\n"
"0: cdsg %[prev],%[new],%[address]\n"
- "1: sacf 768\n"
- " spka %[default_key]\n"
+ "1: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev])
@@ -328,8 +197,7 @@ int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
[key] "a" (key << 4),
[default_key] "J" (PAGE_DEFAULT_KEY)
: "memory", "cc");
- disable_sacf_uaccess(sacf_flag);
*uval = prev;
return rc;
}
-EXPORT_SYMBOL(__cmpxchg_user_key16);
+EXPORT_SYMBOL(__cmpxchg_key16);
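
To make the sub-word compare-and-swap trick in __cmpxchg_key_small() and the
1-/2-byte entry points easier to follow, here is a portable sketch of the same
idea built on the GCC/Clang __atomic builtins. It is only an illustration: the
real helpers run under the guest storage key via spka and use the cs
instruction directly, and small_cas()/small_cas_u8() are names made up for the
example.

	#include <errno.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define SMALL_CAS_MAX_LOOPS 128	/* mirrors CMPXCHG_USER_KEY_MAX_LOOPS */

	/*
	 * CAS the bits selected by ~mask inside *word. @shifted_old and
	 * @shifted_new are already shifted into position; @mask has zeros at
	 * the target bits and ones elsewhere. *uval receives the previous
	 * target bits, still shifted.
	 */
	static int small_cas(uint32_t *word, uint32_t *uval, uint32_t shifted_old,
			     uint32_t shifted_new, uint32_t mask)
	{
		uint32_t cur = __atomic_load_n(word, __ATOMIC_RELAXED);
		uint32_t expected, desired;
		int count = SMALL_CAS_MAX_LOOPS;

		while (count--) {
			if ((cur & ~mask) != shifted_old) {
				*uval = cur & ~mask;	/* target bits differ: mismatch */
				return 0;
			}
			expected = cur;
			desired = (cur & mask) | shifted_new;
			if (__atomic_compare_exchange_n(word, &expected, desired, false,
							__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
				*uval = shifted_old;	/* swap succeeded */
				return 0;
			}
			cur = expected;	/* refreshed on failure; maybe only a neighbour changed */
		}
		*uval = cur & ~mask;
		return -EAGAIN;		/* neighbouring bytes kept changing, give up */
	}

	/* 1-byte wrapper; the shift matches s390's big-endian byte numbering */
	static int small_cas_u8(uint32_t *word, unsigned int byte, uint8_t *uval,
				uint8_t old, uint8_t new)
	{
		unsigned int shift = (3 ^ (byte & 3)) << 3;
		uint32_t prev;
		int rc;

		rc = small_cas(word, &prev, (uint32_t)old << shift,
			       (uint32_t)new << shift, ~(0xffU << shift));
		*uval = prev >> shift;
		return rc;
	}

As in __cmpxchg_key1()/__cmpxchg_key2() above, the wrapper derives shift and
mask from the low address bits, widens old and new into the containing word,
and shifts the previous value back down before returning it to the caller.
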
diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
index dca783859a73..da81519db55a 100644
--- a/arch/s390/mm/gmap_helpers.c
+++ b/arch/s390/mm/gmap_helpers.c
@@ -34,28 +34,6 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
free_swap_and_cache(entry);
}
-static inline pgste_t pgste_get_lock(pte_t *ptep)
-{
- unsigned long value = 0;
-#ifdef CONFIG_PGSTE
- unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);
-
- do {
- value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
- } while (value & PGSTE_PCL_BIT);
- value |= PGSTE_PCL_BIT;
-#endif
- return __pgste(value);
-}
-
-static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
-{
-#ifdef CONFIG_PGSTE
- barrier();
- WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
-#endif
-}
-
/**
* gmap_helper_zap_one_page() - discard a page if it was swapped.
* @mm: the mm
@@ -69,7 +47,6 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
struct vm_area_struct *vma;
spinlock_t *ptl;
- pgste_t pgste;
pte_t *ptep;
mmap_assert_locked(mm);
@@ -84,14 +61,8 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
if (unlikely(!ptep))
return;
if (pte_swap(*ptep)) {
- preempt_disable();
- pgste = pgste_get_lock(ptep);
-
ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
pte_clear(mm, vmaddr, ptep);
-
- pgste_set_unlock(ptep, pgste);
- preempt_enable();
}
pte_unmap_unlock(ptep, ptl);
}
--
2.51.1