[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251106161117.350395-13-imbrenda@linux.ibm.com>
Date: Thu, 6 Nov 2025 17:11:06 +0100
From: Claudio Imbrenda <imbrenda@...ux.ibm.com>
To: kvm@...r.kernel.org
Cc: linux-kernel@...r.kernel.org, linux-s390@...r.kernel.org,
borntraeger@...ibm.com, frankja@...ux.ibm.com, nsg@...ux.ibm.com,
nrb@...ux.ibm.com, seiden@...ux.ibm.com, schlameuss@...ux.ibm.com,
hca@...ux.ibm.com, svens@...ux.ibm.com, agordeev@...ux.ibm.com,
gor@...ux.ibm.com, david@...hat.com, gerald.schaefer@...ux.ibm.com
Subject: [PATCH v3 12/23] KVM: s390: KVM page table management functions: lifecycle management
Add page table management functions to be used for KVM guest (gmap)
page tables.
Add functions to handle memslot creation and destruction,
additional per-pagetable data stored in the PGSTEs, mapping physical
addresses into the gmap, and marking address ranges as prefix.
Signed-off-by: Claudio Imbrenda <imbrenda@...ux.ibm.com>
---
arch/s390/kvm/dat.c | 281 ++++++++++++++++++++++++++++++++++++++++++++
arch/s390/kvm/dat.h | 54 +++++++++
2 files changed, 335 insertions(+)
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 121f99335ae9..666aebec72eb 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -102,6 +102,38 @@ void dat_free_level(struct crst_table *table, bool owns_ptes)
dat_free_crst(table);
}
+int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype)
+{
+ struct crst_table *table;
+ union crste crste;
+
+ while (asce->dt > newtype) {
+ table = dereference_asce(*asce);
+ crste = table->crstes[0];
+ if (crste.h.fc)
+ return 0;
+ if (!crste.h.i) {
+ asce->rsto = crste.h.fc0.to;
+ dat_free_crst(table);
+ } else {
+ crste.h.tt--;
+ crst_table_init((void *)table, crste.val);
+ }
+ asce->dt--;
+ }
+ while (asce->dt < newtype) {
+ crste = _crste_fc0(asce->rsto, asce->dt + 1);
+ table = dat_alloc_crst_noinit(mc);
+ if (!table)
+ return -ENOMEM;
+ crst_table_init((void *)table, _CRSTE_HOLE(crste.h.tt).val);
+ table->crstes[0] = crste;
+ asce->rsto = __pa(table) >> PAGE_SHIFT;
+ asce->dt++;
+ }
+ return 0;
+}
+
/**
* dat_crstep_xchg - exchange a gmap CRSTE with another
* @crstep: pointer to the CRST entry
@@ -817,3 +849,252 @@ long dat_reset_skeys(union asce asce, gfn_t start)
return _dat_walk_gfn_range(start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, NULL);
}
+
+struct slot_priv {
+ unsigned long token;
+ struct kvm_s390_mmu_cache *mc;
+};
+
+static long _dat_slot_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct slot_priv *p = walk->priv;
+ union crste dummy = { .val = p->token };
+ union pte new_pte, pte = READ_ONCE(*ptep);
+
+ new_pte = _PTE_TOK(dummy.tok.type, dummy.tok.par);
+
+ /* Table entry already in the desired state */
+ if (pte.val == new_pte.val)
+ return 0;
+
+ dat_ptep_xchg(ptep, new_pte, gfn, walk->asce, false);
+ return 0;
+}
+
+static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ union crste new_crste, crste = READ_ONCE(*crstep);
+ struct slot_priv *p = walk->priv;
+
+ new_crste.val = p->token;
+ new_crste.h.tt = crste.h.tt;
+
+ /* Table entry already in the desired state */
+ if (crste.val == new_crste.val)
+ return 0;
+
+ /* This table entry needs to be updated */
+ if (walk->start <= gfn && walk->end >= next) {
+ dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
+ /* A lower level table was present, needs to be freed */
+ if (!crste.h.fc && !crste.h.i)
+ dat_free_level(dereference_crste(crste), true);
+ return 0;
+ }
+
+	/* A lower level table is present, things will be handled there */
+ if (!crste.h.fc && !crste.h.i)
+ return 0;
+ /* Split (install a lower level table), and handle things there */
+ return dat_split_crste(p->mc, crstep, gfn, walk->asce, false);
+}
+
+static const struct dat_walk_ops dat_slot_ops = {
+ .pte_entry = _dat_slot_pte,
+ .crste_ops = { _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, _dat_slot_crste, },
+};
+
+int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
+ u16 type, u16 param)
+{
+ struct slot_priv priv = {
+ .token = _CRSTE_TOK(0, type, param).val,
+ .mc = mc,
+ };
+
+ return _dat_walk_gfn_range(start, end, asce, &dat_slot_ops,
+ DAT_WALK_IGN_HOLES | DAT_WALK_ANY, &priv);
+}
+
+static void pgste_set_unlock_multiple(union pte *first, int n, union pgste *pgstes)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ if (!pgstes[i].pcl)
+ break;
+ pgste_set_unlock(first + i, pgstes[i]);
+ }
+}
+
+static bool pgste_get_trylock_multiple(union pte *first, int n, union pgste *pgstes)
+{
+ int i;
+
+ for (i = 0; i < n; i++) {
+ if (!pgste_get_trylock(first + i, pgstes + i))
+ break;
+ }
+ if (i == n)
+ return true;
+ pgste_set_unlock_multiple(first, n, pgstes);
+ return false;
+}
+
+unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param)
+{
+ union pgste pgstes[4] = {};
+ unsigned long res = 0;
+ int i, n;
+
+ n = param.len + 1;
+
+ while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
+ cpu_relax();
+
+ for (i = 0; i < n; i++)
+ res = res << 16 | pgstes[i].val16;
+
+ pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
+ return res;
+}
+
+void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val)
+{
+ union pgste pgstes[4] = {};
+ int i, n;
+
+ n = param.len + 1;
+
+ while (!pgste_get_trylock_multiple(table->ptes + param.offset, n, pgstes))
+ cpu_relax();
+
+ for (i = param.len; i >= 0; i--) {
+ pgstes[i].val16 = val;
+ val = val >> 16;
+ }
+
+ pgste_set_unlock_multiple(table->ptes + param.offset, n, pgstes);
+}
+
+static long _dat_test_young_pte(union pte *ptep, gfn_t start, gfn_t end, struct dat_walk *walk)
+{
+ return ptep->s.y;
+}
+
+static long _dat_test_young_crste(union crste *crstep, gfn_t start, gfn_t end,
+ struct dat_walk *walk)
+{
+ return crstep->h.fc && crstep->s.fc1.y;
+}
+
+static const struct dat_walk_ops test_age_ops = {
+ .pte_entry = _dat_test_young_pte,
+ .pmd_entry = _dat_test_young_crste,
+ .pud_entry = _dat_test_young_crste,
+};
+
+/**
+ * dat_test_age_gfn() - test whether any page in a range is young
+ * @asce: the ASCE of the guest address space to test
+ * @start: the first guest frame number of the range
+ * @end: the end (exclusive) of the range of guest frame numbers
+ * Context: called by KVM common code with the kvm mmu write lock held
+ * Return: true if any page in the given range is young, otherwise false.
+ */
+bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
+{
+ return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
+}
+
+int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
+ bool uses_skeys, struct guest_fault *f)
+{
+ union crste oldval, newval;
+ union pte newpte, oldpte;
+ union pgste pgste;
+ int rc = 0;
+
+ rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
+ if (rc)
+ return rc == -EINVAL ? rc : -EAGAIN;
+
+ if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
+ return -EINVAL;
+
+ if (f->ptep) {
+ pgste = pgste_get_lock(f->ptep);
+ oldpte = *f->ptep;
+ newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
+ newpte.s.sd = oldpte.s.sd;
+ oldpte.s.sd = 0;
+ if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
+ pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
+ if (f->callback)
+ f->callback(f);
+ } else {
+ rc = -EAGAIN;
+ }
+ pgste_set_unlock(f->ptep, pgste);
+ } else {
+ oldval = READ_ONCE(*f->crstep);
+ newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
+ f->write_attempt | oldval.s.fc1.d);
+ newval.s.fc1.sd = oldval.s.fc1.sd;
+ if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
+ crste_origin_large(oldval) != crste_origin_large(newval))
+ return -EAGAIN;
+ if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
+ return -EAGAIN;
+ if (f->callback)
+ f->callback(f);
+ }
+
+ return rc;
+}
+
+static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ union crste crste = READ_ONCE(*crstep);
+ int *n = walk->priv;
+
+ if (!crste.h.fc || crste.h.i || crste.h.p)
+ return 0;
+
+ *n = 2;
+ if (crste.s.fc1.prefix_notif)
+ return 0;
+ crste.s.fc1.prefix_notif = 1;
+ dat_crstep_xchg(crstep, crste, gfn, walk->asce);
+ return 0;
+}
+
+static long dat_set_pn_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ int *n = walk->priv;
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ if (!ptep->h.i && !ptep->h.p) {
+ pgste.prefix_notif = 1;
+ *n += 1;
+ }
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn)
+{
+ static const struct dat_walk_ops ops = {
+ .pte_entry = dat_set_pn_pte,
+ .pmd_entry = dat_set_pn_crste,
+ .pud_entry = dat_set_pn_crste,
+ };
+
+ int n = 0;
+
+ _dat_walk_gfn_range(gfn, gfn + 2, asce, &ops, DAT_WALK_IGN_HOLES, &n);
+ if (n != 2)
+ return -EAGAIN;
+ return 0;
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index bbfb141ffcb4..0dc80893ed7c 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -362,6 +362,11 @@ struct dat_walk {
void *priv;
};
+struct ptval_param {
+ unsigned char offset : 6;
+ unsigned char len : 2;
+};
+
/**
* _pte() - Useful constructor for union pte
* @pfn: the pfn this pte should point to.
@@ -460,6 +465,32 @@ struct kvm_s390_mmu_cache {
short int n_rmaps;
};
+struct guest_fault {
+ gfn_t gfn; /* Guest frame */
+ kvm_pfn_t pfn; /* Host PFN */
+ struct page *page; /* Host page */
+ union pte *ptep; /* Used to resolve the fault, or NULL */
+ union crste *crstep; /* Used to resolve the fault, or NULL */
+ bool writable; /* Mapping is writable */
+ bool write_attempt; /* Write access attempted */
+ bool attempt_pfault; /* Attempt a pfault first */
+ bool valid; /* This entry contains valid data */
+ void (*callback)(struct guest_fault *f);
+ void *priv;
+};
+
+/*
+ * 0 1 2 3 4 5 6 7
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | | PGT_ADDR |
+ * 8 | VMADDR | |
+ * 16 | |
+ * 24 | |
+ */
+#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1})
+#define PTVAL_PGT_ADDR MKPTVAL(4, 8)
+#define PTVAL_VMADDR MKPTVAL(8, 6)
+
union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new,
gfn_t gfn, union asce asce, bool uses_skeys);
bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
@@ -473,6 +504,7 @@ int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, in
int walk_level, union crste **last, union pte **ptepp);
void dat_free_level(struct crst_table *table, bool owns_ptes);
struct crst_table *dat_alloc_crst_sleepable(unsigned long init);
+int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype);
int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey);
int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
union skey skey, bool nq);
@@ -481,6 +513,16 @@ int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gf
int dat_reset_reference_bit(union asce asce, gfn_t gfn);
long dat_reset_skeys(union asce asce, gfn_t start);
+unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param);
+void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val);
+
+int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
+ u16 type, u16 param);
+int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
+bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
+int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
+ bool uses_skeys, struct guest_fault *f);
+
int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc);
#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN)
@@ -886,4 +928,16 @@ static inline int get_level(union crste *crstep, union pte *ptep)
return ptep ? TABLE_TYPE_PAGE_TABLE : crstep->h.tt;
}
+static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
+ unsigned long npages)
+{
+ return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING);
+}
+
+static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
+ unsigned long npages)
+{
+ return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0);
+}
+
#endif /* __KVM_S390_DAT_H */
--
2.51.1
Powered by blists - more mailing lists