Message-ID: <20251106161117.350395-14-imbrenda@linux.ibm.com>
Date: Thu, 6 Nov 2025 17:11:07 +0100
From: Claudio Imbrenda <imbrenda@...ux.ibm.com>
To: kvm@...r.kernel.org
Cc: linux-kernel@...r.kernel.org, linux-s390@...r.kernel.org,
borntraeger@...ibm.com, frankja@...ux.ibm.com, nsg@...ux.ibm.com,
nrb@...ux.ibm.com, seiden@...ux.ibm.com, schlameuss@...ux.ibm.com,
hca@...ux.ibm.com, svens@...ux.ibm.com, agordeev@...ux.ibm.com,
gor@...ux.ibm.com, david@...hat.com, gerald.schaefer@...ux.ibm.com
Subject: [PATCH v3 13/23] KVM: s390: KVM page table management functions: CMMA
Add page table management functions to be used for KVM guest (gmap)
page tables.

This patch adds the functions needed to handle CMMA and the ESSA
instruction.
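
For illustration, a caller such as the ESSA intercept handler is expected
to use dat_perform_essa() roughly as in the sketch below; handle_essa_gfn(),
cbrl_add_gfn() and the cmma_dirty_pages counter are placeholder names used
only for this example, not interfaces added by this series:

  /* hypothetical sketch, not part of this patch */
  static int handle_essa_gfn(struct kvm *kvm, union asce asce, gfn_t gfn, int orc)
  {
          union essa_state state;
          bool dirty = false;
          int rc;

          spin_lock(&kvm->mmu_lock);
          rc = dat_perform_essa(asce, gfn, orc, &state, &dirty);
          spin_unlock(&kvm->mmu_lock);

          /* placeholder counter for newly dirtied CMMA entries */
          if (dirty)
                  atomic64_inc(&kvm->arch.cmma_dirty_pages);
          /* placeholder helper: queue the gfn on the CBRL */
          if (rc == 1)
                  cbrl_add_gfn(kvm, gfn);

          /* the extracted state is handed back to the guest */
          return state.val;
  }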
Signed-off-by: Claudio Imbrenda <imbrenda@...ux.ibm.com>
---
arch/s390/kvm/dat.c | 262 ++++++++++++++++++++++++++++++++++++++++++++
arch/s390/kvm/dat.h | 27 +++++
2 files changed, 289 insertions(+)
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 666aebec72eb..d2a68cee8ff6 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -1098,3 +1098,265 @@ int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn)
return -EAGAIN;
return 0;
}
+
+/**
+ * dat_perform_essa() - perform ESSA actions on the PGSTE.
+ * @asce: the asce to operate on.
+ * @gfn: the guest page frame to operate on.
+ * @orc: the specific action to perform, see the ESSA_* macros.
+ * @state: the storage attributes to be returned to the guest.
+ * @dirty: returns whether the function dirtied a previously clean entry.
+ *
+ * Context: Called with kvm->mmu_lock held.
+ *
+ * Return:
+ * * 1 if the page state has been altered and the page is to be added to the CBRL
+ * * 0 if the page state has been altered, but the page is not to be added to the CBRL
+ * * -1 if the page state has not been altered and the page is not to be added to the CBRL
+ */
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty)
+{
+ union crste *crstep;
+ union pgste pgste;
+ union pte *ptep;
+ int res = 0;
+
+ if (dat_entry_walk(NULL, gfn, asce, 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) {
+ *state = (union essa_state) { .exception = 1 };
+ return -1;
+ }
+
+ pgste = pgste_get_lock(ptep);
+
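+ /* Extract the current content, usage and no-DAT state, to be returned to the guest */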
+ *state = (union essa_state) {
+ .content = (ptep->h.i << 1) + (ptep->h.i && pgste.zero),
+ .nodat = pgste.nodat,
+ .usage = pgste.usage,
+ };
+
+ switch (orc) {
+ case ESSA_GET_STATE:
+ res = -1;
+ break;
+ case ESSA_SET_STABLE:
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ pgste.nodat = 0;
+ break;
+ case ESSA_SET_UNUSED:
+ pgste.usage = PGSTE_GPS_USAGE_UNUSED;
+ if (ptep->h.i)
+ res = 1;
+ break;
+ case ESSA_SET_VOLATILE:
+ pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+ if (ptep->h.i)
+ res = 1;
+ break;
+ case ESSA_SET_POT_VOLATILE:
+ if (!ptep->h.i) {
+ pgste.usage = PGSTE_GPS_USAGE_POT_VOLATILE;
+ } else if (pgste.zero) {
+ pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+ } else if (!pgste.gc) {
+ pgste.usage = PGSTE_GPS_USAGE_VOLATILE;
+ res = 1;
+ }
+ break;
+ case ESSA_SET_STABLE_RESIDENT:
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ /*
+ * Since the resident state can go away any time after this
+ * call, we will not make this page resident. We can revisit
+ * this decision if a guest ever starts using this.
+ */
+ break;
+ case ESSA_SET_STABLE_IF_RESIDENT:
+ if (!ptep->h.i)
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ break;
+ case ESSA_SET_STABLE_NODAT:
+ pgste.usage = PGSTE_GPS_USAGE_STABLE;
+ pgste.nodat = 1;
+ break;
+ default:
+ WARN_ONCE(1, "Invalid ORC!");
+ res = -1;
+ break;
+ }
+ /* If we are discarding a page, set it to logical zero */
+ pgste.zero = res == 1;
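+ /*
+ * Every ORC except ESSA_GET_STATE marks the CMMA state of this entry
+ * as dirty; report whether it was previously clean.
+ */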
+ if (orc > 0) {
+ *dirty = !pgste.cmma_d;
+ pgste.cmma_d = 1;
+ }
+
+ pgste_set_unlock(ptep, pgste);
+
+ return res;
+}
+
+static long dat_reset_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ union pgste pgste;
+
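+ /* Clear the usage, no-DAT and CMMA-dirty state of this entry */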
+ pgste = pgste_get_lock(ptep);
+ pgste.usage = 0;
+ pgste.nodat = 0;
+ pgste.cmma_d = 0;
+ pgste_set_unlock(ptep, pgste);
+ if (need_resched())
+ return next;
+ return 0;
+}
+
+long dat_reset_cmma(union asce asce, gfn_t start)
+{
+ const struct dat_walk_ops dat_reset_cmma_ops = {
+ .pte_entry = dat_reset_cmma_pte,
+ };
+
+ return _dat_walk_gfn_range(start, asce_end(asce), asce, &dat_reset_cmma_ops,
+ DAT_WALK_IGN_HOLES, NULL);
+}
+
+struct dat_get_cmma_state {
+ gfn_t start;
+ gfn_t end;
+ unsigned int count;
+ u8 *values;
+ atomic64_t *remaining;
+};
+
+static long __dat_peek_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_get_cmma_state *state = walk->priv;
+ union pgste pgste;
+
+ pgste = pgste_get_lock(ptep);
+ state->values[gfn - walk->start] = pgste.usage | (pgste.nodat << 6);
+ pgste_set_unlock(ptep, pgste);
+ state->end = next;
+
+ return 0;
+}
+
+static long __dat_peek_cmma_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_get_cmma_state *state = walk->priv;
+
+ if (crstep->h.i)
+ state->end = min(walk->end, next);
+ return 0;
+}
+
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values)
+{
+ const struct dat_walk_ops ops = {
+ .pte_entry = __dat_peek_cmma_pte,
+ .pmd_entry = __dat_peek_cmma_crste,
+ .pud_entry = __dat_peek_cmma_crste,
+ .p4d_entry = __dat_peek_cmma_crste,
+ .pgd_entry = __dat_peek_cmma_crste,
+ };
+ struct dat_get_cmma_state state = { .values = values, };
+ int rc;
+
+ rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state);
+ *count = state.end - start;
+ /* Return success if at least one value was saved, otherwise an error. */
+ return (rc == -EFAULT && *count > 0) ? 0 : rc;
+}
+
+static long __dat_get_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_get_cmma_state *state = walk->priv;
+ union pgste pgste;
+
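+ /*
+ * Once a dirty entry has been found, stop the walk if the distance from
+ * the last reported entry grows too large or the output buffer is full.
+ */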
+ if (state->start != -1) {
+ if ((gfn - state->end) > KVM_S390_MAX_BIT_DISTANCE)
+ return 1;
+ if (gfn - state->start >= state->count)
+ return 1;
+ }
+
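+ /* Skip entries whose CMMA state is clean, without taking the PGSTE lock */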
+ if (!READ_ONCE(*pgste_of(ptep)).cmma_d)
+ return 0;
+
+ pgste = pgste_get_lock(ptep);
+ if (pgste.cmma_d) {
+ if (state->start == -1)
+ state->start = gfn;
+ pgste.cmma_d = 0;
+ atomic64_dec(state->remaining);
+ state->values[gfn - state->start] = pgste.usage | (pgste.nodat << 6);
+ state->end = next;
+ }
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem)
+{
+ const struct dat_walk_ops ops = { .pte_entry = __dat_get_cmma_pte, };
+ struct dat_get_cmma_state state = {
+ .remaining = rem,
+ .values = values,
+ .count = *count,
+ .start = -1,
+ };
+
+ _dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state);
+
+ if (state.start == -1) {
+ *count = 0;
+ } else {
+ *count = state.end - state.start;
+ *start = state.start;
+ }
+
+ return 0;
+}
+
+struct dat_set_cmma_state {
+ unsigned long mask;
+ const u8 *bits;
+};
+
+static long __dat_set_cmma_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
+{
+ struct dat_set_cmma_state *state = walk->priv;
+ union pgste pgste, tmp;
+
+ tmp.val = (state->bits[gfn - walk->start] << 24) & state->mask;
+
+ pgste = pgste_get_lock(ptep);
+ pgste.usage = tmp.usage;
+ pgste.nodat = tmp.nodat;
+ pgste_set_unlock(ptep, pgste);
+
+ return 0;
+}
+
+/*
+ * This function sets the CMMA attributes (usage and no-DAT) for the given
+ * pages. The needed page tables are allocated upfront, then the values from
+ * the input buffer, masked with @mask, are written into the PGSTEs.
+ */
+int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+ unsigned long count, unsigned long mask, const uint8_t *bits)
+{
+ const struct dat_walk_ops ops = { .pte_entry = __dat_set_cmma_pte, };
+ struct dat_set_cmma_state state = { .mask = mask, .bits = bits, };
+ union crste *crstep;
+ union pte *ptep;
+ gfn_t cur;
+ int rc;
+
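+ /*
+ * Allocate page tables for the whole range upfront, so that every page
+ * in the range has a PGSTE before the attributes are written.
+ */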
+ for (cur = ALIGN_DOWN(gfn, _PAGE_ENTRIES); cur < gfn + count; cur += _PAGE_ENTRIES) {
+ rc = dat_entry_walk(mc, cur, asce, DAT_WALK_ALLOC, TABLE_TYPE_PAGE_TABLE,
+ &crstep, &ptep);
+ if (rc)
+ return rc;
+ }
+ return _dat_walk_gfn_range(gfn, gfn + count, asce, &ops, DAT_WALK_IGN_HOLES, &state);
+}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 0dc80893ed7c..098cd454b161 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -18,6 +18,15 @@
#include <asm/tlbflush.h>
#include <asm/dat-bits.h>
+/*
+ * Base address and length must be sent at the start of each block, therefore
+ * it's cheaper to send some clean data, as long as it's less than the size of
+ * two longs.
+ */
+#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
+/* Use the same limit as for the storage keys, for consistency */
+#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
+
#define _ASCE(x) ((union asce) { .val = (x), })
#define NULL_ASCE _ASCE(0)
@@ -434,6 +443,17 @@ static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool
return res;
}
+union essa_state {
+ unsigned char val;
+ struct {
+ unsigned char : 2;
+ unsigned char nodat : 1;
+ unsigned char exception : 1;
+ unsigned char usage : 2;
+ unsigned char content : 2;
+ };
+};
+
/**
* struct vsie_rmap - reverse mapping for shadow page table entries
* @next: pointer to next rmap in the list
@@ -523,6 +543,13 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
bool uses_skeys, struct guest_fault *f);
+int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
+long dat_reset_cmma(union asce asce, gfn_t start_gfn);
+int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values);
+int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem);
+int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
+ unsigned long count, unsigned long mask, const uint8_t *bits);
+
int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc);
#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN)
--
2.51.1