linux-kernel - [PATCH 14/21] x86/intel_rdt/cqm: Add mon

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1498503368-20173-15-git-send-email-vikas.shivappa@linux.intel.com>
Date:   Mon, 26 Jun 2017 11:56:01 -0700
From:   Vikas Shivappa <vikas.shivappa@...ux.intel.com>
To:     x86@...nel.org, linux-kernel@...r.kernel.org, tglx@...utronix.de
Cc:     hpa@...or.com, peterz@...radead.org, ravi.v.shankar@...el.com,
        vikas.shivappa@...el.com, tony.luck@...el.com,
        fenghua.yu@...el.com, andi.kleen@...el.com
Subject: [PATCH 14/21] x86/intel_rdt/cqm: Add mon_data

Add a mon_data directory for the root rdtgroup and all other rdtgroups.
The directory holds all of the monitored data for all domains and events
of all resources being monitored.

The mon_data itself has a list of directories in the format
mon_<domain_name>_<domain_id>. Each of these subdirectories contain one
file per event in the mode "0444". Reading the file displays a snapshot
of the monitored data for the event the file represents.

For ex, on a 2 socket Broadwell with llc_occupancy being
monitored the mon_data contents look as below:

$ ls /sys/fs/resctrl/p1/mon_data/
mon_L3_00
mon_L3_01

Each domain directory has one file per event:
$ ls /sys/fs/resctrl/p1/mon_data/mon_L3_00/
llc_occupancy

To read current llc_occupancy of ctrl_mon group p1
$ cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
33789096

[This patch idea is based on Tony's sample patches to organise data in a
per domain directory and have one file per event (and use the fp->priv to
store mon data bits)]

Signed-off-by: Vikas Shivappa <vikas.shivappa@...ux.intel.com>
---
 arch/x86/kernel/cpu/Makefile                |   2 +-
 arch/x86/kernel/cpu/intel_rdt.c             |   4 +-
 arch/x86/kernel/cpu/intel_rdt.h             |  27 +++
 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c | 332 ++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/intel_rdt_monitor.c     |  42 ++++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c    | 155 +++++++++++++
 arch/x86/kernel/cpu/intel_rdt_schemata.c    | 286 ------------------------
 7 files changed, 559 insertions(+), 289 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
 delete mode 100644 arch/x86/kernel/cpu/intel_rdt_schemata.c

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 81b0060..1245f98 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,7 +32,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_INTEL_RDT)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o intel_rdt_monitor.o
+obj-$(CONFIG_INTEL_RDT)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index b0f8c35..63bfb47c 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -344,8 +344,8 @@ void rdt_ctrl_update(void *arg)
  * caller, return the first domain whose id is bigger than the input id.
  * The domain list is sorted by id in ascending order.
  */
-static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
-					  struct list_head **pos)
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+				   struct list_head **pos)
 {
 	struct rdt_domain *d;
 	struct list_head *l;
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index fec8ba9..631d58e 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -33,6 +33,27 @@ struct mon_evt {
 	struct list_head	list;
 };
 
+/**
+ * struct mon_data_bits - Monitoring details for each event file
+ * @rid:               Resource id associated with the event file.
+ * @evtid:             Event id associated with the event file
+ * @domid:             The domain to which the event file belongs
+ */
+union mon_data_bits {
+	void *priv;
+	struct {
+		unsigned int rid	: 10;
+		unsigned int evtid	: 8;
+		unsigned int domid	: 14;
+	} u;
+};
+
+struct rmid_read {
+	struct rdtgroup		*rgrp;
+	int			evtid;
+	u64			val;
+};
+
 extern unsigned int intel_cqm_threshold;
 extern bool rdt_alloc_enabled;
 extern int rdt_mon_features;
@@ -48,6 +69,7 @@ enum rdt_group_type {
 /**
  * struct rdtgroup - store rdtgroup's data in resctrl file system.
  * @kn:				kernfs node
+ * @mon_data_kn		kernlfs node for the mon_data directory
  * @rdtgroup_list:		linked list for all rdtgroups
  * @parent:			parent rdtgrp
  * @crdtgrp_list:		child rdtgroup node list
@@ -62,6 +84,7 @@ enum rdt_group_type {
  */
 struct rdtgroup {
 	struct kernfs_node	*kn;
+	struct kernfs_node	*mon_data_kn;
 	struct list_head	rdtgroup_list;
 	struct rdtgroup		*parent;
 	struct list_head	crdtgrp_list;
@@ -311,6 +334,8 @@ enum {
 void rdt_ctrl_update(void *arg);
 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
 void rdtgroup_kn_unlock(struct kernfs_node *kn);
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+				   struct list_head **pos);
 ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 				char *buf, size_t nbytes, loff_t off);
 int rdtgroup_schemata_show(struct kernfs_open_file *of,
@@ -318,5 +343,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 int alloc_rmid(void);
 void free_rmid(u32 rmid);
 void rdt_get_mon_l3_config(struct rdt_resource *r);
+void mon_event_count(void *info);
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
 
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
new file mode 100644
index 0000000..0c8bca0
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -0,0 +1,332 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@...el.com>
+ *    Tony Luck <tony.luck@...el.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "intel_rdt.h"
+
+/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+	unsigned long bw;
+	int ret;
+
+	/*
+	 * Only linear delay values is supported for current Intel SKUs.
+	 */
+	if (!r->membw.delay_linear)
+		return false;
+
+	ret = kstrtoul(buf, 10, &bw);
+	if (ret)
+		return false;
+
+	if (bw < r->membw.min_bw || bw > r->default_ctrl)
+		return false;
+
+	*data = roundup(bw, (unsigned long)r->membw.bw_gran);
+	return true;
+}
+
+int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+{
+	unsigned long data;
+
+	if (d->have_new_ctrl)
+		return -EINVAL;
+
+	if (!bw_validate(buf, &data, r))
+		return -EINVAL;
+	d->new_ctrl = data;
+	d->have_new_ctrl = true;
+
+	return 0;
+}
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ *	Please note that all (and only) contiguous '1' combinations
+ *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+	unsigned long first_bit, zero_bit, val;
+	unsigned int cbm_len = r->cache.cbm_len;
+	int ret;
+
+	ret = kstrtoul(buf, 16, &val);
+	if (ret)
+		return false;
+
+	if (val == 0 || val > r->default_ctrl)
+		return false;
+
+	first_bit = find_first_bit(&val, cbm_len);
+	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+	if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)
+		return false;
+
+	if ((zero_bit - first_bit) < r->cache.min_cbm_bits)
+		return false;
+
+	*data = val;
+	return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
+{
+	unsigned long data;
+
+	if (d->have_new_ctrl)
+		return -EINVAL;
+
+	if (!cbm_validate(buf, &data, r))
+		return -EINVAL;
+	d->new_ctrl = data;
+	d->have_new_ctrl = true;
+
+	return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ *	id=mask
+ * separated by ";". The "id" is in decimal, and must match one of
+ * the "id"s for this resource.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+	char *dom = NULL, *id;
+	struct rdt_domain *d;
+	unsigned long dom_id;
+
+next:
+	if (!line || line[0] == '\0')
+		return 0;
+	dom = strsep(&line, ";");
+	id = strsep(&dom, "=");
+	if (!dom || kstrtoul(id, 10, &dom_id))
+		return -EINVAL;
+	dom = strim(dom);
+	list_for_each_entry(d, &r->domains, list) {
+		if (d->id == dom_id) {
+			if (r->parse_ctrlval(dom, r, d))
+				return -EINVAL;
+			goto next;
+		}
+	}
+	return -EINVAL;
+}
+
+static int update_domains(struct rdt_resource *r, int closid)
+{
+	struct msr_param msr_param;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	msr_param.low = closid;
+	msr_param.high = msr_param.low + 1;
+	msr_param.res = r;
+
+	list_for_each_entry(d, &r->domains, list) {
+		if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
+			cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+			d->ctrl_val[closid] = d->new_ctrl;
+		}
+	}
+	if (cpumask_empty(cpu_mask))
+		goto done;
+	cpu = get_cpu();
+	/* Update CBM on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_ctrl_update(&msr_param);
+	/* Update CBM on other cpus. */
+	smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
+	put_cpu();
+
+done:
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
+{
+	struct rdt_resource *r;
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		if (!strcmp(resname, r->name) && closid < r->num_closid)
+			return parse_line(tok, r);
+	}
+	return -EINVAL;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_domain *dom;
+	struct rdt_resource *r;
+	char *tok, *resname;
+	int closid, ret = 0;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+	buf[nbytes - 1] = '\0';
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		rdtgroup_kn_unlock(of->kn);
+		return -ENOENT;
+	}
+
+	closid = rdtgrp->closid;
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		list_for_each_entry(dom, &r->domains, list)
+			dom->have_new_ctrl = false;
+	}
+
+	while ((tok = strsep(&buf, "\n")) != NULL) {
+		resname = strim(strsep(&tok, ":"));
+		if (!tok) {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = rdtgroup_parse_resource(resname, tok, closid);
+		if (ret)
+			goto out;
+	}
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		ret = update_domains(r, closid);
+		if (ret)
+			goto out;
+	}
+
+out:
+	rdtgroup_kn_unlock(of->kn);
+	return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+	struct rdt_domain *dom;
+	bool sep = false;
+
+	seq_printf(s, "%*s:", max_name_width, r->name);
+	list_for_each_entry(dom, &r->domains, list) {
+		if (sep)
+			seq_puts(s, ";");
+		seq_printf(s, r->format_str, dom->id, max_data_width,
+			   dom->ctrl_val[closid]);
+		sep = true;
+	}
+	seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+			   struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	int closid, ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp) {
+		closid = rdtgrp->closid;
+		for_each_alloc_enabled_rdt_resource(r) {
+			if (closid < r->num_closid)
+				show_doms(s, r, closid);
+		}
+	} else {
+		ret = -ENOENT;
+	}
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+	struct kernfs_open_file *of = m->private;
+	u32 resid, evtid, domid;
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	union mon_data_bits md;
+	struct rdt_domain *d;
+	struct rmid_read rr;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+	md.priv = of->kn->priv;
+	resid = md.u.rid;
+	domid = md.u.domid;
+	evtid = md.u.evtid;
+
+	r = &rdt_resources_all[resid];
+	d = rdt_find_domain(r, domid, NULL);
+	if (!d) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/*
+	 * setup the parameters to send to the IPI to read the data.
+	 */
+	rr.rgrp = rdtgrp;
+	rr.evtid = evtid;
+	rr.val = 0;
+
+	smp_call_function_any(&d->cpu_mask, mon_event_count, &rr, 1);
+
+	if (rr.val & RMID_VAL_ERROR)
+		seq_puts(m, "Error\n");
+	else if (rr.val & RMID_VAL_UNAVAIL)
+		seq_puts(m, "Unavailable\n");
+	else
+		seq_printf(m, "%llu\n", rr.val * r->mon_scale);
+
+out:
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 624a0aa..cc252eb 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -204,6 +204,48 @@ void free_rmid(u32 rmid)
 		list_add_tail(&entry->list, &rmid_free_lru);
 }
 
+static bool __mon_event_count(u32 rmid, struct rmid_read *rr)
+{
+	u64 tval;
+
+	tval = __rmid_read(rmid, rr->evtid);
+	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
+		rr->val = tval;
+		return false;
+	}
+	switch (rr->evtid) {
+	case QOS_L3_OCCUP_EVENT_ID:
+		rr->val += tval;
+		return true;
+	default:
+		return false;
+	}
+}
+
+void mon_event_count(void *info)
+{
+	struct rdtgroup *rdtgrp, *entry;
+	struct rmid_read *rr = info;
+	struct list_head *llist;
+
+	rdtgrp = rr->rgrp;
+
+	if (!__mon_event_count(rdtgrp->rmid, rr))
+		return;
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	llist = &rdtgrp->crdtgrp_list;
+
+	if (rdtgrp->type == RDTCTRL_GROUP) {
+		list_for_each_entry(entry, llist, crdtgrp_list) {
+			if (!__mon_event_count(entry->rmid, rr))
+				return;
+		}
+	}
+}
+
 static int dom_data_init(struct rdt_resource *r)
 {
 	struct rmid_entry *entry = NULL;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index d32b781..9377bcd 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -152,6 +152,11 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
 	.seq_show		= rdtgroup_seqfile_show,
 };
 
+static struct kernfs_ops kf_mondata_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.seq_show		= rdtgroup_mondata_show,
+};
+
 static bool is_cpu_list(struct kernfs_open_file *of)
 {
 	struct rftype *rft = of->kn->priv;
@@ -1251,6 +1256,152 @@ static void rdt_kill_sb(struct super_block *sb)
 	.kill_sb = rdt_kill_sb,
 };
 
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+		       void *priv)
+{
+	struct kernfs_node *kn;
+	int ret = 0;
+
+	kn = __kernfs_create_file(parent_kn, name, 0444, 0,
+				  &kf_mondata_ops, priv, NULL, NULL);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret) {
+		kernfs_remove(kn);
+		return ret;
+	}
+
+	return ret;
+}
+
+static int get_rdt_resourceid(struct rdt_resource *r)
+{
+	if (r > (rdt_resources_all + RDT_NUM_RESOURCES - 1) ||
+	    r < rdt_resources_all ||
+	    ((r - rdt_resources_all) % sizeof(struct rdt_resource)))
+		return -EINVAL;
+
+	return ((r - rdt_resources_all) / sizeof(struct rdt_resource));
+}
+
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, int domid,
+				struct rdt_resource *r, struct rdtgroup *pr)
+{
+	union mon_data_bits priv;
+	struct kernfs_node *kn;
+	struct mon_evt *mevt;
+	char name[32];
+	int ret, rid;
+
+	rid = get_rdt_resourceid(r);
+	if (rid < 0)
+		return -EINVAL;
+
+	sprintf(name, "mon_%s_%02d", r->name, domid);
+	/* create the directory */
+	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, pr);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	/*
+	 * This extra ref will be put in kernfs_remove() and guarantees
+	 * that @rdtgrp->kn is always accessible.
+	 */
+	kernfs_get(kn);
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	if (WARN_ON(list_empty(&r->evt_list))) {
+		ret = -EPERM;
+		goto out_destroy;
+	}
+
+	priv.u.rid = rid;
+	priv.u.domid = domid;
+	list_for_each_entry(mevt, &r->evt_list, list) {
+		priv.u.evtid = mevt->evtid;
+		ret = mon_addfile(kn, mevt->name, priv.priv);
+		if (ret)
+			goto out_destroy;
+	}
+	kernfs_activate(kn);
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn);
+	return ret;
+}
+
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+				       struct rdt_resource *r,
+				       struct rdtgroup *pr)
+{
+	struct rdt_domain *dom;
+	int ret;
+
+	list_for_each_entry(dom, &r->domains, list) {
+		ret = mkdir_mondata_subdir(parent_kn, dom->id, r, pr);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * This creates a directory mon_data which holds one subdirectory
+ * per domain which contains the monitored data.
+ *
+ * mon_data has one directory for each domain whic are named
+ * mon_<domain_name>_<domain_id>. For ex: A mon_data with
+ * with L3 domain looks as below:
+ * ./mon_data:
+ * mon_L3_00
+ * mon_L3_01
+ * mon_L3_02
+ * ...
+ *
+ * Each domain directory has one file per event:
+ * ./mon_L3_00/:
+ * llc_occupancy
+ *
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn, struct rdtgroup *pr,
+			     struct kernfs_node **dest_kn)
+{
+	struct rdt_resource *r;
+	struct kernfs_node *kn;
+	int ret;
+
+	/*
+	 * Create the mon_data directory first.
+	 */
+	ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
+	if (ret)
+		return ret;
+
+	if (dest_kn)
+		*dest_kn = kn;
+
+	/*
+	 * Create the subdirectories for each domain. Note that all events
+	 * in a domain like L3 are grouped into a resource whose domain is L3
+	 */
+	for_each_mon_enabled_rdt_resource(r) {
+		ret = mkdir_mondata_subdir_alldom(kn, r, pr);
+		if (ret)
+			goto out_destroy;
+	}
+
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn);
+	return ret;
+}
 /*
  * Common code for ctrl_mon and monitor group mkdir.
  * The caller needs to unlock the global mutex upon success.
@@ -1307,6 +1458,10 @@ static int mkdir_rdt_common(struct kernfs_node *pkn, struct kernfs_node *prkn,
 		goto out_destroy;
 
 	if (rdt_mon_features) {
+		ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon_data_kn);
+		if (ret)
+			goto out_destroy;
+
 		ret = alloc_rmid();
 		if (ret < 0)
 			return ret;
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
deleted file mode 100644
index 952156c..0000000
--- a/arch/x86/kernel/cpu/intel_rdt_schemata.c
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Resource Director Technology(RDT)
- * - Cache Allocation code.
- *
- * Copyright (C) 2016 Intel Corporation
- *
- * Authors:
- *    Fenghua Yu <fenghua.yu@...el.com>
- *    Tony Luck <tony.luck@...el.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * More information about RDT be found in the Intel (R) x86 Architecture
- * Software Developer Manual June 2016, volume 3, section 17.17.
- */
-
-#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
-
-#include <linux/kernfs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include "intel_rdt.h"
-
-/*
- * Check whether MBA bandwidth percentage value is correct. The value is
- * checked against the minimum and max bandwidth values specified by the
- * hardware. The allocated bandwidth percentage is rounded to the next
- * control step available on the hardware.
- */
-static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
-{
-	unsigned long bw;
-	int ret;
-
-	/*
-	 * Only linear delay values is supported for current Intel SKUs.
-	 */
-	if (!r->membw.delay_linear)
-		return false;
-
-	ret = kstrtoul(buf, 10, &bw);
-	if (ret)
-		return false;
-
-	if (bw < r->membw.min_bw || bw > r->default_ctrl)
-		return false;
-
-	*data = roundup(bw, (unsigned long)r->membw.bw_gran);
-	return true;
-}
-
-int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
-{
-	unsigned long data;
-
-	if (d->have_new_ctrl)
-		return -EINVAL;
-
-	if (!bw_validate(buf, &data, r))
-		return -EINVAL;
-	d->new_ctrl = data;
-	d->have_new_ctrl = true;
-
-	return 0;
-}
-
-/*
- * Check whether a cache bit mask is valid. The SDM says:
- *	Please note that all (and only) contiguous '1' combinations
- *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
- * Additionally Haswell requires at least two bits set.
- */
-static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
-{
-	unsigned long first_bit, zero_bit, val;
-	unsigned int cbm_len = r->cache.cbm_len;
-	int ret;
-
-	ret = kstrtoul(buf, 16, &val);
-	if (ret)
-		return false;
-
-	if (val == 0 || val > r->default_ctrl)
-		return false;
-
-	first_bit = find_first_bit(&val, cbm_len);
-	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
-	if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)
-		return false;
-
-	if ((zero_bit - first_bit) < r->cache.min_cbm_bits)
-		return false;
-
-	*data = val;
-	return true;
-}
-
-/*
- * Read one cache bit mask (hex). Check that it is valid for the current
- * resource type.
- */
-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
-{
-	unsigned long data;
-
-	if (d->have_new_ctrl)
-		return -EINVAL;
-
-	if(!cbm_validate(buf, &data, r))
-		return -EINVAL;
-	d->new_ctrl = data;
-	d->have_new_ctrl = true;
-
-	return 0;
-}
-
-/*
- * For each domain in this resource we expect to find a series of:
- *	id=mask
- * separated by ";". The "id" is in decimal, and must match one of
- * the "id"s for this resource.
- */
-static int parse_line(char *line, struct rdt_resource *r)
-{
-	char *dom = NULL, *id;
-	struct rdt_domain *d;
-	unsigned long dom_id;
-
-next:
-	if (!line || line[0] == '\0')
-		return 0;
-	dom = strsep(&line, ";");
-	id = strsep(&dom, "=");
-	if (!dom || kstrtoul(id, 10, &dom_id))
-		return -EINVAL;
-	dom = strim(dom);
-	list_for_each_entry(d, &r->domains, list) {
-		if (d->id == dom_id) {
-			if (r->parse_ctrlval(dom, r, d))
-				return -EINVAL;
-			goto next;
-		}
-	}
-	return -EINVAL;
-}
-
-static int update_domains(struct rdt_resource *r, int closid)
-{
-	struct msr_param msr_param;
-	cpumask_var_t cpu_mask;
-	struct rdt_domain *d;
-	int cpu;
-
-	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
-		return -ENOMEM;
-
-	msr_param.low = closid;
-	msr_param.high = msr_param.low + 1;
-	msr_param.res = r;
-
-	list_for_each_entry(d, &r->domains, list) {
-		if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) {
-			cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
-			d->ctrl_val[closid] = d->new_ctrl;
-		}
-	}
-	if (cpumask_empty(cpu_mask))
-		goto done;
-	cpu = get_cpu();
-	/* Update CBM on this cpu if it's in cpu_mask. */
-	if (cpumask_test_cpu(cpu, cpu_mask))
-		rdt_ctrl_update(&msr_param);
-	/* Update CBM on other cpus. */
-	smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-	put_cpu();
-
-done:
-	free_cpumask_var(cpu_mask);
-
-	return 0;
-}
-
-static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
-{
-	struct rdt_resource *r;
-
-	for_each_alloc_enabled_rdt_resource(r) {
-		if (!strcmp(resname, r->name) && closid < r->num_closid)
-			return parse_line(tok, r);
-	}
-	return -EINVAL;
-}
-
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
-				char *buf, size_t nbytes, loff_t off)
-{
-	struct rdtgroup *rdtgrp;
-	struct rdt_domain *dom;
-	struct rdt_resource *r;
-	char *tok, *resname;
-	int closid, ret = 0;
-
-	/* Valid input requires a trailing newline */
-	if (nbytes == 0 || buf[nbytes - 1] != '\n')
-		return -EINVAL;
-	buf[nbytes - 1] = '\0';
-
-	rdtgrp = rdtgroup_kn_lock_live(of->kn);
-	if (!rdtgrp) {
-		rdtgroup_kn_unlock(of->kn);
-		return -ENOENT;
-	}
-
-	closid = rdtgrp->closid;
-
-	for_each_alloc_enabled_rdt_resource(r) {
-		list_for_each_entry(dom, &r->domains, list)
-			dom->have_new_ctrl = false;
-	}
-
-	while ((tok = strsep(&buf, "\n")) != NULL) {
-		resname = strim(strsep(&tok, ":"));
-		if (!tok) {
-			ret = -EINVAL;
-			goto out;
-		}
-		ret = rdtgroup_parse_resource(resname, tok, closid);
-		if (ret)
-			goto out;
-	}
-
-	for_each_alloc_enabled_rdt_resource(r) {
-		ret = update_domains(r, closid);
-		if (ret)
-			goto out;
-	}
-
-out:
-	rdtgroup_kn_unlock(of->kn);
-	return ret ?: nbytes;
-}
-
-static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
-{
-	struct rdt_domain *dom;
-	bool sep = false;
-
-	seq_printf(s, "%*s:", max_name_width, r->name);
-	list_for_each_entry(dom, &r->domains, list) {
-		if (sep)
-			seq_puts(s, ";");
-		seq_printf(s, r->format_str, dom->id, max_data_width,
-			   dom->ctrl_val[closid]);
-		sep = true;
-	}
-	seq_puts(s, "\n");
-}
-
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
-			   struct seq_file *s, void *v)
-{
-	struct rdtgroup *rdtgrp;
-	struct rdt_resource *r;
-	int closid, ret = 0;
-
-	rdtgrp = rdtgroup_kn_lock_live(of->kn);
-	if (rdtgrp) {
-		closid = rdtgrp->closid;
-		for_each_alloc_enabled_rdt_resource(r) {
-			if (closid < r->num_closid)
-				show_doms(s, r, closid);
-		}
-	} else {
-		ret = -ENOENT;
-	}
-	rdtgroup_kn_unlock(of->kn);
-	return ret;
-}
-- 
1.9.1