Message-ID: <99ac181d-d93c-4b11-84aa-48953d9eb5a3@amd.com>
Date: Mon, 15 Apr 2024 15:44:48 -0500
From: "Moger, Babu" <babu.moger@....com>
To: James Morse <james.morse@....com>, x86@...nel.org,
 linux-kernel@...r.kernel.org
Cc: Fenghua Yu <fenghua.yu@...el.com>,
 Reinette Chatre <reinette.chatre@...el.com>,
 Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...hat.com>,
 Borislav Petkov <bp@...en8.de>, H Peter Anvin <hpa@...or.com>,
 shameerali.kolothum.thodi@...wei.com,
 D Scott Phillips OS <scott@...amperecomputing.com>,
 carl@...amperecomputing.com, lcherian@...vell.com,
 bobo.shaobowang@...wei.com, tan.shaopeng@...itsu.com,
 baolin.wang@...ux.alibaba.com, Jamie Iles <quic_jiles@...cinc.com>,
 Xin Hao <xhao@...ux.alibaba.com>, peternewman@...gle.com,
 dfustini@...libre.com, amitsinght@...vell.com,
 David Hildenbrand <david@...hat.com>, Rex Nie <rex.nie@...uarmicro.com>,
 Dave Martin <dave.martin@....com>
Subject: Re: [PATCH v1 31/31] x86/resctrl: Move the resctrl filesystem code to
 /fs/resctrl

James/Dave,
This is a huge change. Can this be broken into multiple patches?
It will be a major task if we need to bisect to pinpoint any issues later.
Thanks
Babu

On 3/21/24 11:51, James Morse wrote:
> resctrl is Linux's de facto interface for managing cache and bandwidth
> policies for groups of tasks.
> 
> To allow other architectures to make use of this pseudo filesystem,
> move it to live in /fs/resctrl instead of /arch/x86.
> 
> This move leaves behind the parts of resctrl that form the architecture
> interface for x86.
> 
> Signed-off-by: James Morse <james.morse@....com>
> ---
> Discussion needed on how/when to merge this, as it would conflict with
> all outstanding series. It's probably worth deferring to some opportune
> time, but is included here for illustration.
> ---
>  arch/x86/kernel/cpu/resctrl/core.c        |   15 -
>  arch/x86/kernel/cpu/resctrl/ctrlmondata.c |  505 ---
>  arch/x86/kernel/cpu/resctrl/internal.h    |  310 --
>  arch/x86/kernel/cpu/resctrl/monitor.c     |  821 -----
>  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 1093 ------
>  arch/x86/kernel/cpu/resctrl/rdtgroup.c    | 3994 --------------------
>  fs/resctrl/ctrlmondata.c                  |  527 +++
>  fs/resctrl/internal.h                     |  340 ++
>  fs/resctrl/monitor.c                      |  843 +++++
>  fs/resctrl/psuedo_lock.c                  | 1122 ++++++
>  fs/resctrl/rdtgroup.c                     | 4013 +++++++++++++++++++++
>  11 files changed, 6845 insertions(+), 6738 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> index c0fb2e22e110..8ddfebd5f008 100644
> --- a/arch/x86/kernel/cpu/resctrl/core.c
> +++ b/arch/x86/kernel/cpu/resctrl/core.c
> @@ -164,21 +164,6 @@ static inline void cache_alloc_hsw_probe(void)
>  	rdt_alloc_capable = true;
>  }
>  
> -bool is_mba_sc(struct rdt_resource *r)
> -{
> -	if (!r)
> -		r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> -
> -	/*
> -	 * The software controller support is only applicable to MBA resource.
> -	 * Make sure to check for resource type.
> -	 */
> -	if (r->rid != RDT_RESOURCE_MBA)
> -		return false;
> -
> -	return r->membw.mba_sc;
> -}
> -
>  /*
>   * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
>   * exposed to user interface and the h/w understandable delay values.
> diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> index b4627ae19291..c5c3eaea27b6 100644
> --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> @@ -23,260 +23,6 @@
>  
>  #include "internal.h"
>  
> -struct rdt_parse_data {
> -	struct rdtgroup		*rdtgrp;
> -	char			*buf;
> -};
> -
> -typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
> -			       struct resctrl_schema *s,
> -			       struct rdt_domain *d);
> -
> -/*
> - * Check whether MBA bandwidth percentage value is correct. The value is
> - * checked against the minimum and max bandwidth values specified by the
> - * hardware. The allocated bandwidth percentage is rounded to the next
> - * control step available on the hardware.
> - */
> -static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
> -{
> -	unsigned long bw;
> -	int ret;
> -
> -	/*
> -	 * Only linear delay values is supported for current Intel SKUs.
> -	 */
> -	if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
> -		rdt_last_cmd_puts("No support for non-linear MB domains\n");
> -		return false;
> -	}
> -
> -	ret = kstrtoul(buf, 10, &bw);
> -	if (ret) {
> -		rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
> -		return false;
> -	}
> -
> -	if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
> -	    !is_mba_sc(r)) {
> -		rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
> -				    r->membw.min_bw, r->default_ctrl);
> -		return false;
> -	}
> -
> -	*data = roundup(bw, (unsigned long)r->membw.bw_gran);
> -	return true;
> -}
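As a worked example of the validation above, with illustrative Intel-style values of min_bw=10, bw_gran=10 and default_ctrl=100: a write of "25" is in range and gets rounded up to the next control step, 30, while "5" is rejected as below min_bw unless the mba_sc software controller is enabled (in which case the range check is skipped).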
> -
> -static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
> -		    struct rdt_domain *d)
> -{
> -	struct resctrl_staged_config *cfg;
> -	u32 closid = data->rdtgrp->closid;
> -	struct rdt_resource *r = s->res;
> -	unsigned long bw_val;
> -
> -	cfg = &d->staged_config[s->conf_type];
> -	if (cfg->have_new_ctrl) {
> -		rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
> -		return -EINVAL;
> -	}
> -
> -	if (!bw_validate(data->buf, &bw_val, r))
> -		return -EINVAL;
> -
> -	if (is_mba_sc(r)) {
> -		d->mbps_val[closid] = bw_val;
> -		return 0;
> -	}
> -
> -	cfg->new_ctrl = bw_val;
> -	cfg->have_new_ctrl = true;
> -
> -	return 0;
> -}
> -
> -/*
> - * Check whether a cache bit mask is valid.
> - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
> - *   - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
> - *   - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
> - *
> - * Haswell does not support a non-contiguous 1s value and additionally
> - * requires at least two bits set.
> - * AMD allows non-contiguous bitmasks.
> - */
> -static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
> -{
> -	unsigned long first_bit, zero_bit, val;
> -	unsigned int cbm_len = r->cache.cbm_len;
> -	int ret;
> -
> -	ret = kstrtoul(buf, 16, &val);
> -	if (ret) {
> -		rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
> -		return false;
> -	}
> -
> -	if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
> -		rdt_last_cmd_puts("Mask out of range\n");
> -		return false;
> -	}
> -
> -	first_bit = find_first_bit(&val, cbm_len);
> -	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
> -
> -	/* Are non-contiguous bitmasks allowed? */
> -	if (!r->cache.arch_has_sparse_bitmasks &&
> -	    (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
> -		rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
> -		return false;
> -	}
> -
> -	if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
> -		rdt_last_cmd_printf("Need at least %d bits in the mask\n",
> -				    r->cache.min_cbm_bits);
> -		return false;
> -	}
> -
> -	*data = val;
> -	return true;
> -}
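Illustrative examples for the checks above, assuming a hypothetical cbm_len of 16 and min_cbm_bits of 1: "00f0" passes; "0f0f" is rejected unless arch_has_sparse_bitmasks is set, because its 1-bits are not contiguous; on Haswell, where min_cbm_bits is 2, a single-bit mask such as "8000" is also rejected.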
> -
> -/*
> - * Read one cache bit mask (hex). Check that it is valid for the current
> - * resource type.
> - */
> -static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
> -		     struct rdt_domain *d)
> -{
> -	struct rdtgroup *rdtgrp = data->rdtgrp;
> -	struct resctrl_staged_config *cfg;
> -	struct rdt_resource *r = s->res;
> -	u32 cbm_val;
> -
> -	cfg = &d->staged_config[s->conf_type];
> -	if (cfg->have_new_ctrl) {
> -		rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
> -		return -EINVAL;
> -	}
> -
> -	/*
> -	 * Cannot set up more than one pseudo-locked region in a cache
> -	 * hierarchy.
> -	 */
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
> -	    rdtgroup_pseudo_locked_in_hierarchy(d)) {
> -		rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
> -		return -EINVAL;
> -	}
> -
> -	if (!cbm_validate(data->buf, &cbm_val, r))
> -		return -EINVAL;
> -
> -	if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
> -	     rdtgrp->mode == RDT_MODE_SHAREABLE) &&
> -	    rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
> -		rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
> -		return -EINVAL;
> -	}
> -
> -	/*
> -	 * The CBM may not overlap with the CBM of another closid if
> -	 * either is exclusive.
> -	 */
> -	if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
> -		rdt_last_cmd_puts("Overlaps with exclusive group\n");
> -		return -EINVAL;
> -	}
> -
> -	if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
> -		if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
> -		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -			rdt_last_cmd_puts("Overlaps with other group\n");
> -			return -EINVAL;
> -		}
> -	}
> -
> -	cfg->new_ctrl = cbm_val;
> -	cfg->have_new_ctrl = true;
> -
> -	return 0;
> -}
> -
> -static ctrlval_parser_t *get_parser(struct rdt_resource *res)
> -{
> -	if (res->fflags & RFTYPE_RES_CACHE)
> -		return &parse_cbm;
> -	else
> -		return &parse_bw;
> -}
> -
> -/*
> - * For each domain in this resource we expect to find a series of:
> - *	id=mask
> - * separated by ";". The "id" is in decimal, and must match one of
> - * the "id"s for this resource.
> - */
> -static int parse_line(char *line, struct resctrl_schema *s,
> -		      struct rdtgroup *rdtgrp)
> -{
> -	ctrlval_parser_t *parse_ctrlval = get_parser(s->res);
> -	enum resctrl_conf_type t = s->conf_type;
> -	struct resctrl_staged_config *cfg;
> -	struct rdt_resource *r = s->res;
> -	struct rdt_parse_data data;
> -	char *dom = NULL, *id;
> -	struct rdt_domain *d;
> -	unsigned long dom_id;
> -
> -	/* Walking r->domains, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
> -	    (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
> -		rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
> -		return -EINVAL;
> -	}
> -
> -next:
> -	if (!line || line[0] == '\0')
> -		return 0;
> -	dom = strsep(&line, ";");
> -	id = strsep(&dom, "=");
> -	if (!dom || kstrtoul(id, 10, &dom_id)) {
> -		rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
> -		return -EINVAL;
> -	}
> -	dom = strim(dom);
> -	list_for_each_entry(d, &r->domains, list) {
> -		if (d->id == dom_id) {
> -			data.buf = dom;
> -			data.rdtgrp = rdtgrp;
> -			if (parse_ctrlval(&data, s, d))
> -				return -EINVAL;
> -			if (rdtgrp->mode ==  RDT_MODE_PSEUDO_LOCKSETUP) {
> -				cfg = &d->staged_config[t];
> -				/*
> -				 * In pseudo-locking setup mode and just
> -				 * parsed a valid CBM that should be
> -				 * pseudo-locked. Only one locked region per
> -				 * resource group and domain so just do
> -				 * the required initialization for single
> -				 * region and return.
> -				 */
> -				rdtgrp->plr->s = s;
> -				rdtgrp->plr->d = d;
> -				rdtgrp->plr->cbm = cfg->new_ctrl;
> -				d->plr = rdtgrp->plr;
> -				return 0;
> -			}
> -			goto next;
> -		}
> -	}
> -	return -EINVAL;
> -}
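For reference, the line handled here is the per-resource part of a schemata write: a hypothetical example for an L3 resource with two cache domains would be "0=ffff;1=00ff" (decimal domain ids mapped to hex masks), or "0=50;1=70" for an MB resource taking bandwidth percentages.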
> -
>  static bool apply_config(struct rdt_hw_domain *hw_dom,
>  			 struct resctrl_staged_config *cfg, u32 idx,
>  			 cpumask_var_t cpu_mask)
> @@ -365,100 +111,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
>  	return 0;
>  }
>  
> -static int rdtgroup_parse_resource(char *resname, char *tok,
> -				   struct rdtgroup *rdtgrp)
> -{
> -	struct resctrl_schema *s;
> -
> -	list_for_each_entry(s, &resctrl_schema_all, list) {
> -		if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
> -			return parse_line(tok, s, rdtgrp);
> -	}
> -	rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
> -	return -EINVAL;
> -}
> -
> -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
> -				char *buf, size_t nbytes, loff_t off)
> -{
> -	struct resctrl_schema *s;
> -	struct rdtgroup *rdtgrp;
> -	struct rdt_resource *r;
> -	char *tok, *resname;
> -	int ret = 0;
> -
> -	/* Valid input requires a trailing newline */
> -	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> -		return -EINVAL;
> -	buf[nbytes - 1] = '\0';
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (!rdtgrp) {
> -		rdtgroup_kn_unlock(of->kn);
> -		return -ENOENT;
> -	}
> -	rdt_last_cmd_clear();
> -
> -	/*
> -	 * No changes to pseudo-locked region allowed. It has to be removed
> -	 * and re-created instead.
> -	 */
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> -		ret = -EINVAL;
> -		rdt_last_cmd_puts("Resource group is pseudo-locked\n");
> -		goto out;
> -	}
> -
> -	rdt_staged_configs_clear();
> -
> -	while ((tok = strsep(&buf, "\n")) != NULL) {
> -		resname = strim(strsep(&tok, ":"));
> -		if (!tok) {
> -			rdt_last_cmd_puts("Missing ':'\n");
> -			ret = -EINVAL;
> -			goto out;
> -		}
> -		if (tok[0] == '\0') {
> -			rdt_last_cmd_printf("Missing '%s' value\n", resname);
> -			ret = -EINVAL;
> -			goto out;
> -		}
> -		ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
> -		if (ret)
> -			goto out;
> -	}
> -
> -	list_for_each_entry(s, &resctrl_schema_all, list) {
> -		r = s->res;
> -
> -		/*
> -		 * Writes to mba_sc resources update the software controller,
> -		 * not the control MSR.
> -		 */
> -		if (is_mba_sc(r))
> -			continue;
> -
> -		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
> -		if (ret)
> -			goto out;
> -	}
> -
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -		/*
> -		 * If pseudo-locking fails we keep the resource group in
> -		 * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
> -		 * active and updated for just the domain the pseudo-locked
> -		 * region was requested for.
> -		 */
> -		ret = rdtgroup_pseudo_lock_create(rdtgrp);
> -	}
> -
> -out:
> -	rdt_staged_configs_clear();
> -	rdtgroup_kn_unlock(of->kn);
> -	return ret ?: nbytes;
> -}
> -
>  u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
>  			    u32 closid, enum resctrl_conf_type type)
>  {
> @@ -467,160 +119,3 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
>  
>  	return hw_dom->ctrl_val[idx];
>  }
> -
> -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
> -{
> -	struct rdt_resource *r = schema->res;
> -	struct rdt_domain *dom;
> -	bool sep = false;
> -	u32 ctrl_val;
> -
> -	/* Walking r->domains, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -	seq_printf(s, "%*s:", max_name_width, schema->name);
> -	list_for_each_entry(dom, &r->domains, list) {
> -		if (sep)
> -			seq_puts(s, ";");
> -
> -		if (is_mba_sc(r))
> -			ctrl_val = dom->mbps_val[closid];
> -		else
> -			ctrl_val = resctrl_arch_get_config(r, dom, closid,
> -							   schema->conf_type);
> -
> -		seq_printf(s, r->format_str, dom->id, max_data_width,
> -			   ctrl_val);
> -		sep = true;
> -	}
> -	seq_puts(s, "\n");
> -}
> -
> -int rdtgroup_schemata_show(struct kernfs_open_file *of,
> -			   struct seq_file *s, void *v)
> -{
> -	struct resctrl_schema *schema;
> -	struct rdtgroup *rdtgrp;
> -	int ret = 0;
> -	u32 closid;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (rdtgrp) {
> -		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -			list_for_each_entry(schema, &resctrl_schema_all, list) {
> -				seq_printf(s, "%s:uninitialized\n", schema->name);
> -			}
> -		} else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> -			if (!rdtgrp->plr->d) {
> -				rdt_last_cmd_clear();
> -				rdt_last_cmd_puts("Cache domain offline\n");
> -				ret = -ENODEV;
> -			} else {
> -				seq_printf(s, "%s:%d=%x\n",
> -					   rdtgrp->plr->s->res->name,
> -					   rdtgrp->plr->d->id,
> -					   rdtgrp->plr->cbm);
> -			}
> -		} else {
> -			closid = rdtgrp->closid;
> -			list_for_each_entry(schema, &resctrl_schema_all, list) {
> -				if (closid < schema->num_closid)
> -					show_doms(s, schema, closid);
> -			}
> -		}
> -	} else {
> -		ret = -ENOENT;
> -	}
> -	rdtgroup_kn_unlock(of->kn);
> -	return ret;
> -}
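Reading the schemata file produces the matching output format, one line per schema, e.g. "L3:0=ffff;1=00ff" (values illustrative); a group still in pseudo-locksetup mode reports "<resource>:uninitialized" as shown above.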
> -
> -static int smp_mon_event_count(void *arg)
> -{
> -	mon_event_count(arg);
> -
> -	return 0;
> -}
> -
> -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> -		    struct rdt_domain *d, struct rdtgroup *rdtgrp,
> -		    int evtid, int first)
> -{
> -	int cpu;
> -
> -	/* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -	/*
> -	 * Setup the parameters to pass to mon_event_count() to read the data.
> -	 */
> -	rr->rgrp = rdtgrp;
> -	rr->evtid = evtid;
> -	rr->r = r;
> -	rr->d = d;
> -	rr->val = 0;
> -	rr->first = first;
> -	rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
> -	if (IS_ERR(rr->arch_mon_ctx)) {
> -		rr->err = -EINVAL;
> -		return;
> -	}
> -
> -	cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU);
> -
> -	/*
> -	 * cpumask_any_housekeeping() prefers housekeeping CPUs, but
> -	 * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
> -	 * MPAM's resctrl_arch_rmid_read() is unable to read the
> -	 * counters on some platforms if its called in IRQ context.
> -	 */
> -	if (tick_nohz_full_cpu(cpu))
> -		smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
> -	else
> -		smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
> -
> -	resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
> -}
> -
> -int rdtgroup_mondata_show(struct seq_file *m, void *arg)
> -{
> -	struct kernfs_open_file *of = m->private;
> -	u32 resid, evtid, domid;
> -	struct rdtgroup *rdtgrp;
> -	struct rdt_resource *r;
> -	union mon_data_bits md;
> -	struct rdt_domain *d;
> -	struct rmid_read rr;
> -	int ret = 0;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (!rdtgrp) {
> -		ret = -ENOENT;
> -		goto out;
> -	}
> -
> -	md.priv = of->kn->priv;
> -	resid = md.u.rid;
> -	domid = md.u.domid;
> -	evtid = md.u.evtid;
> -
> -	r = resctrl_arch_get_resource(resid);
> -	d = resctrl_arch_find_domain(r, domid);
> -	if (IS_ERR_OR_NULL(d)) {
> -		ret = -ENOENT;
> -		goto out;
> -	}
> -
> -	mon_event_read(&rr, r, d, rdtgrp, evtid, false);
> -
> -	if (rr.err == -EIO)
> -		seq_puts(m, "Error\n");
> -	else if (rr.err == -EINVAL)
> -		seq_puts(m, "Unavailable\n");
> -	else
> -		seq_printf(m, "%llu\n", rr.val);
> -
> -out:
> -	rdtgroup_kn_unlock(of->kn);
> -	return ret;
> -}
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index 0f7e3f10941b..bf3538992667 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -26,227 +26,6 @@
>   */
>  #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
>  
> -/**
> - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
> - *			        aren't marked nohz_full
> - * @mask:	The mask to pick a CPU from.
> - * @exclude_cpu:The CPU to avoid picking.
> - *
> - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
> - * CPUs that don't use nohz_full, these are preferred. Pass
> - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
> - *
> - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
> - */
> -static inline unsigned int
> -cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
> -{
> -	unsigned int cpu, hk_cpu;
> -
> -	if (exclude_cpu == RESCTRL_PICK_ANY_CPU)
> -		cpu = cpumask_any(mask);
> -	else
> -		cpu = cpumask_any_but(mask, exclude_cpu);
> -
> -	if (!IS_ENABLED(CONFIG_NO_HZ_FULL))
> -		return cpu;
> -
> -	/* If the CPU picked isn't marked nohz_full nothing more needs doing. */
> -	if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu))
> -		return cpu;
> -
> -	/* Try to find a CPU that isn't nohz_full to use in preference */
> -	hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
> -	if (hk_cpu == exclude_cpu)
> -		hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);
> -
> -	if (hk_cpu < nr_cpu_ids)
> -		cpu = hk_cpu;
> -
> -	return cpu;
> -}
> -
> -struct rdt_fs_context {
> -	struct kernfs_fs_context	kfc;
> -	bool				enable_cdpl2;
> -	bool				enable_cdpl3;
> -	bool				enable_mba_mbps;
> -	bool				enable_debug;
> -};
> -
> -static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
> -{
> -	struct kernfs_fs_context *kfc = fc->fs_private;
> -
> -	return container_of(kfc, struct rdt_fs_context, kfc);
> -}
> -
> -/**
> - * struct mon_evt - Entry in the event list of a resource
> - * @evtid:		event id
> - * @name:		name of the event
> - * @configurable:	true if the event is configurable
> - * @list:		entry in &rdt_resource->evt_list
> - */
> -struct mon_evt {
> -	enum resctrl_event_id	evtid;
> -	char			*name;
> -	bool			configurable;
> -	struct list_head	list;
> -};
> -
> -/**
> - * union mon_data_bits - Monitoring details for each event file
> - * @priv:              Used to store monitoring event data in @u
> - *                     as kernfs private data
> - * @rid:               Resource id associated with the event file
> - * @evtid:             Event id associated with the event file
> - * @domid:             The domain to which the event file belongs
> - * @u:                 Name of the bit fields struct
> - */
> -union mon_data_bits {
> -	void *priv;
> -	struct {
> -		unsigned int rid		: 10;
> -		enum resctrl_event_id evtid	: 8;
> -		unsigned int domid		: 14;
> -	} u;
> -};
> -
> -struct rmid_read {
> -	struct rdtgroup		*rgrp;
> -	struct rdt_resource	*r;
> -	struct rdt_domain	*d;
> -	enum resctrl_event_id	evtid;
> -	bool			first;
> -	int			err;
> -	u64			val;
> -	void			*arch_mon_ctx;
> -};
> -
> -extern struct list_head resctrl_schema_all;
> -extern bool resctrl_mounted;
> -
> -enum rdt_group_type {
> -	RDTCTRL_GROUP = 0,
> -	RDTMON_GROUP,
> -	RDT_NUM_GROUP,
> -};
> -
> -/**
> - * enum rdtgrp_mode - Mode of a RDT resource group
> - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
> - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
> - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
> - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
> - *                          allowed AND the allocations are Cache Pseudo-Locked
> - * @RDT_NUM_MODES: Total number of modes
> - *
> - * The mode of a resource group enables control over the allowed overlap
> - * between allocations associated with different resource groups (classes
> - * of service). User is able to modify the mode of a resource group by
> - * writing to the "mode" resctrl file associated with the resource group.
> - *
> - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
> - * writing the appropriate text to the "mode" file. A resource group enters
> - * "pseudo-locked" mode after the schemata is written while the resource
> - * group is in "pseudo-locksetup" mode.
> - */
> -enum rdtgrp_mode {
> -	RDT_MODE_SHAREABLE = 0,
> -	RDT_MODE_EXCLUSIVE,
> -	RDT_MODE_PSEUDO_LOCKSETUP,
> -	RDT_MODE_PSEUDO_LOCKED,
> -
> -	/* Must be last */
> -	RDT_NUM_MODES,
> -};
> -
> -/**
> - * struct mongroup - store mon group's data in resctrl fs.
> - * @mon_data_kn:		kernfs node for the mon_data directory
> - * @parent:			parent rdtgrp
> - * @crdtgrp_list:		child rdtgroup node list
> - * @rmid:			rmid for this rdtgroup
> - */
> -struct mongroup {
> -	struct kernfs_node	*mon_data_kn;
> -	struct rdtgroup		*parent;
> -	struct list_head	crdtgrp_list;
> -	u32			rmid;
> -};
> -
> -/**
> - * struct rdtgroup - store rdtgroup's data in resctrl file system.
> - * @kn:				kernfs node
> - * @rdtgroup_list:		linked list for all rdtgroups
> - * @closid:			closid for this rdtgroup
> - * @cpu_mask:			CPUs assigned to this rdtgroup
> - * @flags:			status bits
> - * @waitcount:			how many cpus expect to find this
> - *				group when they acquire rdtgroup_mutex
> - * @type:			indicates type of this rdtgroup - either
> - *				monitor only or ctrl_mon group
> - * @mon:			mongroup related data
> - * @mode:			mode of resource group
> - * @plr:			pseudo-locked region
> - */
> -struct rdtgroup {
> -	struct kernfs_node		*kn;
> -	struct list_head		rdtgroup_list;
> -	u32				closid;
> -	struct cpumask			cpu_mask;
> -	int				flags;
> -	atomic_t			waitcount;
> -	enum rdt_group_type		type;
> -	struct mongroup			mon;
> -	enum rdtgrp_mode		mode;
> -	struct pseudo_lock_region	*plr;
> -};
> -
> -/* List of all resource groups */
> -extern struct list_head rdt_all_groups;
> -
> -extern int max_name_width, max_data_width;
> -
> -/**
> - * struct rftype - describe each file in the resctrl file system
> - * @name:	File name
> - * @mode:	Access mode
> - * @kf_ops:	File operations
> - * @flags:	File specific RFTYPE_FLAGS_* flags
> - * @fflags:	File specific RFTYPE_* flags
> - * @seq_show:	Show content of the file
> - * @write:	Write to the file
> - */
> -struct rftype {
> -	char			*name;
> -	umode_t			mode;
> -	const struct kernfs_ops	*kf_ops;
> -	unsigned long		flags;
> -	unsigned long		fflags;
> -
> -	int (*seq_show)(struct kernfs_open_file *of,
> -			struct seq_file *sf, void *v);
> -	/*
> -	 * write() is the generic write callback which maps directly to
> -	 * kernfs write operation and overrides all other operations.
> -	 * Maximum write size is determined by ->max_write_len.
> -	 */
> -	ssize_t (*write)(struct kernfs_open_file *of,
> -			 char *buf, size_t nbytes, loff_t off);
> -};
> -
> -/**
> - * struct mbm_state - status for each MBM counter in each domain
> - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
> - * @prev_bw:	The most recent bandwidth in MBps
> - */
> -struct mbm_state {
> -	u64	prev_bw_bytes;
> -	u32	prev_bw;
> -};
> -
>  /**
>   * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
>   *			   return value.
> @@ -327,11 +106,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
>  	return container_of(r, struct rdt_hw_resource, r_resctrl);
>  }
>  
> -extern struct mutex rdtgroup_mutex;
> -
>  extern struct rdt_hw_resource rdt_resources_all[];
> -extern struct rdtgroup rdtgroup_default;
> -extern struct dentry *debugfs_resctrl;
>  
>  static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res)
>  {
> @@ -395,95 +170,10 @@ union cpuid_0x10_x_edx {
>  	unsigned int full;
>  };
>  
> -void rdt_last_cmd_clear(void);
> -void rdt_last_cmd_puts(const char *s);
> -__printf(1, 2)
> -void rdt_last_cmd_printf(const char *fmt, ...);
> -
>  void rdt_ctrl_update(void *arg);
> -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
> -void rdtgroup_kn_unlock(struct kernfs_node *kn);
> -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
> -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
> -			     umode_t mask);
> -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
> -				char *buf, size_t nbytes, loff_t off);
> -int rdtgroup_schemata_show(struct kernfs_open_file *of,
> -			   struct seq_file *s, void *v);
> -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
> -			   unsigned long cbm, int closid, bool exclusive);
> -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
> -				  unsigned long cbm);
> -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
> -int rdtgroup_tasks_assigned(struct rdtgroup *r);
> -int closids_supported(void);
> -void closid_free(int closid);
> -int alloc_rmid(u32 closid);
> -void free_rmid(u32 closid, u32 rmid);
>  int rdt_get_mon_l3_config(struct rdt_resource *r);
> -void resctrl_mon_resource_exit(void);
>  bool rdt_cpu_has(int flag);
> -void mon_event_count(void *info);
> -int rdtgroup_mondata_show(struct seq_file *m, void *arg);
> -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> -		    struct rdt_domain *d, struct rdtgroup *rdtgrp,
> -		    int evtid, int first);
> -int resctrl_mon_resource_init(void);
> -void mbm_setup_overflow_handler(struct rdt_domain *dom,
> -				unsigned long delay_ms,
> -				int exclude_cpu);
> -void mbm_handle_overflow(struct work_struct *work);
>  void __init intel_rdt_mbm_apply_quirk(void);
> -bool is_mba_sc(struct rdt_resource *r);
> -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
> -			     int exclude_cpu);
> -void cqm_handle_limbo(struct work_struct *work);
> -bool has_busy_rmid(struct rdt_domain *d);
> -void __check_limbo(struct rdt_domain *d, bool force_free);
>  void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
> -void mbm_config_rftype_init(const char *config);
> -void rdt_staged_configs_clear(void);
> -bool closid_allocated(unsigned int closid);
> -int resctrl_find_cleanest_closid(void);
> -
> -#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
> -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
> -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
> -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
> -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
> -int rdt_pseudo_lock_init(void);
> -void rdt_pseudo_lock_release(void);
> -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
> -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
> -#else
> -static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
> -{
> -	return -EOPNOTSUPP;
> -}
> -
> -static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
> -{
> -	return -EOPNOTSUPP;
> -}
> -
> -static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
> -{
> -	return false;
> -}
> -
> -static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
> -{
> -	return false;
> -}
> -
> -static inline int rdt_pseudo_lock_init(void) { return 0; }
> -static inline void rdt_pseudo_lock_release(void) { }
> -static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
> -{
> -	return -EOPNOTSUPP;
> -}
> -
> -static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
> -#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
>  
>  #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 7e6fca138cb7..02fb9d87479a 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -25,53 +25,6 @@
>  
>  #include "internal.h"
>  
> -/**
> - * struct rmid_entry - dirty tracking for all RMID.
> - * @closid:	The CLOSID for this entry.
> - * @rmid:	The RMID for this entry.
> - * @busy:	The number of domains with cached data using this RMID.
> - * @list:	Member of the rmid_free_lru list when busy == 0.
> - *
> - * Depending on the architecture the correct monitor is accessed using
> - * both @closid and @rmid, or @rmid only.
> - *
> - * Take the rdtgroup_mutex when accessing.
> - */
> -struct rmid_entry {
> -	u32				closid;
> -	u32				rmid;
> -	int				busy;
> -	struct list_head		list;
> -};
> -
> -/*
> - * @rmid_free_lru - A least recently used list of free RMIDs
> - *     These RMIDs are guaranteed to have an occupancy less than the
> - *     threshold occupancy
> - */
> -static LIST_HEAD(rmid_free_lru);
> -
> -/*
> - * @closid_num_dirty_rmid    The number of dirty RMID each CLOSID has.
> - *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
> - *     Indexed by CLOSID. Protected by rdtgroup_mutex.
> - */
> -static u32 *closid_num_dirty_rmid;
> -
> -/*
> - * @rmid_limbo_count - count of currently unused but (potentially)
> - *     dirty RMIDs.
> - *     This counts RMIDs that no one is currently using but that
> - *     may have a occupancy value > resctrl_rmid_realloc_threshold. User can
> - *     change the threshold occupancy value.
> - */
> -static unsigned int rmid_limbo_count;
> -
> -/*
> - * @rmid_entry - The entry in the limbo and free lists.
> - */
> -static struct rmid_entry	*rmid_ptrs;
> -
>  /*
>   * Global boolean for rdt_monitor which is true if any
>   * resource monitoring is enabled.
> @@ -83,17 +36,6 @@ bool rdt_mon_capable;
>   */
>  unsigned int rdt_mon_features;
>  
> -/*
> - * This is the threshold cache occupancy in bytes at which we will consider an
> - * RMID available for re-allocation.
> - */
> -unsigned int resctrl_rmid_realloc_threshold;
> -
> -/*
> - * This is the maximum value for the reallocation threshold, in bytes.
> - */
> -unsigned int resctrl_rmid_realloc_limit;
> -
>  #define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))
>  
>  /*
> @@ -157,33 +99,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
>  	return val;
>  }
>  
> -/*
> - * x86 and arm64 differ in their handling of monitoring.
> - * x86's RMID are independent numbers, there is only one source of traffic
> - * with an RMID value of '1'.
> - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
> - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
> - * value is no longer unique.
> - * To account for this, resctrl uses an index. On x86 this is just the RMID,
> - * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
> - *
> - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
> - * must accept an attempt to read every index.
> - */
> -static inline struct rmid_entry *__rmid_entry(u32 idx)
> -{
> -	struct rmid_entry *entry;
> -	u32 closid, rmid;
> -
> -	entry = &rmid_ptrs[idx];
> -	resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
> -
> -	WARN_ON_ONCE(entry->closid != closid);
> -	WARN_ON_ONCE(entry->rmid != rmid);
> -
> -	return entry;
> -}
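Conceptually, on x86 resctrl_arch_rmid_idx_encode() can simply return the RMID since RMIDs are already globally unique, while an arm64/MPAM implementation would fold the CLOSID in, e.g. something like closid * num_rmid + rmid, so each (CLOSID, PMG) pair gets its own index; the exact encoding is left to the architecture.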
> -
>  static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
>  {
>  	u64 msr_val;
> @@ -302,735 +217,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>  	return 0;
>  }
>  
> -static void limbo_release_entry(struct rmid_entry *entry)
> -{
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	rmid_limbo_count--;
> -	list_add_tail(&entry->list, &rmid_free_lru);
> -
> -	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> -		closid_num_dirty_rmid[entry->closid]--;
> -}
> -
> -/*
> - * Check the RMIDs that are marked as busy for this domain. If the
> - * reported LLC occupancy is below the threshold clear the busy bit and
> - * decrement the count. If the busy count gets to zero on an RMID, we
> - * free the RMID
> - */
> -void __check_limbo(struct rdt_domain *d, bool force_free)
> -{
> -	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> -	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> -	struct rmid_entry *entry;
> -	u32 idx, cur_idx = 1;
> -	void *arch_mon_ctx;
> -	bool rmid_dirty;
> -	u64 val = 0;
> -
> -	arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
> -	if (IS_ERR(arch_mon_ctx)) {
> -		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> -				    PTR_ERR(arch_mon_ctx));
> -		return;
> -	}
> -
> -	/*
> -	 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
> -	 * are marked as busy for occupancy < threshold. If the occupancy
> -	 * is less than the threshold decrement the busy counter of the
> -	 * RMID and move it to the free list when the counter reaches 0.
> -	 */
> -	for (;;) {
> -		idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
> -		if (idx >= idx_limit)
> -			break;
> -
> -		entry = __rmid_entry(idx);
> -		if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
> -					   QOS_L3_OCCUP_EVENT_ID, &val,
> -					   arch_mon_ctx)) {
> -			rmid_dirty = true;
> -		} else {
> -			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
> -		}
> -
> -		if (force_free || !rmid_dirty) {
> -			clear_bit(idx, d->rmid_busy_llc);
> -			if (!--entry->busy)
> -				limbo_release_entry(entry);
> -		}
> -		cur_idx = idx + 1;
> -	}
> -
> -	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
> -}
> -
> -bool has_busy_rmid(struct rdt_domain *d)
> -{
> -	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> -
> -	return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
> -}
> -
> -static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
> -{
> -	struct rmid_entry *itr;
> -	u32 itr_idx, cmp_idx;
> -
> -	if (list_empty(&rmid_free_lru))
> -		return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
> -
> -	list_for_each_entry(itr, &rmid_free_lru, list) {
> -		/*
> -		 * Get the index of this free RMID, and the index it would need
> -		 * to be if it were used with this CLOSID.
> -		 * If the CLOSID is irrelevant on this architecture, the two
> -		 * index values are always the same on every entry and thus the
> -		 * very first entry will be returned.
> -		 */
> -		itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
> -		cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
> -
> -		if (itr_idx == cmp_idx)
> -			return itr;
> -	}
> -
> -	return ERR_PTR(-ENOSPC);
> -}
> -
> -/**
> - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
> - *                                  RMID are clean, or the CLOSID that has
> - *                                  the most clean RMID.
> - *
> - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
> - * may not be able to allocate clean RMID. To avoid this the allocator will
> - * choose the CLOSID with the most clean RMID.
> - *
> - * When the CLOSID and RMID are independent numbers, the first free CLOSID will
> - * be returned.
> - */
> -int resctrl_find_cleanest_closid(void)
> -{
> -	u32 cleanest_closid = ~0;
> -	int i = 0;
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> -		return -EIO;
> -
> -	for (i = 0; i < closids_supported(); i++) {
> -		int num_dirty;
> -
> -		if (closid_allocated(i))
> -			continue;
> -
> -		num_dirty = closid_num_dirty_rmid[i];
> -		if (num_dirty == 0)
> -			return i;
> -
> -		if (cleanest_closid == ~0)
> -			cleanest_closid = i;
> -
> -		if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
> -			cleanest_closid = i;
> -	}
> -
> -	if (cleanest_closid == ~0)
> -		return -ENOSPC;
> -
> -	return cleanest_closid;
> -}
> -
> -/*
> - * For MPAM the RMID value is not unique, and has to be considered with
> - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
> - * allows all domains to be managed by a single free list.
> - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
> - */
> -int alloc_rmid(u32 closid)
> -{
> -	struct rmid_entry *entry;
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	entry = resctrl_find_free_rmid(closid);
> -	if (IS_ERR(entry))
> -		return PTR_ERR(entry);
> -
> -	list_del(&entry->list);
> -	return entry->rmid;
> -}
> -
> -static void add_rmid_to_limbo(struct rmid_entry *entry)
> -{
> -	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> -	struct rdt_domain *d;
> -	u32 idx;
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	/* Walking r->domains, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
> -
> -	entry->busy = 0;
> -	list_for_each_entry(d, &r->domains, list) {
> -		/*
> -		 * For the first limbo RMID in the domain,
> -		 * setup up the limbo worker.
> -		 */
> -		if (!has_busy_rmid(d))
> -			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
> -						RESCTRL_PICK_ANY_CPU);
> -		set_bit(idx, d->rmid_busy_llc);
> -		entry->busy++;
> -	}
> -
> -	rmid_limbo_count++;
> -	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> -		closid_num_dirty_rmid[entry->closid]++;
> -}
> -
> -void free_rmid(u32 closid, u32 rmid)
> -{
> -	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> -	struct rmid_entry *entry;
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	/*
> -	 * Do not allow the default rmid to be free'd. Comparing by index
> -	 * allows architectures that ignore the closid parameter to avoid an
> -	 * unnecessary check.
> -	 */
> -	if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
> -						RESCTRL_RESERVED_RMID))
> -		return;
> -
> -	entry = __rmid_entry(idx);
> -
> -	if (resctrl_arch_is_llc_occupancy_enabled())
> -		add_rmid_to_limbo(entry);
> -	else
> -		list_add_tail(&entry->list, &rmid_free_lru);
> -}
> -
> -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
> -				       u32 rmid, enum resctrl_event_id evtid)
> -{
> -	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> -
> -	switch (evtid) {
> -	case QOS_L3_MBM_TOTAL_EVENT_ID:
> -		return &d->mbm_total[idx];
> -	case QOS_L3_MBM_LOCAL_EVENT_ID:
> -		return &d->mbm_local[idx];
> -	default:
> -		return NULL;
> -	}
> -}
> -
> -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> -{
> -	struct mbm_state *m;
> -	u64 tval = 0;
> -
> -	if (rr->first) {
> -		resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
> -		m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
> -		if (m)
> -			memset(m, 0, sizeof(struct mbm_state));
> -		return 0;
> -	}
> -
> -	rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
> -					 &tval, rr->arch_mon_ctx);
> -	if (rr->err)
> -		return rr->err;
> -
> -	rr->val += tval;
> -
> -	return 0;
> -}
> -
> -/*
> - * mbm_bw_count() - Update bw count from values previously read by
> - *		    __mon_event_count().
> - * @closid:	The closid used to identify the cached mbm_state.
> - * @rmid:	The rmid used to identify the cached mbm_state.
> - * @rr:		The struct rmid_read populated by __mon_event_count().
> - *
> - * Supporting function to calculate the memory bandwidth
> - * and delta bandwidth in MBps. The chunks value previously read by
> - * __mon_event_count() is compared with the chunks value from the previous
> - * invocation. This must be called once per second to maintain values in MBps.
> - */
> -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
> -{
> -	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> -	struct mbm_state *m = &rr->d->mbm_local[idx];
> -	u64 cur_bw, bytes, cur_bytes;
> -
> -	cur_bytes = rr->val;
> -	bytes = cur_bytes - m->prev_bw_bytes;
> -	m->prev_bw_bytes = cur_bytes;
> -
> -	cur_bw = bytes / SZ_1M;
> -
> -	m->prev_bw = cur_bw;
> -}
> -
> -/*
> - * This is scheduled by mon_event_read() to read the CQM/MBM counters
> - * on a domain.
> - */
> -void mon_event_count(void *info)
> -{
> -	struct rdtgroup *rdtgrp, *entry;
> -	struct rmid_read *rr = info;
> -	struct list_head *head;
> -	int ret;
> -
> -	rdtgrp = rr->rgrp;
> -
> -	ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
> -
> -	/*
> -	 * For Ctrl groups read data from child monitor groups and
> -	 * add them together. Count events which are read successfully.
> -	 * Discard the rmid_read's reporting errors.
> -	 */
> -	head = &rdtgrp->mon.crdtgrp_list;
> -
> -	if (rdtgrp->type == RDTCTRL_GROUP) {
> -		list_for_each_entry(entry, head, mon.crdtgrp_list) {
> -			if (__mon_event_count(entry->closid, entry->mon.rmid,
> -					      rr) == 0)
> -				ret = 0;
> -		}
> -	}
> -
> -	/*
> -	 * __mon_event_count() calls for newly created monitor groups may
> -	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
> -	 * Discard error if any of the monitor event reads succeeded.
> -	 */
> -	if (ret == 0)
> -		rr->err = 0;
> -}
> -
> -/*
> - * Feedback loop for MBA software controller (mba_sc)
> - *
> - * mba_sc is a feedback loop where we periodically read MBM counters and
> - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
> - * that:
> - *
> - *   current bandwidth(cur_bw) < user specified bandwidth(user_bw)
> - *
> - * This uses the MBM counters to measure the bandwidth and MBA throttle
> - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
> - * fact that resctrl rdtgroups have both monitoring and control.
> - *
> - * The frequency of the checks is 1s and we just tag along the MBM overflow
> - * timer. Having 1s interval makes the calculation of bandwidth simpler.
> - *
> - * Although MBA's goal is to restrict the bandwidth to a maximum, there may
> - * be a need to increase the bandwidth to avoid unnecessarily restricting
> - * the L2 <-> L3 traffic.
> - *
> - * Since MBA controls the L2 external bandwidth where as MBM measures the
> - * L3 external bandwidth the following sequence could lead to such a
> - * situation.
> - *
> - * Consider an rdtgroup which had high L3 <-> memory traffic in initial
> - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
> - * after some time rdtgroup has mostly L2 <-> L3 traffic.
> - *
> - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
> - * throttle MSRs already have low percentage values.  To avoid
> - * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
> - */
> -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
> -{
> -	u32 closid, rmid, cur_msr_val, new_msr_val;
> -	struct mbm_state *pmbm_data, *cmbm_data;
> -	struct rdt_resource *r_mba;
> -	struct rdt_domain *dom_mba;
> -	u32 cur_bw, user_bw, idx;
> -	struct list_head *head;
> -	struct rdtgroup *entry;
> -
> -	if (!resctrl_arch_is_mbm_local_enabled())
> -		return;
> -
> -	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> -
> -	closid = rgrp->closid;
> -	rmid = rgrp->mon.rmid;
> -	idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> -	pmbm_data = &dom_mbm->mbm_local[idx];
> -
> -	dom_mba = resctrl_get_domain_from_cpu(smp_processor_id(), r_mba);
> -	if (!dom_mba) {
> -		pr_warn_once("Failure to get domain for MBA update\n");
> -		return;
> -	}
> -
> -	cur_bw = pmbm_data->prev_bw;
> -	user_bw = dom_mba->mbps_val[closid];
> -
> -	/* MBA resource doesn't support CDP */
> -	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
> -
> -	/*
> -	 * For Ctrl groups read data from child monitor groups.
> -	 */
> -	head = &rgrp->mon.crdtgrp_list;
> -	list_for_each_entry(entry, head, mon.crdtgrp_list) {
> -		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
> -		cur_bw += cmbm_data->prev_bw;
> -	}
> -
> -	/*
> -	 * Scale up/down the bandwidth linearly for the ctrl group.  The
> -	 * bandwidth step is the bandwidth granularity specified by the
> -	 * hardware.
> -	 * Always increase throttling if current bandwidth is above the
> -	 * target set by user.
> -	 * But avoid thrashing up and down on every poll by checking
> -	 * whether a decrease in throttling is likely to push the group
> -	 * back over target. E.g. if currently throttling to 30% of bandwidth
> -	 * on a system with 10% granularity steps, check whether moving to
> -	 * 40% would go past the limit by multiplying current bandwidth by
> -	 * "(30 + 10) / 30".
> -	 */
> -	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
> -		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
> -	} else if (cur_msr_val < MAX_MBA_BW &&
> -		   (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
> -		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
> -	} else {
> -		return;
> -	}
> -
> -	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
> -}
> -
> -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
> -		       u32 closid, u32 rmid)
> -{
> -	struct rmid_read rr;
> -
> -	rr.first = false;
> -	rr.r = r;
> -	rr.d = d;
> -
> -	/*
> -	 * This is protected from concurrent reads from user
> -	 * as both the user and we hold the global mutex.
> -	 */
> -	if (resctrl_arch_is_mbm_total_enabled()) {
> -		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> -		rr.val = 0;
> -		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> -		if (IS_ERR(rr.arch_mon_ctx)) {
> -			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> -					    PTR_ERR(rr.arch_mon_ctx));
> -			return;
> -		}
> -
> -		__mon_event_count(closid, rmid, &rr);
> -
> -		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> -	}
> -	if (resctrl_arch_is_mbm_local_enabled()) {
> -		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> -		rr.val = 0;
> -		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> -		if (IS_ERR(rr.arch_mon_ctx)) {
> -			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> -					    PTR_ERR(rr.arch_mon_ctx));
> -			return;
> -		}
> -
> -		__mon_event_count(closid, rmid, &rr);
> -
> -		/*
> -		 * Call the MBA software controller only for the
> -		 * control groups and when user has enabled
> -		 * the software controller explicitly.
> -		 */
> -		if (is_mba_sc(NULL))
> -			mbm_bw_count(closid, rmid, &rr);
> -
> -		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> -	}
> -}
> -
> -/*
> - * Handler to scan the limbo list and move the RMIDs
> - * to free list whose occupancy < threshold_occupancy.
> - */
> -void cqm_handle_limbo(struct work_struct *work)
> -{
> -	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
> -	struct rdt_domain *d;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	d = container_of(work, struct rdt_domain, cqm_limbo.work);
> -
> -	__check_limbo(d, false);
> -
> -	if (has_busy_rmid(d)) {
> -		d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
> -							   RESCTRL_PICK_ANY_CPU);
> -		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
> -					 delay);
> -	}
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -}
> -
> -/**
> - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
> - *                             domain.
> - * @dom:           The domain the limbo handler should run for.
> - * @delay_ms:      How far in the future the handler should run.
> - * @exclude_cpu:   Which CPU the handler should not run on,
> - *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
> - */
> -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
> -			     int exclude_cpu)
> -{
> -	unsigned long delay = msecs_to_jiffies(delay_ms);
> -	int cpu;
> -
> -	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
> -	dom->cqm_work_cpu = cpu;
> -
> -	if (cpu < nr_cpu_ids)
> -		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
> -}
> -
> -void mbm_handle_overflow(struct work_struct *work)
> -{
> -	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
> -	struct rdtgroup *prgrp, *crgrp;
> -	struct list_head *head;
> -	struct rdt_resource *r;
> -	struct rdt_domain *d;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	/*
> -	 * If the filesystem has been unmounted this work no longer needs to
> -	 * run.
> -	 */
> -	if (!resctrl_mounted || !resctrl_arch_mon_capable())
> -		goto out_unlock;
> -
> -	r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> -	d = container_of(work, struct rdt_domain, mbm_over.work);
> -
> -	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
> -		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
> -
> -		head = &prgrp->mon.crdtgrp_list;
> -		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
> -			mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
> -
> -		if (is_mba_sc(NULL))
> -			update_mba_bw(prgrp, d);
> -	}
> -
> -	/*
> -	 * Re-check for housekeeping CPUs. This allows the overflow handler to
> -	 * move off a nohz_full CPU quickly.
> -	 */
> -	d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
> -						   RESCTRL_PICK_ANY_CPU);
> -	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
> -
> -out_unlock:
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -}
> -
> -/**
> - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
> - *                                domain.
> - * @dom:           The domain the overflow handler should run for.
> - * @delay_ms:      How far in the future the handler should run.
> - * @exclude_cpu:   Which CPU the handler should not run on,
> - *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
> - */
> -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
> -				int exclude_cpu)
> -{
> -	unsigned long delay = msecs_to_jiffies(delay_ms);
> -	int cpu;
> -
> -	/*
> -	 * When a domain comes online there is no guarantee the filesystem is
> -	 * mounted. If not, there is no need to catch counter overflow.
> -	 */
> -	if (!resctrl_mounted || !resctrl_arch_mon_capable())
> -		return;
> -	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
> -	dom->mbm_work_cpu = cpu;
> -
> -	if (cpu < nr_cpu_ids)
> -		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
> -}
> -
> -static int dom_data_init(struct rdt_resource *r)
> -{
> -	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> -	u32 num_closid = resctrl_arch_get_num_closid(r);
> -	struct rmid_entry *entry = NULL;
> -	int err = 0, i;
> -	u32 idx;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> -		u32 *tmp;
> -
> -		/*
> -		 * If the architecture hasn't provided a sanitised value here,
> -		 * this may result in larger arrays than necessary. Resctrl will
> -		 * use a smaller system wide value based on the resources in
> -		 * use.
> -		 */
> -		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
> -		if (!tmp) {
> -			err = -ENOMEM;
> -			goto out_unlock;
> -		}
> -
> -		closid_num_dirty_rmid = tmp;
> -	}
> -
> -	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
> -	if (!rmid_ptrs) {
> -		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> -			kfree(closid_num_dirty_rmid);
> -			closid_num_dirty_rmid = NULL;
> -		}
> -		err = -ENOMEM;
> -		goto out_unlock;
> -	}
> -
> -	for (i = 0; i < idx_limit; i++) {
> -		entry = &rmid_ptrs[i];
> -		INIT_LIST_HEAD(&entry->list);
> -
> -		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
> -		list_add_tail(&entry->list, &rmid_free_lru);
> -	}
> -
> -	/*
> -	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
> -	 * are always allocated. These are used for the rdtgroup_default
> -	 * control group, which will be setup later in rdtgroup_init().
> -	 */
> -	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
> -					   RESCTRL_RESERVED_RMID);
> -	entry = __rmid_entry(idx);
> -	list_del(&entry->list);
> -
> -out_unlock:
> -	mutex_unlock(&rdtgroup_mutex);
> -
> -	return err;
> -}
> -
> -static void dom_data_exit(struct rdt_resource *r)
> -{
> -	if (!r->mon_capable)
> -		return;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> -		kfree(closid_num_dirty_rmid);
> -		closid_num_dirty_rmid = NULL;
> -	}
> -
> -	kfree(rmid_ptrs);
> -	rmid_ptrs = NULL;
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -}
> -
> -static struct mon_evt llc_occupancy_event = {
> -	.name		= "llc_occupancy",
> -	.evtid		= QOS_L3_OCCUP_EVENT_ID,
> -};
> -
> -static struct mon_evt mbm_total_event = {
> -	.name		= "mbm_total_bytes",
> -	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
> -};
> -
> -static struct mon_evt mbm_local_event = {
> -	.name		= "mbm_local_bytes",
> -	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
> -};
> -
> -/*
> - * Initialize the event list for the resource.
> - *
> - * Note that MBM events are also part of RDT_RESOURCE_L3 resource
> - * because as per the SDM the total and local memory bandwidth
> - * are enumerated as part of L3 monitoring.
> - */
> -static void l3_mon_evt_init(struct rdt_resource *r)
> -{
> -	INIT_LIST_HEAD(&r->evt_list);
> -
> -	if (resctrl_arch_is_llc_occupancy_enabled())
> -		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
> -	if (resctrl_arch_is_mbm_total_enabled())
> -		list_add_tail(&mbm_total_event.list, &r->evt_list);
> -	if (resctrl_arch_is_mbm_local_enabled())
> -		list_add_tail(&mbm_local_event.list, &r->evt_list);
> -}
> -
> -int resctrl_mon_resource_init(void)
> -{
> -	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> -	int ret;
> -
> -	if (!r->mon_capable)
> -		return 0;
> -
> -	ret = dom_data_init(r);
> -	if (ret)
> -		return ret;
> -
> -	l3_mon_evt_init(r);
> -
> -	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
> -		mbm_total_event.configurable = true;
> -		mbm_config_rftype_init("mbm_total_bytes_config");
> -	}
> -	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
> -		mbm_local_event.configurable = true;
> -		mbm_config_rftype_init("mbm_local_bytes_config");
> -	}
> -
> -	return 0;
> -}
> -
>  int __init rdt_get_mon_l3_config(struct rdt_resource *r)
>  {
>  	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
> @@ -1076,13 +262,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
>  	return 0;
>  }
>  
> -void resctrl_mon_resource_exit(void)
> -{
> -	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> -
> -	dom_data_exit(r);
> -}
> -
>  void __init intel_rdt_mbm_apply_quirk(void)
>  {
>  	int cf_index;
> diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
> index ba51ab1f70e6..ba1596afee10 100644
> --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
> +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
> @@ -39,28 +39,6 @@
>   */
>  static u64 prefetch_disable_bits;
>  
> -/*
> - * Major number assigned to and shared by all devices exposing
> - * pseudo-locked regions.
> - */
> -static unsigned int pseudo_lock_major;
> -static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
> -
> -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
> -{
> -	const struct rdtgroup *rdtgrp;
> -
> -	rdtgrp = dev_get_drvdata(dev);
> -	if (mode)
> -		*mode = 0600;
> -	return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
> -}
> -
> -static const struct class pseudo_lock_class = {
> -	.name = "pseudo_lock",
> -	.devnode = pseudo_lock_devnode,
> -};
> -
>  /**
>   * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
>   *                                          platforms
> @@ -121,299 +99,6 @@ u64 resctrl_arch_get_prefetch_disable_bits(void)
>  	return prefetch_disable_bits;
>  }
>  
> -/**
> - * pseudo_lock_minor_get - Obtain available minor number
> - * @minor: Pointer to where new minor number will be stored
> - *
> - * A bitmask is used to track available minor numbers. Here the next free
> - * minor number is marked as unavailable and returned.
> - *
> - * Return: 0 on success, <0 on failure.
> - */
> -static int pseudo_lock_minor_get(unsigned int *minor)
> -{
> -	unsigned long first_bit;
> -
> -	first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
> -
> -	if (first_bit == MINORBITS)
> -		return -ENOSPC;
> -
> -	__clear_bit(first_bit, &pseudo_lock_minor_avail);
> -	*minor = first_bit;
> -
> -	return 0;
> -}
> -
> -/**
> - * pseudo_lock_minor_release - Return minor number to available
> - * @minor: The minor number made available
> - */
> -static void pseudo_lock_minor_release(unsigned int minor)
> -{
> -	__set_bit(minor, &pseudo_lock_minor_avail);
> -}
> -
> -/**
> - * region_find_by_minor - Locate a pseudo-lock region by inode minor number
> - * @minor: The minor number of the device representing pseudo-locked region
> - *
> - * When the character device is accessed we need to determine which
> - * pseudo-locked region it belongs to. This is done by matching the minor
> - * number of the device to the pseudo-locked region it belongs to.
> - *
> - * Minor numbers are assigned at the time a pseudo-locked region is associated
> - * with a cache instance.
> - *
> - * Return: On success return pointer to resource group owning the pseudo-locked
> - *         region, NULL on failure.
> - */
> -static struct rdtgroup *region_find_by_minor(unsigned int minor)
> -{
> -	struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
> -
> -	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> -		if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
> -			rdtgrp_match = rdtgrp;
> -			break;
> -		}
> -	}
> -	return rdtgrp_match;
> -}
> -
> -/**
> - * struct pseudo_lock_pm_req - A power management QoS request list entry
> - * @list:	Entry within the @pm_reqs list for a pseudo-locked region
> - * @req:	PM QoS request
> - */
> -struct pseudo_lock_pm_req {
> -	struct list_head list;
> -	struct dev_pm_qos_request req;
> -};
> -
> -static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
> -{
> -	struct pseudo_lock_pm_req *pm_req, *next;
> -
> -	list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
> -		dev_pm_qos_remove_request(&pm_req->req);
> -		list_del(&pm_req->list);
> -		kfree(pm_req);
> -	}
> -}
> -
> -/**
> - * pseudo_lock_cstates_constrain - Restrict cores from entering C6
> - * @plr: Pseudo-locked region
> - *
> - * To prevent the cache from being affected by power management, entering
> - * C6 has to be avoided. This is accomplished by requesting a latency
> - * requirement lower than lowest C6 exit latency of all supported
> - * platforms as found in the cpuidle state tables in the intel_idle driver.
> - * At this time it is possible to do so with a single latency requirement
> - * for all supported platforms.
> - *
> - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
> - * the ACPI latencies need to be considered while keeping in mind that C2
> - * may be set to map to deeper sleep states. In this case the latency
> - * requirement needs to prevent entering C2 also.
> - *
> - * Return: 0 on success, <0 on failure
> - */
> -static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
> -{
> -	struct pseudo_lock_pm_req *pm_req;
> -	int cpu;
> -	int ret;
> -
> -	for_each_cpu(cpu, &plr->d->cpu_mask) {
> -		pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
> -		if (!pm_req) {
> -			rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
> -			ret = -ENOMEM;
> -			goto out_err;
> -		}
> -		ret = dev_pm_qos_add_request(get_cpu_device(cpu),
> -					     &pm_req->req,
> -					     DEV_PM_QOS_RESUME_LATENCY,
> -					     30);
> -		if (ret < 0) {
> -			rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
> -					    cpu);
> -			kfree(pm_req);
> -			ret = -1;
> -			goto out_err;
> -		}
> -		list_add(&pm_req->list, &plr->pm_reqs);
> -	}
> -
> -	return 0;
> -
> -out_err:
> -	pseudo_lock_cstates_relax(plr);
> -	return ret;
> -}
> -
> -/**
> - * pseudo_lock_region_clear - Reset pseudo-lock region data
> - * @plr: pseudo-lock region
> - *
> - * All content of the pseudo-locked region is reset: any allocated memory
> - * is freed.
> - *
> - * Return: void
> - */
> -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
> -{
> -	plr->size = 0;
> -	plr->line_size = 0;
> -	kfree(plr->kmem);
> -	plr->kmem = NULL;
> -	plr->s = NULL;
> -	if (plr->d)
> -		plr->d->plr = NULL;
> -	plr->d = NULL;
> -	plr->cbm = 0;
> -	plr->debugfs_dir = NULL;
> -}
> -
> -/**
> - * pseudo_lock_region_init - Initialize pseudo-lock region information
> - * @plr: pseudo-lock region
> - *
> - * Called after user provided a schemata to be pseudo-locked. From the
> - * schemata the &struct pseudo_lock_region is on entry already initialized
> - * with the resource, domain, and capacity bitmask. Here the information
> - * required for pseudo-locking is deduced from this data and &struct
> - * pseudo_lock_region initialized further. This information includes:
> - * - size in bytes of the region to be pseudo-locked
> - * - cache line size to know the stride with which data needs to be accessed
> - *   to be pseudo-locked
> - * - a cpu associated with the cache instance on which the pseudo-locking
> - *   flow can be executed
> - *
> - * Return: 0 on success, <0 on failure. Descriptive error will be written
> - * to last_cmd_status buffer.
> - */
> -static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
> -{
> -	struct cpu_cacheinfo *ci;
> -	int ret;
> -	int i;
> -
> -	/* Pick the first cpu we find that is associated with the cache. */
> -	plr->cpu = cpumask_first(&plr->d->cpu_mask);
> -
> -	if (!cpu_online(plr->cpu)) {
> -		rdt_last_cmd_printf("CPU %u associated with cache not online\n",
> -				    plr->cpu);
> -		ret = -ENODEV;
> -		goto out_region;
> -	}
> -
> -	ci = get_cpu_cacheinfo(plr->cpu);
> -
> -	plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
> -
> -	for (i = 0; i < ci->num_leaves; i++) {
> -		if (ci->info_list[i].level == plr->s->res->cache_level) {
> -			plr->line_size = ci->info_list[i].coherency_line_size;
> -			return 0;
> -		}
> -	}
> -
> -	ret = -1;
> -	rdt_last_cmd_puts("Unable to determine cache line size\n");
> -out_region:
> -	pseudo_lock_region_clear(plr);
> -	return ret;
> -}
> -
> -/**
> - * pseudo_lock_init - Initialize a pseudo-lock region
> - * @rdtgrp: resource group to which new pseudo-locked region will belong
> - *
> - * A pseudo-locked region is associated with a resource group. When this
> - * association is created the pseudo-locked region is initialized. The
> - * details of the pseudo-locked region are not known at this time so only
> - * allocation is done and association established.
> - *
> - * Return: 0 on success, <0 on failure
> - */
> -static int pseudo_lock_init(struct rdtgroup *rdtgrp)
> -{
> -	struct pseudo_lock_region *plr;
> -
> -	plr = kzalloc(sizeof(*plr), GFP_KERNEL);
> -	if (!plr)
> -		return -ENOMEM;
> -
> -	init_waitqueue_head(&plr->lock_thread_wq);
> -	INIT_LIST_HEAD(&plr->pm_reqs);
> -	rdtgrp->plr = plr;
> -	return 0;
> -}
> -
> -/**
> - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
> - * @plr: pseudo-lock region
> - *
> - * Initialize the details required to set up the pseudo-locked region and
> - * allocate the contiguous memory that will be pseudo-locked to the cache.
> - *
> - * Return: 0 on success, <0 on failure.  Descriptive error will be written
> - * to last_cmd_status buffer.
> - */
> -static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
> -{
> -	int ret;
> -
> -	ret = pseudo_lock_region_init(plr);
> -	if (ret < 0)
> -		return ret;
> -
> -	/*
> -	 * We do not yet support contiguous regions larger than
> -	 * KMALLOC_MAX_SIZE.
> -	 */
> -	if (plr->size > KMALLOC_MAX_SIZE) {
> -		rdt_last_cmd_puts("Requested region exceeds maximum size\n");
> -		ret = -E2BIG;
> -		goto out_region;
> -	}
> -
> -	plr->kmem = kzalloc(plr->size, GFP_KERNEL);
> -	if (!plr->kmem) {
> -		rdt_last_cmd_puts("Unable to allocate memory\n");
> -		ret = -ENOMEM;
> -		goto out_region;
> -	}
> -
> -	ret = 0;
> -	goto out;
> -out_region:
> -	pseudo_lock_region_clear(plr);
> -out:
> -	return ret;
> -}
> -
> -/**
> - * pseudo_lock_free - Free a pseudo-locked region
> - * @rdtgrp: resource group to which pseudo-locked region belonged
> - *
> - * The pseudo-locked region's resources have already been released, or not
> - * yet created at this point. Now it can be freed and disassociated from the
> - * resource group.
> - *
> - * Return: void
> - */
> -static void pseudo_lock_free(struct rdtgroup *rdtgrp)
> -{
> -	pseudo_lock_region_clear(rdtgrp->plr);
> -	kfree(rdtgrp->plr);
> -	rdtgrp->plr = NULL;
> -}
> -
>  /**
>   * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
>   * @_plr: the pseudo-lock region descriptor
> @@ -543,345 +228,6 @@ int resctrl_arch_pseudo_lock_fn(void *_plr)
>  	return 0;
>  }
>  
> -/**
> - * rdtgroup_monitor_in_progress - Test if monitoring in progress
> - * @rdtgrp: resource group being queried
> - *
> - * Return: 1 if monitor groups have been created for this resource
> - * group, 0 otherwise.
> - */
> -static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
> -{
> -	return !list_empty(&rdtgrp->mon.crdtgrp_list);
> -}
> -
> -/**
> - * rdtgroup_locksetup_user_restrict - Restrict user access to group
> - * @rdtgrp: resource group needing access restricted
> - *
> - * A resource group used for cache pseudo-locking cannot have cpus or tasks
> - * assigned to it. This is communicated to the user by restricting access
> - * to all the files that can be used to make such changes.
> - *
> - * Permissions restored with rdtgroup_locksetup_user_restore()
> - *
> - * Return: 0 on success, <0 on failure. If a failure occurs during the
> - * restriction of access an attempt will be made to restore permissions but
> - * the state of the mode of these files will be uncertain when a failure
> - * occurs.
> - */
> -static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
> -{
> -	int ret;
> -
> -	ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
> -	if (ret)
> -		return ret;
> -
> -	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
> -	if (ret)
> -		goto err_tasks;
> -
> -	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
> -	if (ret)
> -		goto err_cpus;
> -
> -	if (resctrl_arch_mon_capable()) {
> -		ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
> -		if (ret)
> -			goto err_cpus_list;
> -	}
> -
> -	ret = 0;
> -	goto out;
> -
> -err_cpus_list:
> -	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
> -err_cpus:
> -	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
> -err_tasks:
> -	rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
> -out:
> -	return ret;
> -}
> -
> -/**
> - * rdtgroup_locksetup_user_restore - Restore user access to group
> - * @rdtgrp: resource group needing access restored
> - *
> - * Restore all file access previously removed using
> - * rdtgroup_locksetup_user_restrict()
> - *
> - * Return: 0 on success, <0 on failure.  If a failure occurs during the
> - * restoration of access an attempt will be made to restrict permissions
> - * again but the state of the mode of these files will be uncertain when
> - * a failure occurs.
> - */
> -static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
> -{
> -	int ret;
> -
> -	ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
> -	if (ret)
> -		return ret;
> -
> -	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
> -	if (ret)
> -		goto err_tasks;
> -
> -	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
> -	if (ret)
> -		goto err_cpus;
> -
> -	if (resctrl_arch_mon_capable()) {
> -		ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
> -		if (ret)
> -			goto err_cpus_list;
> -	}
> -
> -	ret = 0;
> -	goto out;
> -
> -err_cpus_list:
> -	rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
> -err_cpus:
> -	rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
> -err_tasks:
> -	rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
> -out:
> -	return ret;
> -}
> -
> -/**
> - * rdtgroup_locksetup_enter - Resource group enters locksetup mode
> - * @rdtgrp: resource group requested to enter locksetup mode
> - *
> - * A resource group enters locksetup mode to reflect that it would be used
> - * to represent a pseudo-locked region and is in the process of being set
> - * up to do so. A resource group used for a pseudo-locked region would
> - * lose the closid associated with it so we cannot allow it to have any
> - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
> - * future. Monitoring of a pseudo-locked region is not allowed either.
> - *
> - * The above and more restrictions on a pseudo-locked region are checked
> - * for and enforced before the resource group enters the locksetup mode.
> - *
> - * Returns: 0 if the resource group successfully entered locksetup mode, <0
> - * on failure. On failure the last_cmd_status buffer is updated with text to
> - * communicate details of failure to the user.
> - */
> -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
> -{
> -	int ret;
> -
> -	/*
> -	 * The default resource group can neither be removed nor lose the
> -	 * default closid associated with it.
> -	 */
> -	if (rdtgrp == &rdtgroup_default) {
> -		rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
> -		return -EINVAL;
> -	}
> -
> -	/*
> -	 * Cache Pseudo-locking not supported when CDP is enabled.
> -	 *
> -	 * Some things to consider if you would like to enable this
> -	 * support (using L3 CDP as example):
> -	 * - When CDP is enabled two separate resources are exposed,
> -	 *   L3DATA and L3CODE, but they are actually on the same cache.
> -	 *   The implication for pseudo-locking is that if a
> -	 *   pseudo-locked region is created on a domain of one
> -	 *   resource (eg. L3CODE), then a pseudo-locked region cannot
> -	 *   be created on that same domain of the other resource
> -	 *   (eg. L3DATA). This is because the creation of a
> -	 *   pseudo-locked region involves a call to wbinvd that will
> -	 *   affect all cache allocations on a particular domain.
> -	 * - Considering the previous, it may be possible to only
> -	 *   expose one of the CDP resources to pseudo-locking and
> -	 *   hide the other. For example, we could consider only exposing
> -	 *   L3DATA and, since the L3 cache is unified, it is
> -	 *   still possible to place instructions there and execute them.
> -	 * - If only one region is exposed to pseudo-locking we should
> -	 *   still keep in mind that availability of a portion of cache
> -	 *   for pseudo-locking should take into account both resources.
> -	 *   Similarly, if a pseudo-locked region is created in one
> -	 *   resource, the portion of cache used by it should be made
> -	 *   unavailable to all future allocations from both resources.
> -	 */
> -	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
> -	    resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
> -		rdt_last_cmd_puts("CDP enabled\n");
> -		return -EINVAL;
> -	}
> -
> -	/*
> -	 * Not knowing the bits to disable prefetching implies that this
> -	 * platform does not support Cache Pseudo-Locking.
> -	 */
> -	if (resctrl_arch_get_prefetch_disable_bits() == 0) {
> -		rdt_last_cmd_puts("Pseudo-locking not supported\n");
> -		return -EINVAL;
> -	}
> -
> -	if (rdtgroup_monitor_in_progress(rdtgrp)) {
> -		rdt_last_cmd_puts("Monitoring in progress\n");
> -		return -EINVAL;
> -	}
> -
> -	if (rdtgroup_tasks_assigned(rdtgrp)) {
> -		rdt_last_cmd_puts("Tasks assigned to resource group\n");
> -		return -EINVAL;
> -	}
> -
> -	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
> -		rdt_last_cmd_puts("CPUs assigned to resource group\n");
> -		return -EINVAL;
> -	}
> -
> -	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
> -		rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
> -		return -EIO;
> -	}
> -
> -	ret = pseudo_lock_init(rdtgrp);
> -	if (ret) {
> -		rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
> -		goto out_release;
> -	}
> -
> -	/*
> -	 * If this system is capable of monitoring a rmid would have been
> -	 * allocated when the control group was created. This is not needed
> -	 * anymore when this group would be used for pseudo-locking. This
> -	 * is safe to call on platforms not capable of monitoring.
> -	 */
> -	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> -
> -	ret = 0;
> -	goto out;
> -
> -out_release:
> -	rdtgroup_locksetup_user_restore(rdtgrp);
> -out:
> -	return ret;
> -}
> -
> -/**
> - * rdtgroup_locksetup_exit - resource group exits locksetup mode
> - * @rdtgrp: resource group
> - *
> - * When a resource group exits locksetup mode the earlier restrictions are
> - * lifted.
> - *
> - * Return: 0 on success, <0 on failure
> - */
> -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
> -{
> -	int ret;
> -
> -	if (resctrl_arch_mon_capable()) {
> -		ret = alloc_rmid(rdtgrp->closid);
> -		if (ret < 0) {
> -			rdt_last_cmd_puts("Out of RMIDs\n");
> -			return ret;
> -		}
> -		rdtgrp->mon.rmid = ret;
> -	}
> -
> -	ret = rdtgroup_locksetup_user_restore(rdtgrp);
> -	if (ret) {
> -		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> -		return ret;
> -	}
> -
> -	pseudo_lock_free(rdtgrp);
> -	return 0;
> -}
> -
> -/**
> - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
> - * @d: RDT domain
> - * @cbm: CBM to test
> - *
> - * @d represents a cache instance and @cbm a capacity bitmask that is
> - * considered for it. Determine if @cbm overlaps with any existing
> - * pseudo-locked region on @d.
> - *
> - * @cbm is unsigned long, even if only 32 bits are used, to make the
> - * bitmap functions work correctly.
> - *
> - * Return: true if @cbm overlaps with pseudo-locked region on @d, false
> - * otherwise.
> - */
> -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
> -{
> -	unsigned int cbm_len;
> -	unsigned long cbm_b;
> -
> -	if (d->plr) {
> -		cbm_len = d->plr->s->res->cache.cbm_len;
> -		cbm_b = d->plr->cbm;
> -		if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
> -			return true;
> -	}
> -	return false;
> -}
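
(Aside, for illustration only, not part of the patch: a user-space sketch of
the overlap test rdtgroup_cbm_overlaps_pseudo_locked() performs: two capacity
bitmasks conflict when they share at least one set bit. The CBM values are
made up.)

    #include <stdbool.h>
    #include <stdio.h>

    static bool cbm_overlaps(unsigned long cbm, unsigned long plr_cbm)
    {
            /* Same idea as bitmap_intersects() for a single word. */
            return (cbm & plr_cbm) != 0;
    }

    int main(void)
    {
            unsigned long plr_cbm = 0x3c0;  /* bits 6-9 pseudo-locked */

            printf("0x00f overlaps: %d\n", cbm_overlaps(0x00f, plr_cbm)); /* 0 */
            printf("0x0f0 overlaps: %d\n", cbm_overlaps(0x0f0, plr_cbm)); /* 1 */
            return 0;
    }
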
> -
> -/**
> - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
> - * @d: RDT domain under test
> - *
> - * The setup of a pseudo-locked region affects all cache instances within
> - * the hierarchy of the region. It is thus essential to know if any
> - * pseudo-locked regions exist within a cache hierarchy to prevent any
> - * attempts to create new pseudo-locked regions in the same hierarchy.
> - *
> - * Return: true if a pseudo-locked region exists in the hierarchy of @d or
> - *         if it is not possible to test due to memory allocation issue,
> - *         false otherwise.
> - */
> -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
> -{
> -	cpumask_var_t cpu_with_psl;
> -	enum resctrl_res_level i;
> -	struct rdt_resource *r;
> -	struct rdt_domain *d_i;
> -	bool ret = false;
> -
> -	/* Walking r->domains, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -	if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
> -		return true;
> -
> -	/*
> -	 * First determine which cpus have pseudo-locked regions
> -	 * associated with them.
> -	 */
> -	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> -		r = resctrl_arch_get_resource(i);
> -		if (!r->alloc_capable)
> -			continue;
> -
> -		list_for_each_entry(d_i, &r->domains, list) {
> -			if (d_i->plr)
> -				cpumask_or(cpu_with_psl, cpu_with_psl,
> -					   &d_i->cpu_mask);
> -		}
> -	}
> -
> -	/*
> -	 * Next test if new pseudo-locked region would intersect with
> -	 * existing region.
> -	 */
> -	if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
> -		ret = true;
> -
> -	free_cpumask_var(cpu_with_psl);
> -	return ret;
> -}
> -
>  /**
>   * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
>   *                                      pseudo-locked memory
> @@ -1174,442 +520,3 @@ int resctrl_arch_measure_l3_residency(void *_plr)
>  	wake_up_interruptible(&plr->lock_thread_wq);
>  	return 0;
>  }
> -
> -/**
> - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
> - * @rdtgrp: Resource group to which the pseudo-locked region belongs.
> - * @sel: Selector of which measurement to perform on a pseudo-locked region.
> - *
> - * The measurement of latency to access a pseudo-locked region should be
> - * done from a cpu that is associated with that pseudo-locked region.
> - * Determine which cpu is associated with this region and start a thread on
> - * that cpu to perform the measurement, wait for that thread to complete.
> - *
> - * Return: 0 on success, <0 on failure
> - */
> -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
> -{
> -	struct pseudo_lock_region *plr = rdtgrp->plr;
> -	struct task_struct *thread;
> -	unsigned int cpu;
> -	int ret = -1;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	if (rdtgrp->flags & RDT_DELETED) {
> -		ret = -ENODEV;
> -		goto out;
> -	}
> -
> -	if (!plr->d) {
> -		ret = -ENODEV;
> -		goto out;
> -	}
> -
> -	plr->thread_done = 0;
> -	cpu = cpumask_first(&plr->d->cpu_mask);
> -	if (!cpu_online(cpu)) {
> -		ret = -ENODEV;
> -		goto out;
> -	}
> -
> -	plr->cpu = cpu;
> -
> -	if (sel == 1)
> -		thread = kthread_create_on_node(resctrl_arch_measure_cycles_lat_fn,
> -						plr, cpu_to_node(cpu),
> -						"pseudo_lock_measure/%u",
> -						cpu);
> -	else if (sel == 2)
> -		thread = kthread_create_on_node(resctrl_arch_measure_l2_residency,
> -						plr, cpu_to_node(cpu),
> -						"pseudo_lock_measure/%u",
> -						cpu);
> -	else if (sel == 3)
> -		thread = kthread_create_on_node(resctrl_arch_measure_l3_residency,
> -						plr, cpu_to_node(cpu),
> -						"pseudo_lock_measure/%u",
> -						cpu);
> -	else
> -		goto out;
> -
> -	if (IS_ERR(thread)) {
> -		ret = PTR_ERR(thread);
> -		goto out;
> -	}
> -	kthread_bind(thread, cpu);
> -	wake_up_process(thread);
> -
> -	ret = wait_event_interruptible(plr->lock_thread_wq,
> -				       plr->thread_done == 1);
> -	if (ret < 0)
> -		goto out;
> -
> -	ret = 0;
> -
> -out:
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -	return ret;
> -}
> -
> -static ssize_t pseudo_lock_measure_trigger(struct file *file,
> -					   const char __user *user_buf,
> -					   size_t count, loff_t *ppos)
> -{
> -	struct rdtgroup *rdtgrp = file->private_data;
> -	size_t buf_size;
> -	char buf[32];
> -	int ret;
> -	int sel;
> -
> -	buf_size = min(count, (sizeof(buf) - 1));
> -	if (copy_from_user(buf, user_buf, buf_size))
> -		return -EFAULT;
> -
> -	buf[buf_size] = '\0';
> -	ret = kstrtoint(buf, 10, &sel);
> -	if (ret == 0) {
> -		if (sel != 1 && sel != 2 && sel != 3)
> -			return -EINVAL;
> -		ret = debugfs_file_get(file->f_path.dentry);
> -		if (ret)
> -			return ret;
> -		ret = pseudo_lock_measure_cycles(rdtgrp, sel);
> -		if (ret == 0)
> -			ret = count;
> -		debugfs_file_put(file->f_path.dentry);
> -	}
> -
> -	return ret;
> -}
> -
> -static const struct file_operations pseudo_measure_fops = {
> -	.write = pseudo_lock_measure_trigger,
> -	.open = simple_open,
> -	.llseek = default_llseek,
> -};
> -
> -/**
> - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
> - * @rdtgrp: resource group to which pseudo-lock region belongs
> - *
> - * Called when a resource group in the pseudo-locksetup mode receives a
> - * valid schemata that should be pseudo-locked. Since the resource group is
> - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
> - * allocated and initialized with the essential information. If a failure
> - * occurs the resource group remains in the pseudo-locksetup mode with the
> - * &struct pseudo_lock_region associated with it, but cleared from all
> - * information and ready for the user to re-attempt pseudo-locking by
> - * writing the schemata again.
> - *
> - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
> - * on failure. Descriptive error will be written to last_cmd_status buffer.
> - */
> -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
> -{
> -	struct pseudo_lock_region *plr = rdtgrp->plr;
> -	struct task_struct *thread;
> -	unsigned int new_minor;
> -	struct device *dev;
> -	int ret;
> -
> -	ret = pseudo_lock_region_alloc(plr);
> -	if (ret < 0)
> -		return ret;
> -
> -	ret = pseudo_lock_cstates_constrain(plr);
> -	if (ret < 0) {
> -		ret = -EINVAL;
> -		goto out_region;
> -	}
> -
> -	plr->thread_done = 0;
> -
> -	plr->closid = rdtgrp->closid;
> -	thread = kthread_create_on_node(resctrl_arch_pseudo_lock_fn, plr,
> -					cpu_to_node(plr->cpu),
> -					"pseudo_lock/%u", plr->cpu);
> -	if (IS_ERR(thread)) {
> -		ret = PTR_ERR(thread);
> -		rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
> -		goto out_cstates;
> -	}
> -
> -	kthread_bind(thread, plr->cpu);
> -	wake_up_process(thread);
> -
> -	ret = wait_event_interruptible(plr->lock_thread_wq,
> -				       plr->thread_done == 1);
> -	if (ret < 0) {
> -		/*
> -		 * If the thread does not get on the CPU for whatever
> -		 * reason and the process which sets up the region is
> -		 * interrupted then this will leave the thread in runnable
> -		 * state and once it gets on the CPU it will dereference
> -		 * the cleared, but not freed, plr struct resulting in an
> -		 * empty pseudo-locking loop.
> -		 */
> -		rdt_last_cmd_puts("Locking thread interrupted\n");
> -		goto out_cstates;
> -	}
> -
> -	ret = pseudo_lock_minor_get(&new_minor);
> -	if (ret < 0) {
> -		rdt_last_cmd_puts("Unable to obtain a new minor number\n");
> -		goto out_cstates;
> -	}
> -
> -	/*
> -	 * Unlock access but do not release the reference. The
> -	 * pseudo-locked region will still be here on return.
> -	 *
> -	 * The mutex has to be released temporarily to avoid a potential
> -	 * deadlock with the mm->mmap_lock which is obtained in the
> -	 * device_create() and debugfs_create_dir() callpath below as well as
> -	 * before the mmap() callback is called.
> -	 */
> -	mutex_unlock(&rdtgroup_mutex);
> -
> -	if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
> -		plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
> -						      debugfs_resctrl);
> -		if (!IS_ERR_OR_NULL(plr->debugfs_dir))
> -			debugfs_create_file("pseudo_lock_measure", 0200,
> -					    plr->debugfs_dir, rdtgrp,
> -					    &pseudo_measure_fops);
> -	}
> -
> -	dev = device_create(&pseudo_lock_class, NULL,
> -			    MKDEV(pseudo_lock_major, new_minor),
> -			    rdtgrp, "%s", rdtgrp->kn->name);
> -
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	if (IS_ERR(dev)) {
> -		ret = PTR_ERR(dev);
> -		rdt_last_cmd_printf("Failed to create character device: %d\n",
> -				    ret);
> -		goto out_debugfs;
> -	}
> -
> -	/* We released the mutex - check if group was removed while we did so */
> -	if (rdtgrp->flags & RDT_DELETED) {
> -		ret = -ENODEV;
> -		goto out_device;
> -	}
> -
> -	plr->minor = new_minor;
> -
> -	rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
> -	closid_free(rdtgrp->closid);
> -	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
> -	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
> -
> -	ret = 0;
> -	goto out;
> -
> -out_device:
> -	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
> -out_debugfs:
> -	debugfs_remove_recursive(plr->debugfs_dir);
> -	pseudo_lock_minor_release(new_minor);
> -out_cstates:
> -	pseudo_lock_cstates_relax(plr);
> -out_region:
> -	pseudo_lock_region_clear(plr);
> -out:
> -	return ret;
> -}
> -
> -/**
> - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
> - * @rdtgrp: resource group to which the pseudo-locked region belongs
> - *
> - * The removal of a pseudo-locked region can be initiated when the resource
> - * group is removed via a "rmdir" from userspace or the
> - * unmount of the resctrl filesystem. On removal the resource group does
> - * not go back to pseudo-locksetup mode before it is removed, instead it is
> - * removed directly. There is thus asymmetry with the creation where the
> - * &struct pseudo_lock_region is removed here while it was not created in
> - * rdtgroup_pseudo_lock_create().
> - *
> - * Return: void
> - */
> -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
> -{
> -	struct pseudo_lock_region *plr = rdtgrp->plr;
> -
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -		/*
> -		 * Default group cannot be a pseudo-locked region so we can
> -		 * free closid here.
> -		 */
> -		closid_free(rdtgrp->closid);
> -		goto free;
> -	}
> -
> -	pseudo_lock_cstates_relax(plr);
> -	debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
> -	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
> -	pseudo_lock_minor_release(plr->minor);
> -
> -free:
> -	pseudo_lock_free(rdtgrp);
> -}
> -
> -static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
> -{
> -	struct rdtgroup *rdtgrp;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	rdtgrp = region_find_by_minor(iminor(inode));
> -	if (!rdtgrp) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -ENODEV;
> -	}
> -
> -	filp->private_data = rdtgrp;
> -	atomic_inc(&rdtgrp->waitcount);
> -	/* Perform a non-seekable open - llseek is not supported */
> -	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -
> -	return 0;
> -}
> -
> -static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
> -{
> -	struct rdtgroup *rdtgrp;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -	rdtgrp = filp->private_data;
> -	WARN_ON(!rdtgrp);
> -	if (!rdtgrp) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -ENODEV;
> -	}
> -	filp->private_data = NULL;
> -	atomic_dec(&rdtgrp->waitcount);
> -	mutex_unlock(&rdtgroup_mutex);
> -	return 0;
> -}
> -
> -static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
> -{
> -	/* Not supported */
> -	return -EINVAL;
> -}
> -
> -static const struct vm_operations_struct pseudo_mmap_ops = {
> -	.mremap = pseudo_lock_dev_mremap,
> -};
> -
> -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
> -{
> -	unsigned long vsize = vma->vm_end - vma->vm_start;
> -	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
> -	struct pseudo_lock_region *plr;
> -	struct rdtgroup *rdtgrp;
> -	unsigned long physical;
> -	unsigned long psize;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	rdtgrp = filp->private_data;
> -	WARN_ON(!rdtgrp);
> -	if (!rdtgrp) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -ENODEV;
> -	}
> -
> -	plr = rdtgrp->plr;
> -
> -	if (!plr->d) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -ENODEV;
> -	}
> -
> -	/*
> -	 * Task is required to run with affinity to the cpus associated
> -	 * with the pseudo-locked region. If this is not the case the task
> -	 * may be scheduled elsewhere and invalidate entries in the
> -	 * pseudo-locked region.
> -	 */
> -	if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -EINVAL;
> -	}
> -
> -	physical = __pa(plr->kmem) >> PAGE_SHIFT;
> -	psize = plr->size - off;
> -
> -	if (off > plr->size) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -ENOSPC;
> -	}
> -
> -	/*
> -	 * Ensure changes are carried directly to the memory being mapped,
> -	 * do not allow copy-on-write mapping.
> -	 */
> -	if (!(vma->vm_flags & VM_SHARED)) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -EINVAL;
> -	}
> -
> -	if (vsize > psize) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -ENOSPC;
> -	}
> -
> -	memset(plr->kmem + off, 0, vsize);
> -
> -	if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
> -			    vsize, vma->vm_page_prot)) {
> -		mutex_unlock(&rdtgroup_mutex);
> -		return -EAGAIN;
> -	}
> -	vma->vm_ops = &pseudo_mmap_ops;
> -	mutex_unlock(&rdtgroup_mutex);
> -	return 0;
> -}
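
(Aside, for illustration only, not part of the patch: a minimal sketch of how
user space consumes the character device, assuming a pseudo-locked group
called "newlock" and an assumed region size. The task must already be affine
to the CPUs of the cache instance, and the mapping must be MAP_SHARED, as
pseudo_lock_dev_mmap() above enforces.)

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t size = 256 * 1024;       /* assumed region size */
            void *mem;
            int fd;

            fd = open("/dev/pseudo_lock/newlock", O_RDWR);
            if (fd < 0) {
                    perror("open");
                    return EXIT_FAILURE;
            }

            mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (mem == MAP_FAILED) {
                    perror("mmap");
                    close(fd);
                    return EXIT_FAILURE;
            }

            /* ... place latency-sensitive data in mem ... */

            munmap(mem, size);
            close(fd);
            return 0;
    }
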
> -
> -static const struct file_operations pseudo_lock_dev_fops = {
> -	.owner =	THIS_MODULE,
> -	.llseek =	no_llseek,
> -	.read =		NULL,
> -	.write =	NULL,
> -	.open =		pseudo_lock_dev_open,
> -	.release =	pseudo_lock_dev_release,
> -	.mmap =		pseudo_lock_dev_mmap,
> -};
> -
> -int rdt_pseudo_lock_init(void)
> -{
> -	int ret;
> -
> -	ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
> -	if (ret < 0)
> -		return ret;
> -
> -	pseudo_lock_major = ret;
> -
> -	ret = class_register(&pseudo_lock_class);
> -	if (ret) {
> -		unregister_chrdev(pseudo_lock_major, "pseudo_lock");
> -		return ret;
> -	}
> -
> -	return 0;
> -}
> -
> -void rdt_pseudo_lock_release(void)
> -{
> -	class_unregister(&pseudo_lock_class);
> -	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
> -	pseudo_lock_major = 0;
> -}
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 1425a33d201d..fe3952514add 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -12,22 +12,8 @@
>  
>  #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
>  
> -#include <linux/cacheinfo.h>
>  #include <linux/cpu.h>
> -#include <linux/debugfs.h>
> -#include <linux/fs.h>
> -#include <linux/fs_parser.h>
> -#include <linux/sysfs.h>
> -#include <linux/kernfs.h>
> -#include <linux/seq_buf.h>
> -#include <linux/seq_file.h>
> -#include <linux/sched/signal.h>
> -#include <linux/sched/task.h>
>  #include <linux/slab.h>
> -#include <linux/task_work.h>
> -#include <linux/user_namespace.h>
> -
> -#include <uapi/linux/magic.h>
>  
>  #include <asm/resctrl.h>
>  #include "internal.h"
> @@ -36,328 +22,6 @@ DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
>  DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
>  DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
>  
> -/* Mutex to protect rdtgroup access. */
> -DEFINE_MUTEX(rdtgroup_mutex);
> -
> -static struct kernfs_root *rdt_root;
> -struct rdtgroup rdtgroup_default;
> -LIST_HEAD(rdt_all_groups);
> -
> -/* list of entries for the schemata file */
> -LIST_HEAD(resctrl_schema_all);
> -
> -/* The filesystem can only be mounted once. */
> -bool resctrl_mounted;
> -
> -/* Kernel fs node for "info" directory under root */
> -static struct kernfs_node *kn_info;
> -
> -/* Kernel fs node for "mon_groups" directory under root */
> -static struct kernfs_node *kn_mongrp;
> -
> -/* Kernel fs node for "mon_data" directory under root */
> -static struct kernfs_node *kn_mondata;
> -
> -/*
> - * Used to store the max resource name width and max resource data width
> - * to display the schemata in a tabular format
> - */
> -int max_name_width, max_data_width;
> -
> -static struct seq_buf last_cmd_status;
> -static char last_cmd_status_buf[512];
> -
> -static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
> -static void rdtgroup_destroy_root(void);
> -
> -struct dentry *debugfs_resctrl;
> -
> -static bool resctrl_debug;
> -
> -void rdt_last_cmd_clear(void)
> -{
> -	lockdep_assert_held(&rdtgroup_mutex);
> -	seq_buf_clear(&last_cmd_status);
> -}
> -
> -void rdt_last_cmd_puts(const char *s)
> -{
> -	lockdep_assert_held(&rdtgroup_mutex);
> -	seq_buf_puts(&last_cmd_status, s);
> -}
> -
> -void rdt_last_cmd_printf(const char *fmt, ...)
> -{
> -	va_list ap;
> -
> -	va_start(ap, fmt);
> -	lockdep_assert_held(&rdtgroup_mutex);
> -	seq_buf_vprintf(&last_cmd_status, fmt, ap);
> -	va_end(ap);
> -}
> -
> -void rdt_staged_configs_clear(void)
> -{
> -	enum resctrl_res_level i;
> -	struct rdt_resource *r;
> -	struct rdt_domain *dom;
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> -		r = resctrl_arch_get_resource(i);
> -		if (!r->alloc_capable)
> -			continue;
> -
> -		list_for_each_entry(dom, &r->domains, list)
> -			memset(dom->staged_config, 0, sizeof(dom->staged_config));
> -	}
> -}
> -
> -static bool resctrl_is_mbm_enabled(void)
> -{
> -	return (resctrl_arch_is_mbm_total_enabled() ||
> -		resctrl_arch_is_mbm_local_enabled());
> -}
> -
> -static bool resctrl_is_mbm_event(int e)
> -{
> -	return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
> -		e <= QOS_L3_MBM_LOCAL_EVENT_ID);
> -}
> -
> -/*
> - * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
> - * we can keep a bitmap of free CLOSIDs in a single integer.
> - *
> - * Using a global CLOSID across all resources has some advantages and
> - * some drawbacks:
> - * + We can simply set current's closid to assign a task to a resource
> - *   group.
> - * + Context switch code can avoid extra memory references deciding which
> - *   CLOSID to load into the PQR_ASSOC MSR
> - * - We give up some options in configuring resource groups across multi-socket
> - *   systems.
> - * - Our choices on how to configure each resource become progressively more
> - *   limited as the number of resources grows.
> - */
> -static unsigned long closid_free_map;
> -static int closid_free_map_len;
> -
> -int closids_supported(void)
> -{
> -	return closid_free_map_len;
> -}
> -
> -static void closid_init(void)
> -{
> -	struct resctrl_schema *s;
> -	u32 rdt_min_closid = 32;
> -
> -	/* Compute rdt_min_closid across all resources */
> -	list_for_each_entry(s, &resctrl_schema_all, list)
> -		rdt_min_closid = min(rdt_min_closid, s->num_closid);
> -
> -	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
> -
> -	/* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
> -	__clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
> -	closid_free_map_len = rdt_min_closid;
> -}
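
(Aside, for illustration with assumed numbers: if the narrowest schema
reports 16 CLOSIDs, closid_free_map becomes BIT_MASK(16) - 1 = 0xffff.
Clearing RESCTRL_RESERVED_CLOSID (bit 0) for the default group leaves
0xfffe, i.e. 15 CLOSIDs available to user-created control groups, while
closids_supported() still reports 16.)
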
> -
> -static int closid_alloc(void)
> -{
> -	int cleanest_closid;
> -	u32 closid;
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
> -	    resctrl_arch_is_llc_occupancy_enabled()) {
> -		cleanest_closid = resctrl_find_cleanest_closid();
> -		if (cleanest_closid < 0)
> -			return cleanest_closid;
> -		closid = cleanest_closid;
> -	} else {
> -		closid = ffs(closid_free_map);
> -		if (closid == 0)
> -			return -ENOSPC;
> -		closid--;
> -	}
> -	__clear_bit(closid, &closid_free_map);
> -
> -	return closid;
> -}
> -
> -void closid_free(int closid)
> -{
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	__set_bit(closid, &closid_free_map);
> -}
> -
> -/**
> - * closid_allocated - test if provided closid is in use
> - * @closid: closid to be tested
> - *
> - * Return: true if @closid is currently associated with a resource group,
> - * false if @closid is free
> - */
> -bool closid_allocated(unsigned int closid)
> -{
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	return !test_bit(closid, &closid_free_map);
> -}
> -
> -/**
> - * rdtgroup_mode_by_closid - Return mode of resource group with closid
> - * @closid: closid of the resource group
> - *
> - * Each resource group is associated with a @closid. Here the mode
> - * of a resource group can be queried by searching for it using its closid.
> - *
> - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
> - */
> -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
> -{
> -	struct rdtgroup *rdtgrp;
> -
> -	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> -		if (rdtgrp->closid == closid)
> -			return rdtgrp->mode;
> -	}
> -
> -	return RDT_NUM_MODES;
> -}
> -
> -static const char * const rdt_mode_str[] = {
> -	[RDT_MODE_SHAREABLE]		= "shareable",
> -	[RDT_MODE_EXCLUSIVE]		= "exclusive",
> -	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
> -	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
> -};
> -
> -/**
> - * rdtgroup_mode_str - Return the string representation of mode
> - * @mode: the resource group mode as &enum rdtgroup_mode
> - *
> - * Return: string representation of valid mode, "unknown" otherwise
> - */
> -static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
> -{
> -	if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
> -		return "unknown";
> -
> -	return rdt_mode_str[mode];
> -}
> -
> -/* set uid and gid of rdtgroup dirs and files to that of the creator */
> -static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
> -{
> -	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
> -				.ia_uid = current_fsuid(),
> -				.ia_gid = current_fsgid(), };
> -
> -	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
> -	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
> -		return 0;
> -
> -	return kernfs_setattr(kn, &iattr);
> -}
> -
> -static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
> -{
> -	struct kernfs_node *kn;
> -	int ret;
> -
> -	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
> -				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
> -				  0, rft->kf_ops, rft, NULL, NULL);
> -	if (IS_ERR(kn))
> -		return PTR_ERR(kn);
> -
> -	ret = rdtgroup_kn_set_ugid(kn);
> -	if (ret) {
> -		kernfs_remove(kn);
> -		return ret;
> -	}
> -
> -	return 0;
> -}
> -
> -static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
> -{
> -	struct kernfs_open_file *of = m->private;
> -	struct rftype *rft = of->kn->priv;
> -
> -	if (rft->seq_show)
> -		return rft->seq_show(of, m, arg);
> -	return 0;
> -}
> -
> -static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
> -				   size_t nbytes, loff_t off)
> -{
> -	struct rftype *rft = of->kn->priv;
> -
> -	if (rft->write)
> -		return rft->write(of, buf, nbytes, off);
> -
> -	return -EINVAL;
> -}
> -
> -static const struct kernfs_ops rdtgroup_kf_single_ops = {
> -	.atomic_write_len	= PAGE_SIZE,
> -	.write			= rdtgroup_file_write,
> -	.seq_show		= rdtgroup_seqfile_show,
> -};
> -
> -static const struct kernfs_ops kf_mondata_ops = {
> -	.atomic_write_len	= PAGE_SIZE,
> -	.seq_show		= rdtgroup_mondata_show,
> -};
> -
> -static bool is_cpu_list(struct kernfs_open_file *of)
> -{
> -	struct rftype *rft = of->kn->priv;
> -
> -	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
> -}
> -
> -static int rdtgroup_cpus_show(struct kernfs_open_file *of,
> -			      struct seq_file *s, void *v)
> -{
> -	struct rdtgroup *rdtgrp;
> -	struct cpumask *mask;
> -	int ret = 0;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -
> -	if (rdtgrp) {
> -		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> -			if (!rdtgrp->plr->d) {
> -				rdt_last_cmd_clear();
> -				rdt_last_cmd_puts("Cache domain offline\n");
> -				ret = -ENODEV;
> -			} else {
> -				mask = &rdtgrp->plr->d->cpu_mask;
> -				seq_printf(s, is_cpu_list(of) ?
> -					   "%*pbl\n" : "%*pb\n",
> -					   cpumask_pr_args(mask));
> -			}
> -		} else {
> -			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
> -				   cpumask_pr_args(&rdtgrp->cpu_mask));
> -		}
> -	} else {
> -		ret = -ENOENT;
> -	}
> -	rdtgroup_kn_unlock(of->kn);
> -
> -	return ret;
> -}
> -
>  /*
>   * This is safe against resctrl_arch_sched_in() called from __switch_to()
>   * because __switch_to() is executed with interrupts disabled. A local call
> @@ -381,1206 +45,6 @@ void resctrl_arch_sync_cpu_defaults(void *info)
>  	resctrl_arch_sched_in(current);
>  }
>  
> -/*
> - * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
> - *
> - * Per task closids/rmids must have been set up before calling this function.
> - * @r may be NULL.
> - */
> -static void
> -update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
> -{
> -	struct resctrl_cpu_sync defaults;
> -	struct resctrl_cpu_sync *defaults_p = NULL;
> -
> -	if (r) {
> -		defaults.closid = r->closid;
> -		defaults.rmid = r->mon.rmid;
> -		defaults_p = &defaults;
> -	}
> -
> -	on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_defaults, defaults_p,
> -			 1);
> -}
> -
> -static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
> -			  cpumask_var_t tmpmask)
> -{
> -	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
> -	struct list_head *head;
> -
> -	/* Check whether cpus belong to parent ctrl group */
> -	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
> -	if (!cpumask_empty(tmpmask)) {
> -		rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
> -		return -EINVAL;
> -	}
> -
> -	/* Check whether cpus are dropped from this group */
> -	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
> -	if (!cpumask_empty(tmpmask)) {
> -		/* Give any dropped cpus to parent rdtgroup */
> -		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
> -		update_closid_rmid(tmpmask, prgrp);
> -	}
> -
> -	/*
> -	 * If we added cpus, remove them from previous group that owned them
> -	 * and update per-cpu rmid
> -	 */
> -	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
> -	if (!cpumask_empty(tmpmask)) {
> -		head = &prgrp->mon.crdtgrp_list;
> -		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
> -			if (crgrp == rdtgrp)
> -				continue;
> -			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
> -				       tmpmask);
> -		}
> -		update_closid_rmid(tmpmask, rdtgrp);
> -	}
> -
> -	/* Done pushing/pulling - update this group with new mask */
> -	cpumask_copy(&rdtgrp->cpu_mask, newmask);
> -
> -	return 0;
> -}
> -
> -static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
> -{
> -	struct rdtgroup *crgrp;
> -
> -	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
> -	/* update the child mon group masks as well */
> -	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
> -		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
> -}
> -
> -static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
> -			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
> -{
> -	struct rdtgroup *r, *crgrp;
> -	struct list_head *head;
> -
> -	/* Check whether cpus are dropped from this group */
> -	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
> -	if (!cpumask_empty(tmpmask)) {
> -		/* Can't drop from default group */
> -		if (rdtgrp == &rdtgroup_default) {
> -			rdt_last_cmd_puts("Can't drop CPUs from default group\n");
> -			return -EINVAL;
> -		}
> -
> -		/* Give any dropped cpus to rdtgroup_default */
> -		cpumask_or(&rdtgroup_default.cpu_mask,
> -			   &rdtgroup_default.cpu_mask, tmpmask);
> -		update_closid_rmid(tmpmask, &rdtgroup_default);
> -	}
> -
> -	/*
> -	 * If we added cpus, remove them from previous group and
> -	 * the prev group's child groups that owned them
> -	 * and update per-cpu closid/rmid.
> -	 */
> -	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
> -	if (!cpumask_empty(tmpmask)) {
> -		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
> -			if (r == rdtgrp)
> -				continue;
> -			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
> -			if (!cpumask_empty(tmpmask1))
> -				cpumask_rdtgrp_clear(r, tmpmask1);
> -		}
> -		update_closid_rmid(tmpmask, rdtgrp);
> -	}
> -
> -	/* Done pushing/pulling - update this group with new mask */
> -	cpumask_copy(&rdtgrp->cpu_mask, newmask);
> -
> -	/*
> -	 * Clear child mon group masks since there is a new parent mask
> -	 * now and update the rmid for the cpus the child lost.
> -	 */
> -	head = &rdtgrp->mon.crdtgrp_list;
> -	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
> -		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
> -		update_closid_rmid(tmpmask, rdtgrp);
> -		cpumask_clear(&crgrp->cpu_mask);
> -	}
> -
> -	return 0;
> -}
> -
> -static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
> -				   char *buf, size_t nbytes, loff_t off)
> -{
> -	cpumask_var_t tmpmask, newmask, tmpmask1;
> -	struct rdtgroup *rdtgrp;
> -	int ret;
> -
> -	if (!buf)
> -		return -EINVAL;
> -
> -	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
> -		return -ENOMEM;
> -	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
> -		free_cpumask_var(tmpmask);
> -		return -ENOMEM;
> -	}
> -	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
> -		free_cpumask_var(tmpmask);
> -		free_cpumask_var(newmask);
> -		return -ENOMEM;
> -	}
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (!rdtgrp) {
> -		ret = -ENOENT;
> -		goto unlock;
> -	}
> -
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
> -	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -		ret = -EINVAL;
> -		rdt_last_cmd_puts("Pseudo-locking in progress\n");
> -		goto unlock;
> -	}
> -
> -	if (is_cpu_list(of))
> -		ret = cpulist_parse(buf, newmask);
> -	else
> -		ret = cpumask_parse(buf, newmask);
> -
> -	if (ret) {
> -		rdt_last_cmd_puts("Bad CPU list/mask\n");
> -		goto unlock;
> -	}
> -
> -	/* check that user didn't specify any offline cpus */
> -	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
> -	if (!cpumask_empty(tmpmask)) {
> -		ret = -EINVAL;
> -		rdt_last_cmd_puts("Can only assign online CPUs\n");
> -		goto unlock;
> -	}
> -
> -	if (rdtgrp->type == RDTCTRL_GROUP)
> -		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
> -	else if (rdtgrp->type == RDTMON_GROUP)
> -		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
> -	else
> -		ret = -EINVAL;
> -
> -unlock:
> -	rdtgroup_kn_unlock(of->kn);
> -	free_cpumask_var(tmpmask);
> -	free_cpumask_var(newmask);
> -	free_cpumask_var(tmpmask1);
> -
> -	return ret ?: nbytes;
> -}
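
(Usage aside, for illustration; the group name and CPU numbers are made up:
writing "3-5" to /sys/fs/resctrl/p0/cpus_list moves CPUs 3-5 into group p0,
and any CPUs dropped from p0's old mask are handed back to the default
group, as cpus_ctrl_write() above implements.)
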
> -
> -/**
> - * rdtgroup_remove - the helper to remove resource group safely
> - * @rdtgrp: resource group to remove
> - *
> - * On resource group creation via a mkdir, an extra kernfs_node reference is
> - * taken to ensure that the rdtgroup structure remains accessible for the
> - * rdtgroup_kn_unlock() calls where it is removed.
> - *
> - * Drop the extra reference here, then free the rdtgroup structure.
> - *
> - * Return: void
> - */
> -static void rdtgroup_remove(struct rdtgroup *rdtgrp)
> -{
> -	kernfs_put(rdtgrp->kn);
> -	kfree(rdtgrp);
> -}
> -
> -static void _update_task_closid_rmid(void *task)
> -{
> -	/*
> -	 * If the task is still current on this CPU, update PQR_ASSOC MSR.
> -	 * Otherwise, the MSR is updated when the task is scheduled in.
> -	 */
> -	if (task == current)
> -		resctrl_arch_sched_in(task);
> -}
> -
> -static void update_task_closid_rmid(struct task_struct *t)
> -{
> -	if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
> -		smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
> -	else
> -		_update_task_closid_rmid(t);
> -}
> -
> -static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
> -{
> -	u32 closid, rmid = rdtgrp->mon.rmid;
> -
> -	if (rdtgrp->type == RDTCTRL_GROUP)
> -		closid = rdtgrp->closid;
> -	else if (rdtgrp->type == RDTMON_GROUP)
> -		closid = rdtgrp->mon.parent->closid;
> -	else
> -		return false;
> -
> -	return resctrl_arch_match_closid(tsk, closid) &&
> -	       resctrl_arch_match_rmid(tsk, closid, rmid);
> -}
> -
> -static int __rdtgroup_move_task(struct task_struct *tsk,
> -				struct rdtgroup *rdtgrp)
> -{
> -	/* If the task is already in rdtgrp, no need to move the task. */
> -	if (task_in_rdtgroup(tsk, rdtgrp))
> -		return 0;
> -
> -	/*
> -	 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
> -	 * updated by them.
> -	 *
> -	 * For ctrl_mon groups, move both closid and rmid.
> -	 * For monitor groups, tasks can only be moved from
> -	 * their parent CTRL group.
> -	 */
> -	if (rdtgrp->type == RDTMON_GROUP &&
> -	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
> -		rdt_last_cmd_puts("Can't move task to different control group\n");
> -		return -EINVAL;
> -	}
> -
> -	if (rdtgrp->type == RDTMON_GROUP)
> -		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
> -					     rdtgrp->mon.rmid);
> -	else
> -		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
> -					     rdtgrp->mon.rmid);
> -
> -	/*
> -	 * Ensure the task's closid and rmid are written before determining
> -	 * whether the task is current, which decides if it will be interrupted.
> -	 * This pairs with the full barrier between the rq->curr update and
> -	 * resctrl_arch_sched_in() during context switch.
> -	 */
> -	smp_mb();
> -
> -	/*
> -	 * By now, the task's closid and rmid are set. If the task is current
> -	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
> -	 * group go into effect. If the task is not current, the MSR will be
> -	 * updated when the task is scheduled in.
> -	 */
> -	update_task_closid_rmid(tsk);
> -
> -	return 0;
> -}
> -
> -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
> -{
> -	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
> -		resctrl_arch_match_closid(t, r->closid));
> -}
> -
> -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
> -{
> -	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
> -		resctrl_arch_match_rmid(t, r->mon.parent->closid,
> -					r->mon.rmid));
> -}
> -
> -/**
> - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
> - * @r: Resource group
> - *
> - * Return: 1 if tasks have been assigned to @r, 0 otherwise
> - */
> -int rdtgroup_tasks_assigned(struct rdtgroup *r)
> -{
> -	struct task_struct *p, *t;
> -	int ret = 0;
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	rcu_read_lock();
> -	for_each_process_thread(p, t) {
> -		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
> -			ret = 1;
> -			break;
> -		}
> -	}
> -	rcu_read_unlock();
> -
> -	return ret;
> -}
> -
> -static int rdtgroup_task_write_permission(struct task_struct *task,
> -					  struct kernfs_open_file *of)
> -{
> -	const struct cred *tcred = get_task_cred(task);
> -	const struct cred *cred = current_cred();
> -	int ret = 0;
> -
> -	/*
> -	 * Even if we're attaching all tasks in the thread group, we only
> -	 * need to check permissions on one of them.
> -	 */
> -	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
> -	    !uid_eq(cred->euid, tcred->uid) &&
> -	    !uid_eq(cred->euid, tcred->suid)) {
> -		rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
> -		ret = -EPERM;
> -	}
> -
> -	put_cred(tcred);
> -	return ret;
> -}
> -
> -static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
> -			      struct kernfs_open_file *of)
> -{
> -	struct task_struct *tsk;
> -	int ret;
> -
> -	rcu_read_lock();
> -	if (pid) {
> -		tsk = find_task_by_vpid(pid);
> -		if (!tsk) {
> -			rcu_read_unlock();
> -			rdt_last_cmd_printf("No task %d\n", pid);
> -			return -ESRCH;
> -		}
> -	} else {
> -		tsk = current;
> -	}
> -
> -	get_task_struct(tsk);
> -	rcu_read_unlock();
> -
> -	ret = rdtgroup_task_write_permission(tsk, of);
> -	if (!ret)
> -		ret = __rdtgroup_move_task(tsk, rdtgrp);
> -
> -	put_task_struct(tsk);
> -	return ret;
> -}
> -
> -static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
> -				    char *buf, size_t nbytes, loff_t off)
> -{
> -	struct rdtgroup *rdtgrp;
> -	char *pid_str;
> -	int ret = 0;
> -	pid_t pid;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (!rdtgrp) {
> -		rdtgroup_kn_unlock(of->kn);
> -		return -ENOENT;
> -	}
> -	rdt_last_cmd_clear();
> -
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
> -	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -		ret = -EINVAL;
> -		rdt_last_cmd_puts("Pseudo-locking in progress\n");
> -		goto unlock;
> -	}
> -
> -	while (buf && buf[0] != '\0' && buf[0] != '\n') {
> -		pid_str = strim(strsep(&buf, ","));
> -
> -		if (kstrtoint(pid_str, 0, &pid)) {
> -			rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
> -			ret = -EINVAL;
> -			break;
> -		}
> -
> -		if (pid < 0) {
> -			rdt_last_cmd_printf("Invalid pid %d\n", pid);
> -			ret = -EINVAL;
> -			break;
> -		}
> -
> -		ret = rdtgroup_move_task(pid, rdtgrp, of);
> -		if (ret) {
> -			rdt_last_cmd_printf("Error while processing task %d\n", pid);
> -			break;
> -		}
> -	}
> -
> -unlock:
> -	rdtgroup_kn_unlock(of->kn);
> -
> -	return ret ?: nbytes;
> -}
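
(Usage aside, for illustration; the PID is made up: writing "1234" to a
group's tasks file moves that task into the group by updating its
closid/rmid, and a comma-separated list of PIDs moves several tasks in one
write, stopping at the first failure as the loop above shows.)
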
> -
> -static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
> -{
> -	struct task_struct *p, *t;
> -	pid_t pid;
> -
> -	rcu_read_lock();
> -	for_each_process_thread(p, t) {
> -		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
> -			pid = task_pid_vnr(t);
> -			if (pid)
> -				seq_printf(s, "%d\n", pid);
> -		}
> -	}
> -	rcu_read_unlock();
> -}
> -
> -static int rdtgroup_tasks_show(struct kernfs_open_file *of,
> -			       struct seq_file *s, void *v)
> -{
> -	struct rdtgroup *rdtgrp;
> -	int ret = 0;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (rdtgrp)
> -		show_rdt_tasks(rdtgrp, s);
> -	else
> -		ret = -ENOENT;
> -	rdtgroup_kn_unlock(of->kn);
> -
> -	return ret;
> -}
> -
> -static int rdtgroup_closid_show(struct kernfs_open_file *of,
> -				struct seq_file *s, void *v)
> -{
> -	struct rdtgroup *rdtgrp;
> -	int ret = 0;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (rdtgrp)
> -		seq_printf(s, "%u\n", rdtgrp->closid);
> -	else
> -		ret = -ENOENT;
> -	rdtgroup_kn_unlock(of->kn);
> -
> -	return ret;
> -}
> -
> -static int rdtgroup_rmid_show(struct kernfs_open_file *of,
> -			      struct seq_file *s, void *v)
> -{
> -	struct rdtgroup *rdtgrp;
> -	int ret = 0;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (rdtgrp)
> -		seq_printf(s, "%u\n", rdtgrp->mon.rmid);
> -	else
> -		ret = -ENOENT;
> -	rdtgroup_kn_unlock(of->kn);
> -
> -	return ret;
> -}
> -
> -#ifdef CONFIG_PROC_CPU_RESCTRL
> -
> -/*
> - * A task can only be part of one resctrl control group and of one monitor
> - * group which is associated to that control group.
> - *
> - * 1)   res:
> - *      mon:
> - *
> - *    resctrl is not available.
> - *
> - * 2)   res:/
> - *      mon:
> - *
> - *    Task is part of the root resctrl control group, and it is not associated
> - *    to any monitor group.
> - *
> - * 3)  res:/
> - *     mon:mon0
> - *
> - *    Task is part of the root resctrl control group and monitor group mon0.
> - *
> - * 4)  res:group0
> - *     mon:
> - *
> - *    Task is part of resctrl control group group0, and it is not associated
> - *    to any monitor group.
> - *
> - * 5) res:group0
> - *    mon:mon1
> - *
> - *    Task is part of resctrl control group group0 and monitor group mon1.
> - */
> -int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
> -		      struct pid *pid, struct task_struct *tsk)
> -{
> -	struct rdtgroup *rdtg;
> -	int ret = 0;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	/* Return empty if resctrl has not been mounted. */
> -	if (!resctrl_mounted) {
> -		seq_puts(s, "res:\nmon:\n");
> -		goto unlock;
> -	}
> -
> -	list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
> -		struct rdtgroup *crg;
> -
> -		/*
> -		 * Task information is only relevant for shareable
> -		 * and exclusive groups.
> -		 */
> -		if (rdtg->mode != RDT_MODE_SHAREABLE &&
> -		    rdtg->mode != RDT_MODE_EXCLUSIVE)
> -			continue;
> -
> -		if (!resctrl_arch_match_closid(tsk, rdtg->closid))
> -			continue;
> -
> -		seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
> -			   rdtg->kn->name);
> -		seq_puts(s, "mon:");
> -		list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
> -				    mon.crdtgrp_list) {
> -			if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
> -						     crg->mon.rmid))
> -				continue;
> -			seq_printf(s, "%s", crg->kn->name);
> -			break;
> -		}
> -		seq_putc(s, '\n');
> -		goto unlock;
> -	}
> -	/*
> -	 * The above search should succeed. Otherwise return
> -	 * with an error.
> -	 */
> -	ret = -ENOENT;
> -unlock:
> -	mutex_unlock(&rdtgroup_mutex);
> -
> -	return ret;
> -}
> -#endif
> -
> -static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
> -				    struct seq_file *seq, void *v)
> -{
> -	int len;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -	len = seq_buf_used(&last_cmd_status);
> -	if (len)
> -		seq_printf(seq, "%.*s", len, last_cmd_status_buf);
> -	else
> -		seq_puts(seq, "ok\n");
> -	mutex_unlock(&rdtgroup_mutex);
> -	return 0;
> -}
> -
> -static int rdt_num_closids_show(struct kernfs_open_file *of,
> -				struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -
> -	seq_printf(seq, "%u\n", s->num_closid);
> -	return 0;
> -}
> -
> -static int rdt_default_ctrl_show(struct kernfs_open_file *of,
> -			     struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	seq_printf(seq, "%x\n", r->default_ctrl);
> -	return 0;
> -}
> -
> -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
> -			     struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
> -	return 0;
> -}
> -
> -static int rdt_shareable_bits_show(struct kernfs_open_file *of,
> -				   struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	seq_printf(seq, "%x\n", r->cache.shareable_bits);
> -	return 0;
> -}
> -
> -/*
> - * rdt_bit_usage_show - Display current usage of resources
> - *
> - * A domain is a shared resource that can now be allocated differently. Here
> - * we display the current regions of the domain as an annotated bitmask.
> - * For each domain of this resource its allocation bitmask
> - * is annotated as below to indicate the current usage of the corresponding bit:
> - *   0 - currently unused
> - *   X - currently available for sharing and used by software and hardware
> - *   H - currently used by hardware only but available for software use
> - *   S - currently used and shareable by software only
> - *   E - currently used exclusively by one resource group
> - *   P - currently pseudo-locked by one resource group
> - */
> -static int rdt_bit_usage_show(struct kernfs_open_file *of,
> -			      struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	/*
> -	 * Use unsigned long even though only 32 bits are used to ensure
> -	 * test_bit() is used safely.
> -	 */
> -	unsigned long sw_shareable = 0, hw_shareable = 0;
> -	unsigned long exclusive = 0, pseudo_locked = 0;
> -	struct rdt_resource *r = s->res;
> -	struct rdt_domain *dom;
> -	int i, hwb, swb, excl, psl;
> -	enum rdtgrp_mode mode;
> -	bool sep = false;
> -	u32 ctrl_val;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -	hw_shareable = r->cache.shareable_bits;
> -	list_for_each_entry(dom, &r->domains, list) {
> -		if (sep)
> -			seq_putc(seq, ';');
> -		sw_shareable = 0;
> -		exclusive = 0;
> -		seq_printf(seq, "%d=", dom->id);
> -		for (i = 0; i < closids_supported(); i++) {
> -			if (!closid_allocated(i))
> -				continue;
> -			ctrl_val = resctrl_arch_get_config(r, dom, i,
> -							   s->conf_type);
> -			mode = rdtgroup_mode_by_closid(i);
> -			switch (mode) {
> -			case RDT_MODE_SHAREABLE:
> -				sw_shareable |= ctrl_val;
> -				break;
> -			case RDT_MODE_EXCLUSIVE:
> -				exclusive |= ctrl_val;
> -				break;
> -			case RDT_MODE_PSEUDO_LOCKSETUP:
> -			/*
> -			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
> -			 * here but not included since the CBM
> -			 * associated with this CLOSID in this mode
> -			 * is not initialized and no task or cpu can be
> -			 * assigned this CLOSID.
> -			 */
> -				break;
> -			case RDT_MODE_PSEUDO_LOCKED:
> -			case RDT_NUM_MODES:
> -				WARN(1,
> -				     "invalid mode for closid %d\n", i);
> -				break;
> -			}
> -		}
> -		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
> -			pseudo_locked = dom->plr ? dom->plr->cbm : 0;
> -			hwb = test_bit(i, &hw_shareable);
> -			swb = test_bit(i, &sw_shareable);
> -			excl = test_bit(i, &exclusive);
> -			psl = test_bit(i, &pseudo_locked);
> -			if (hwb && swb)
> -				seq_putc(seq, 'X');
> -			else if (hwb && !swb)
> -				seq_putc(seq, 'H');
> -			else if (!hwb && swb)
> -				seq_putc(seq, 'S');
> -			else if (excl)
> -				seq_putc(seq, 'E');
> -			else if (psl)
> -				seq_putc(seq, 'P');
> -			else /* Unused bits remain */
> -				seq_putc(seq, '0');
> -		}
> -		sep = true;
> -	}
> -	seq_putc(seq, '\n');
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -	return 0;
> -}
> -
> -static int rdt_min_bw_show(struct kernfs_open_file *of,
> -			     struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	seq_printf(seq, "%u\n", r->membw.min_bw);
> -	return 0;
> -}
> -
> -static int rdt_num_rmids_show(struct kernfs_open_file *of,
> -			      struct seq_file *seq, void *v)
> -{
> -	struct rdt_resource *r = of->kn->parent->priv;
> -
> -	seq_printf(seq, "%d\n", r->num_rmid);
> -
> -	return 0;
> -}
> -
> -static int rdt_mon_features_show(struct kernfs_open_file *of,
> -				 struct seq_file *seq, void *v)
> -{
> -	struct rdt_resource *r = of->kn->parent->priv;
> -	struct mon_evt *mevt;
> -
> -	list_for_each_entry(mevt, &r->evt_list, list) {
> -		seq_printf(seq, "%s\n", mevt->name);
> -		if (mevt->configurable)
> -			seq_printf(seq, "%s_config\n", mevt->name);
> -	}
> -
> -	return 0;
> -}
> -
> -static int rdt_bw_gran_show(struct kernfs_open_file *of,
> -			     struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	seq_printf(seq, "%u\n", r->membw.bw_gran);
> -	return 0;
> -}
> -
> -static int rdt_delay_linear_show(struct kernfs_open_file *of,
> -			     struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	seq_printf(seq, "%u\n", r->membw.delay_linear);
> -	return 0;
> -}
> -
> -static int max_threshold_occ_show(struct kernfs_open_file *of,
> -				  struct seq_file *seq, void *v)
> -{
> -	seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
> -
> -	return 0;
> -}
> -
> -static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
> -					 struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
> -		seq_puts(seq, "per-thread\n");
> -	else
> -		seq_puts(seq, "max\n");
> -
> -	return 0;
> -}
> -
> -static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
> -				       char *buf, size_t nbytes, loff_t off)
> -{
> -	unsigned int bytes;
> -	int ret;
> -
> -	ret = kstrtouint(buf, 0, &bytes);
> -	if (ret)
> -		return ret;
> -
> -	if (bytes > resctrl_rmid_realloc_limit)
> -		return -EINVAL;
> -
> -	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
> -
> -	return nbytes;
> -}
> -
> -/*
> - * rdtgroup_mode_show - Display mode of this resource group
> - */
> -static int rdtgroup_mode_show(struct kernfs_open_file *of,
> -			      struct seq_file *s, void *v)
> -{
> -	struct rdtgroup *rdtgrp;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (!rdtgrp) {
> -		rdtgroup_kn_unlock(of->kn);
> -		return -ENOENT;
> -	}
> -
> -	seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
> -
> -	rdtgroup_kn_unlock(of->kn);
> -	return 0;
> -}
> -
> -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
> -{
> -	switch (my_type) {
> -	case CDP_CODE:
> -		return CDP_DATA;
> -	case CDP_DATA:
> -		return CDP_CODE;
> -	default:
> -	case CDP_NONE:
> -		return CDP_NONE;
> -	}
> -}
> -
> -static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
> -					struct seq_file *seq, void *v)
> -{
> -	struct resctrl_schema *s = of->kn->parent->priv;
> -	struct rdt_resource *r = s->res;
> -
> -	seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
> -
> -	return 0;
> -}
> -
> -/**
> - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
> - * @r: Resource to which domain instance @d belongs.
> - * @d: The domain instance for which @closid is being tested.
> - * @cbm: Capacity bitmask being tested.
> - * @closid: Intended closid for @cbm.
> - * @type: CDP type of @r.
> - * @exclusive: Only check if overlaps with exclusive resource groups
> - *
> - * Checks if provided @cbm intended to be used for @closid on domain
> - * @d overlaps with any other closids or other hardware usage associated
> - * with this domain. If @exclusive is true then only overlaps with
> - * resource groups in exclusive mode will be considered. If @exclusive
> - * is false then overlaps with any resource group or hardware entities
> - * will be considered.
> - *
> - * @cbm is unsigned long, even if only 32 bits are used, to make the
> - * bitmap functions work correctly.
> - *
> - * Return: false if CBM does not overlap, true if it does.
> - */
> -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
> -				    unsigned long cbm, int closid,
> -				    enum resctrl_conf_type type, bool exclusive)
> -{
> -	enum rdtgrp_mode mode;
> -	unsigned long ctrl_b;
> -	int i;
> -
> -	/* Check for any overlap with regions used by hardware directly */
> -	if (!exclusive) {
> -		ctrl_b = r->cache.shareable_bits;
> -		if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
> -			return true;
> -	}
> -
> -	/* Check for overlap with other resource groups */
> -	for (i = 0; i < closids_supported(); i++) {
> -		ctrl_b = resctrl_arch_get_config(r, d, i, type);
> -		mode = rdtgroup_mode_by_closid(i);
> -		if (closid_allocated(i) && i != closid &&
> -		    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
> -			if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
> -				if (exclusive) {
> -					if (mode == RDT_MODE_EXCLUSIVE)
> -						return true;
> -					continue;
> -				}
> -				return true;
> -			}
> -		}
> -	}
> -
> -	return false;
> -}
> -
> -/**
> - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
> - * @s: Schema for the resource to which domain instance @d belongs.
> - * @d: The domain instance for which @closid is being tested.
> - * @cbm: Capacity bitmask being tested.
> - * @closid: Intended closid for @cbm.
> - * @exclusive: Only check if overlaps with exclusive resource groups
> - *
> - * Resources that can be allocated using a CBM can use the CBM to control
> - * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
> - * for overlap. Overlap test is not limited to the specific resource for
> - * which the CBM is intended though - when dealing with CDP resources that
> - * share the underlying hardware the overlap check should be performed on
> - * the CDP resource sharing the hardware also.
> - *
> - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
> - * overlap test.
> - *
> - * Return: true if CBM overlap detected, false if there is no overlap
> - */
> -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
> -			   unsigned long cbm, int closid, bool exclusive)
> -{
> -	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
> -	struct rdt_resource *r = s->res;
> -
> -	if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
> -				    exclusive))
> -		return true;
> -
> -	if (!resctrl_arch_get_cdp_enabled(r->rid))
> -		return false;
> -	return  __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
> -}
> -
> -/**
> - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
> - * @rdtgrp: Resource group identified through its closid.
> - *
> - * An exclusive resource group implies that there should be no sharing of
> - * its allocated resources. At the time this group is considered to be
> - * exclusive this test can determine if its current schemata supports this
> - * setting by testing for overlap with all other resource groups.
> - *
> - * Return: true if resource group can be exclusive, false if there is overlap
> - * with allocations of other resource groups and thus this resource group
> - * cannot be exclusive.
> - */
> -static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
> -{
> -	int closid = rdtgrp->closid;
> -	struct resctrl_schema *s;
> -	struct rdt_resource *r;
> -	bool has_cache = false;
> -	struct rdt_domain *d;
> -	u32 ctrl;
> -
> -	/* Walking r->domains, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -	list_for_each_entry(s, &resctrl_schema_all, list) {
> -		r = s->res;
> -		if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
> -			continue;
> -		has_cache = true;
> -		list_for_each_entry(d, &r->domains, list) {
> -			ctrl = resctrl_arch_get_config(r, d, closid,
> -						       s->conf_type);
> -			if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
> -				rdt_last_cmd_puts("Schemata overlaps\n");
> -				return false;
> -			}
> -		}
> -	}
> -
> -	if (!has_cache) {
> -		rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
> -		return false;
> -	}
> -
> -	return true;
> -}
> -
> -/*
> - * rdtgroup_mode_write - Modify the resource group's mode
> - */
> -static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
> -				   char *buf, size_t nbytes, loff_t off)
> -{
> -	struct rdtgroup *rdtgrp;
> -	enum rdtgrp_mode mode;
> -	int ret = 0;
> -
> -	/* Valid input requires a trailing newline */
> -	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> -		return -EINVAL;
> -	buf[nbytes - 1] = '\0';
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (!rdtgrp) {
> -		rdtgroup_kn_unlock(of->kn);
> -		return -ENOENT;
> -	}
> -
> -	rdt_last_cmd_clear();
> -
> -	mode = rdtgrp->mode;
> -
> -	if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
> -	    (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
> -	    (!strcmp(buf, "pseudo-locksetup") &&
> -	     mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
> -	    (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
> -		goto out;
> -
> -	if (mode == RDT_MODE_PSEUDO_LOCKED) {
> -		rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
> -		ret = -EINVAL;
> -		goto out;
> -	}
> -
> -	if (!strcmp(buf, "shareable")) {
> -		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -			ret = rdtgroup_locksetup_exit(rdtgrp);
> -			if (ret)
> -				goto out;
> -		}
> -		rdtgrp->mode = RDT_MODE_SHAREABLE;
> -	} else if (!strcmp(buf, "exclusive")) {
> -		if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
> -			ret = -EINVAL;
> -			goto out;
> -		}
> -		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -			ret = rdtgroup_locksetup_exit(rdtgrp);
> -			if (ret)
> -				goto out;
> -		}
> -		rdtgrp->mode = RDT_MODE_EXCLUSIVE;
> -	} else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
> -		   !strcmp(buf, "pseudo-locksetup")) {
> -		ret = rdtgroup_locksetup_enter(rdtgrp);
> -		if (ret)
> -			goto out;
> -		rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
> -	} else {
> -		rdt_last_cmd_puts("Unknown or unsupported mode\n");
> -		ret = -EINVAL;
> -	}
> -
> -out:
> -	rdtgroup_kn_unlock(of->kn);
> -	return ret ?: nbytes;
> -}
> -
> -/**
> - * rdtgroup_cbm_to_size - Translate CBM to size in bytes
> - * @r: RDT resource to which @d belongs.
> - * @d: RDT domain instance.
> - * @cbm: bitmask for which the size should be computed.
> - *
> - * The bitmask provided, associated with the RDT domain instance @d, will be
> - * translated into how many bytes it represents. The size in bytes is
> - * computed by first dividing the total cache size by the CBM length to
> - * determine how many bytes each bit in the bitmask represents. The result
> - * is multiplied with the number of bits set in the bitmask.
> - *
> - * @cbm is unsigned long, even if only 32 bits are used, to make the
> - * bitmap functions work correctly.
> - */
> -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
> -				  struct rdt_domain *d, unsigned long cbm)
> -{
> -	struct cpu_cacheinfo *ci;
> -	unsigned int size = 0;
> -	int num_b, i;
> -
> -	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
> -	ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
> -	for (i = 0; i < ci->num_leaves; i++) {
> -		if (ci->info_list[i].level == r->cache_level) {
> -			size = ci->info_list[i].size / r->cache.cbm_len * num_b;
> -			break;
> -		}
> -	}
> -
> -	return size;
> -}
> -
> -/*
> - * rdtgroup_size_show - Display size in bytes of allocated regions
> - *
> - * The "size" file mirrors the layout of the "schemata" file, printing the
> - * size in bytes of each region instead of the capacity bitmask.
> - */
> -static int rdtgroup_size_show(struct kernfs_open_file *of,
> -			      struct seq_file *s, void *v)
> -{
> -	struct resctrl_schema *schema;
> -	enum resctrl_conf_type type;
> -	struct rdtgroup *rdtgrp;
> -	struct rdt_resource *r;
> -	struct rdt_domain *d;
> -	unsigned int size;
> -	int ret = 0;
> -	u32 closid;
> -	bool sep;
> -	u32 ctrl;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> -	if (!rdtgrp) {
> -		rdtgroup_kn_unlock(of->kn);
> -		return -ENOENT;
> -	}
> -
> -	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> -		if (!rdtgrp->plr->d) {
> -			rdt_last_cmd_clear();
> -			rdt_last_cmd_puts("Cache domain offline\n");
> -			ret = -ENODEV;
> -		} else {
> -			seq_printf(s, "%*s:", max_name_width,
> -				   rdtgrp->plr->s->name);
> -			size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
> -						    rdtgrp->plr->d,
> -						    rdtgrp->plr->cbm);
> -			seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
> -		}
> -		goto out;
> -	}
> -
> -	closid = rdtgrp->closid;
> -
> -	list_for_each_entry(schema, &resctrl_schema_all, list) {
> -		r = schema->res;
> -		type = schema->conf_type;
> -		sep = false;
> -		seq_printf(s, "%*s:", max_name_width, schema->name);
> -		list_for_each_entry(d, &r->domains, list) {
> -			if (sep)
> -				seq_putc(s, ';');
> -			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> -				size = 0;
> -			} else {
> -				if (is_mba_sc(r))
> -					ctrl = d->mbps_val[closid];
> -				else
> -					ctrl = resctrl_arch_get_config(r, d,
> -								       closid,
> -								       type);
> -				if (r->rid == RDT_RESOURCE_MBA ||
> -				    r->rid == RDT_RESOURCE_SMBA)
> -					size = ctrl;
> -				else
> -					size = rdtgroup_cbm_to_size(r, d, ctrl);
> -			}
> -			seq_printf(s, "%d=%u", d->id, size);
> -			sep = true;
> -		}
> -		seq_putc(s, '\n');
> -	}
> -
> -out:
> -	rdtgroup_kn_unlock(of->kn);
> -
> -	return ret;
> -}
> -
>  #define INVALID_CONFIG_INDEX   UINT_MAX
>  
>  /**
> @@ -1622,62 +86,6 @@ void resctrl_arch_mon_event_config_read(void *info)
>  	mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
>  }
>  
> -static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
> -{
> -	smp_call_function_any(&mon_info->d->cpu_mask,
> -			      resctrl_arch_mon_event_config_read, mon_info, 1);
> -}
> -
> -static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
> -{
> -	struct resctrl_mon_config_info mon_info = {0};
> -	struct rdt_domain *dom;
> -	bool sep = false;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	list_for_each_entry(dom, &r->domains, list) {
> -		if (sep)
> -			seq_puts(s, ";");
> -
> -		memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
> -		mon_info.r = r;
> -		mon_info.d = dom;
> -		mon_info.evtid = evtid;
> -		mondata_config_read(&mon_info);
> -
> -		seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
> -		sep = true;
> -	}
> -	seq_puts(s, "\n");
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -
> -	return 0;
> -}
> -
> -static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
> -				       struct seq_file *seq, void *v)
> -{
> -	struct rdt_resource *r = of->kn->parent->priv;
> -
> -	mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
> -
> -	return 0;
> -}
> -
> -static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
> -				       struct seq_file *seq, void *v)
> -{
> -	struct rdt_resource *r = of->kn->parent->priv;
> -
> -	mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
> -
> -	return 0;
> -}
> -
>  void resctrl_arch_mon_event_config_write(void *info)
>  {
>  	struct resctrl_mon_config_info *mon_info = info;
> @@ -1694,605 +102,6 @@ void resctrl_arch_mon_event_config_write(void *info)
>  	mon_info->err = 0;
>  }
>  
> -static int mbm_config_write_domain(struct rdt_resource *r,
> -				   struct rdt_domain *d, u32 evtid, u32 val)
> -{
> -	struct resctrl_mon_config_info mon_info = {0};
> -
> -	/*
> -	 * Read the current config value first. If both are the same then
> -	 * no need to write it again.
> -	 */
> -	mon_info.r = r;
> -	mon_info.d = d;
> -	mon_info.evtid = evtid;
> -	mondata_config_read(&mon_info);
> -	if (mon_info.mon_config == val)
> -		return 0;
> -
> -	mon_info.mon_config = val;
> -
> -	/*
> -	 * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
> -	 * domain. The MSRs offset from MSR_IA32_EVT_CFG_BASE
> -	 * are scoped at the domain level. Writing any of these MSRs
> -	 * on one CPU is observed by all the CPUs in the domain.
> -	 */
> -	smp_call_function_any(&d->cpu_mask, resctrl_arch_mon_event_config_write,
> -			      &mon_info, 1);
> -	if (mon_info.err) {
> -		rdt_last_cmd_puts("Invalid event configuration\n");
> -		return mon_info.err;
> -	}
> -
> -	/*
> -	 * When an Event Configuration is changed, the bandwidth counters
> -	 * for all RMIDs and Events will be cleared by the hardware. The
> -	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> -	 * every RMID on the next read to any event for every RMID.
> -	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> -	 * cleared while it is tracked by the hardware. Clear the
> -	 * mbm_local and mbm_total counts for all the RMIDs.
> -	 */
> -	resctrl_arch_reset_rmid_all(r, d);
> -
> -	return 0;
> -}
> -
> -static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
> -{
> -	char *dom_str = NULL, *id_str;
> -	unsigned long dom_id, val;
> -	struct rdt_domain *d;
> -	int err;
> -
> -	/* Walking r->domains, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -next:
> -	if (!tok || tok[0] == '\0')
> -		return 0;
> -
> -	/* Start processing the strings for each domain */
> -	dom_str = strim(strsep(&tok, ";"));
> -	id_str = strsep(&dom_str, "=");
> -
> -	if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
> -		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
> -		return -EINVAL;
> -	}
> -
> -	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
> -		rdt_last_cmd_puts("Non-numeric event configuration value\n");
> -		return -EINVAL;
> -	}
> -
> -	/* Value from user cannot be more than the supported set of events */
> -	if ((val & r->mbm_cfg_mask) != val) {
> -		rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
> -				    r->mbm_cfg_mask);
> -		return -EINVAL;
> -	}
> -
> -	list_for_each_entry(d, &r->domains, list) {
> -		if (d->id == dom_id) {
> -			err = mbm_config_write_domain(r, d, evtid, val);
> -			if (err)
> -				return err;
> -			goto next;
> -		}
> -	}
> -
> -	return -EINVAL;
> -}
> -
> -static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
> -					    char *buf, size_t nbytes,
> -					    loff_t off)
> -{
> -	struct rdt_resource *r = of->kn->parent->priv;
> -	int ret;
> -
> -	/* Valid input requires a trailing newline */
> -	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> -		return -EINVAL;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	rdt_last_cmd_clear();
> -
> -	buf[nbytes - 1] = '\0';
> -
> -	ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -
> -	return ret ?: nbytes;
> -}
> -
> -static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
> -					    char *buf, size_t nbytes,
> -					    loff_t off)
> -{
> -	struct rdt_resource *r = of->kn->parent->priv;
> -	int ret;
> -
> -	/* Valid input requires a trailing newline */
> -	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> -		return -EINVAL;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	rdt_last_cmd_clear();
> -
> -	buf[nbytes - 1] = '\0';
> -
> -	ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -
> -	return ret ?: nbytes;
> -}
> -
> -/* rdtgroup information files for one cache resource. */
> -static struct rftype res_common_files[] = {
> -	{
> -		.name		= "last_cmd_status",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_last_cmd_status_show,
> -		.fflags		= RFTYPE_TOP_INFO,
> -	},
> -	{
> -		.name		= "num_closids",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_num_closids_show,
> -		.fflags		= RFTYPE_CTRL_INFO,
> -	},
> -	{
> -		.name		= "mon_features",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_mon_features_show,
> -		.fflags		= RFTYPE_MON_INFO,
> -	},
> -	{
> -		.name		= "num_rmids",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_num_rmids_show,
> -		.fflags		= RFTYPE_MON_INFO,
> -	},
> -	{
> -		.name		= "cbm_mask",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_default_ctrl_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> -	},
> -	{
> -		.name		= "min_cbm_bits",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_min_cbm_bits_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> -	},
> -	{
> -		.name		= "shareable_bits",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_shareable_bits_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> -	},
> -	{
> -		.name		= "bit_usage",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_bit_usage_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> -	},
> -	{
> -		.name		= "min_bandwidth",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_min_bw_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
> -	},
> -	{
> -		.name		= "bandwidth_gran",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_bw_gran_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
> -	},
> -	{
> -		.name		= "delay_linear",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_delay_linear_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
> -	},
> -	/*
> -	 * It is platform specific which (if any) capabilities are provided by
> -	 * thread_throttle_mode. Defer "fflags" initialization to platform
> -	 * discovery.
> -	 */
> -	{
> -		.name		= "thread_throttle_mode",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_thread_throttle_mode_show,
> -	},
> -	{
> -		.name		= "max_threshold_occupancy",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.write		= max_threshold_occ_write,
> -		.seq_show	= max_threshold_occ_show,
> -		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
> -	},
> -	{
> -		.name		= "mbm_total_bytes_config",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= mbm_total_bytes_config_show,
> -		.write		= mbm_total_bytes_config_write,
> -	},
> -	{
> -		.name		= "mbm_local_bytes_config",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= mbm_local_bytes_config_show,
> -		.write		= mbm_local_bytes_config_write,
> -	},
> -	{
> -		.name		= "cpus",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.write		= rdtgroup_cpus_write,
> -		.seq_show	= rdtgroup_cpus_show,
> -		.fflags		= RFTYPE_BASE,
> -	},
> -	{
> -		.name		= "cpus_list",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.write		= rdtgroup_cpus_write,
> -		.seq_show	= rdtgroup_cpus_show,
> -		.flags		= RFTYPE_FLAGS_CPUS_LIST,
> -		.fflags		= RFTYPE_BASE,
> -	},
> -	{
> -		.name		= "tasks",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.write		= rdtgroup_tasks_write,
> -		.seq_show	= rdtgroup_tasks_show,
> -		.fflags		= RFTYPE_BASE,
> -	},
> -	{
> -		.name		= "mon_hw_id",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdtgroup_rmid_show,
> -		.fflags		= RFTYPE_MON_BASE | RFTYPE_DEBUG,
> -	},
> -	{
> -		.name		= "schemata",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.write		= rdtgroup_schemata_write,
> -		.seq_show	= rdtgroup_schemata_show,
> -		.fflags		= RFTYPE_CTRL_BASE,
> -	},
> -	{
> -		.name		= "mode",
> -		.mode		= 0644,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.write		= rdtgroup_mode_write,
> -		.seq_show	= rdtgroup_mode_show,
> -		.fflags		= RFTYPE_CTRL_BASE,
> -	},
> -	{
> -		.name		= "size",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdtgroup_size_show,
> -		.fflags		= RFTYPE_CTRL_BASE,
> -	},
> -	{
> -		.name		= "sparse_masks",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdt_has_sparse_bitmasks_show,
> -		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> -	},
> -	{
> -		.name		= "ctrl_hw_id",
> -		.mode		= 0444,
> -		.kf_ops		= &rdtgroup_kf_single_ops,
> -		.seq_show	= rdtgroup_closid_show,
> -		.fflags		= RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
> -	},
> -
> -};
> -
> -static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
> -{
> -	struct rftype *rfts, *rft;
> -	int ret, len;
> -
> -	rfts = res_common_files;
> -	len = ARRAY_SIZE(res_common_files);
> -
> -	lockdep_assert_held(&rdtgroup_mutex);
> -
> -	if (resctrl_debug)
> -		fflags |= RFTYPE_DEBUG;
> -
> -	for (rft = rfts; rft < rfts + len; rft++) {
> -		if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
> -			ret = rdtgroup_add_file(kn, rft);
> -			if (ret)
> -				goto error;
> -		}
> -	}
> -
> -	return 0;
> -error:
> -	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
> -	while (--rft >= rfts) {
> -		if ((fflags & rft->fflags) == rft->fflags)
> -			kernfs_remove_by_name(kn, rft->name);
> -	}
> -	return ret;
> -}
> -
> -static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
> -{
> -	struct rftype *rfts, *rft;
> -	int len;
> -
> -	rfts = res_common_files;
> -	len = ARRAY_SIZE(res_common_files);
> -
> -	for (rft = rfts; rft < rfts + len; rft++) {
> -		if (!strcmp(rft->name, name))
> -			return rft;
> -	}
> -
> -	return NULL;
> -}
> -
> -static void thread_throttle_mode_init(void)
> -{
> -	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> -	struct rftype *rft;
> -
> -	if (!r->alloc_capable ||
> -	    r->membw.throttle_mode == THREAD_THROTTLE_UNDEFINED)
> -		return;
> -
> -	rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
> -	if (!rft)
> -		return;
> -
> -	rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
> -}
> -
> -void mbm_config_rftype_init(const char *config)
> -{
> -	struct rftype *rft;
> -
> -	rft = rdtgroup_get_rftype_by_name(config);
> -	if (rft)
> -		rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
> -}
> -
> -/**
> - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
> - * @r: The resource group with which the file is associated.
> - * @name: Name of the file
> - *
> - * The permissions of named resctrl file, directory, or link are modified
> - * to not allow read, write, or execute by any user.
> - *
> - * WARNING: This function is intended to communicate to the user that the
> - * resctrl file has been locked down - that it is not relevant to the
> - * particular state the system finds itself in. It should not be relied
> - * on to protect from user access because after the file's permissions
> - * are restricted the user can still change the permissions using chmod
> - * from the command line.
> - *
> - * Return: 0 on success, <0 on failure.
> - */
> -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
> -{
> -	struct iattr iattr = {.ia_valid = ATTR_MODE,};
> -	struct kernfs_node *kn;
> -	int ret = 0;
> -
> -	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
> -	if (!kn)
> -		return -ENOENT;
> -
> -	switch (kernfs_type(kn)) {
> -	case KERNFS_DIR:
> -		iattr.ia_mode = S_IFDIR;
> -		break;
> -	case KERNFS_FILE:
> -		iattr.ia_mode = S_IFREG;
> -		break;
> -	case KERNFS_LINK:
> -		iattr.ia_mode = S_IFLNK;
> -		break;
> -	}
> -
> -	ret = kernfs_setattr(kn, &iattr);
> -	kernfs_put(kn);
> -	return ret;
> -}
> -
> -/**
> - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
> - * @r: The resource group with which the file is associated.
> - * @name: Name of the file
> - * @mask: Mask of permissions that should be restored
> - *
> - * Restore the permissions of the named file. If @name is a directory the
> - * permissions of its parent will be used.
> - *
> - * Return: 0 on success, <0 on failure.
> - */
> -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
> -			     umode_t mask)
> -{
> -	struct iattr iattr = {.ia_valid = ATTR_MODE,};
> -	struct kernfs_node *kn, *parent;
> -	struct rftype *rfts, *rft;
> -	int ret, len;
> -
> -	rfts = res_common_files;
> -	len = ARRAY_SIZE(res_common_files);
> -
> -	for (rft = rfts; rft < rfts + len; rft++) {
> -		if (!strcmp(rft->name, name))
> -			iattr.ia_mode = rft->mode & mask;
> -	}
> -
> -	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
> -	if (!kn)
> -		return -ENOENT;
> -
> -	switch (kernfs_type(kn)) {
> -	case KERNFS_DIR:
> -		parent = kernfs_get_parent(kn);
> -		if (parent) {
> -			iattr.ia_mode |= parent->mode;
> -			kernfs_put(parent);
> -		}
> -		iattr.ia_mode |= S_IFDIR;
> -		break;
> -	case KERNFS_FILE:
> -		iattr.ia_mode |= S_IFREG;
> -		break;
> -	case KERNFS_LINK:
> -		iattr.ia_mode |= S_IFLNK;
> -		break;
> -	}
> -
> -	ret = kernfs_setattr(kn, &iattr);
> -	kernfs_put(kn);
> -	return ret;
> -}
> -
> -static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
> -				      unsigned long fflags)
> -{
> -	struct kernfs_node *kn_subdir;
> -	int ret;
> -
> -	kn_subdir = kernfs_create_dir(kn_info, name,
> -				      kn_info->mode, priv);
> -	if (IS_ERR(kn_subdir))
> -		return PTR_ERR(kn_subdir);
> -
> -	ret = rdtgroup_kn_set_ugid(kn_subdir);
> -	if (ret)
> -		return ret;
> -
> -	ret = rdtgroup_add_files(kn_subdir, fflags);
> -	if (!ret)
> -		kernfs_activate(kn_subdir);
> -
> -	return ret;
> -}
> -
> -static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
> -{
> -	enum resctrl_res_level i;
> -	struct resctrl_schema *s;
> -	struct rdt_resource *r;
> -	unsigned long fflags;
> -	char name[32];
> -	int ret;
> -
> -	/* create the directory */
> -	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
> -	if (IS_ERR(kn_info))
> -		return PTR_ERR(kn_info);
> -
> -	ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
> -	if (ret)
> -		goto out_destroy;
> -
> -	/* loop over enabled controls, these are all alloc_capable */
> -	list_for_each_entry(s, &resctrl_schema_all, list) {
> -		r = s->res;
> -		fflags = r->fflags | RFTYPE_CTRL_INFO;
> -		ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
> -		if (ret)
> -			goto out_destroy;
> -	}
> -
> -	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> -		r = resctrl_arch_get_resource(i);
> -		if (!r->mon_capable)
> -			continue;
> -
> -		fflags =  r->fflags | RFTYPE_MON_INFO;
> -		sprintf(name, "%s_MON", r->name);
> -		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
> -		if (ret)
> -			goto out_destroy;
> -	}
> -
> -	ret = rdtgroup_kn_set_ugid(kn_info);
> -	if (ret)
> -		goto out_destroy;
> -
> -	kernfs_activate(kn_info);
> -
> -	return 0;
> -
> -out_destroy:
> -	kernfs_remove(kn_info);
> -	return ret;
> -}
> -
> -static int
> -mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
> -		    char *name, struct kernfs_node **dest_kn)
> -{
> -	struct kernfs_node *kn;
> -	int ret;
> -
> -	/* create the directory */
> -	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
> -	if (IS_ERR(kn))
> -		return PTR_ERR(kn);
> -
> -	if (dest_kn)
> -		*dest_kn = kn;
> -
> -	ret = rdtgroup_kn_set_ugid(kn);
> -	if (ret)
> -		goto out_destroy;
> -
> -	kernfs_activate(kn);
> -
> -	return 0;
> -
> -out_destroy:
> -	kernfs_remove(kn);
> -	return ret;
> -}
> -
>  static void l3_qos_cfg_update(void *arg)
>  {
>  	bool *enable = arg;
> @@ -2307,11 +116,6 @@ static void l2_qos_cfg_update(void *arg)
>  	wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
>  }
>  
> -static inline bool is_mba_linear(void)
> -{
> -	return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
> -}
> -
>  static int set_cache_qos_cfg(int level, bool enable)
>  {
>  	void (*update)(void *arg);
> @@ -2367,66 +171,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
>  		l3_qos_cfg_update(&hw_res->cdp_enabled);
>  }
>  
> -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
> -{
> -	u32 num_closid = resctrl_arch_get_num_closid(r);
> -	int cpu = cpumask_any(&d->cpu_mask);
> -	int i;
> -
> -	d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
> -				   GFP_KERNEL, cpu_to_node(cpu));
> -	if (!d->mbps_val)
> -		return -ENOMEM;
> -
> -	for (i = 0; i < num_closid; i++)
> -		d->mbps_val[i] = MBA_MAX_MBPS;
> -
> -	return 0;
> -}
> -
> -static void mba_sc_domain_destroy(struct rdt_resource *r,
> -				  struct rdt_domain *d)
> -{
> -	kfree(d->mbps_val);
> -	d->mbps_val = NULL;
> -}
> -
> -/*
> - * MBA software controller is supported only if
> - * MBM is supported and MBA is in linear scale.
> - */
> -static bool supports_mba_mbps(void)
> -{
> -	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> -
> -	return (resctrl_arch_is_mbm_local_enabled() &&
> -		r->alloc_capable && is_mba_linear());
> -}
> -
> -/*
> - * Enable or disable the MBA software controller
> - * which helps user specify bandwidth in MBps.
> - */
> -static int set_mba_sc(bool mba_sc)
> -{
> -	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> -	u32 num_closid = resctrl_arch_get_num_closid(r);
> -	struct rdt_domain *d;
> -	int i;
> -
> -	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
> -		return -EINVAL;
> -
> -	r->membw.mba_sc = mba_sc;
> -
> -	list_for_each_entry(d, &r->domains, list) {
> -		for (i = 0; i < num_closid; i++)
> -			d->mbps_val[i] = MBA_MAX_MBPS;
> -	}
> -
> -	return 0;
> -}
> -
>  static int cdp_enable(int level)
>  {
>  	struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
> @@ -2467,414 +211,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
>  	return 0;
>  }
>  
> -/*
> - * We don't allow rdtgroup directories to be created anywhere
> - * except the root directory. Thus when looking for the rdtgroup
> - * structure for a kernfs node we are either looking at a directory,
> - * in which case the rdtgroup structure is pointed at by the "priv"
> - * field, or at a file, in which case we need only look to the parent
> - * to find the rdtgroup.
> - */
> -static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
> -{
> -	if (kernfs_type(kn) == KERNFS_DIR) {
> -		/*
> -		 * All the resource directories use "kn->priv"
> -		 * to point to the "struct rdtgroup" for the
> -		 * resource. "info" and its subdirectories don't
> -		 * have rdtgroup structures, so return NULL here.
> -		 */
> -		if (kn == kn_info || kn->parent == kn_info)
> -			return NULL;
> -		else
> -			return kn->priv;
> -	} else {
> -		return kn->parent->priv;
> -	}
> -}
> -
> -static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
> -{
> -	atomic_inc(&rdtgrp->waitcount);
> -	kernfs_break_active_protection(kn);
> -}
> -
> -static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
> -{
> -	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
> -	    (rdtgrp->flags & RDT_DELETED)) {
> -		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> -		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
> -			rdtgroup_pseudo_lock_remove(rdtgrp);
> -		kernfs_unbreak_active_protection(kn);
> -		rdtgroup_remove(rdtgrp);
> -	} else {
> -		kernfs_unbreak_active_protection(kn);
> -	}
> -}
> -
> -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
> -{
> -	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
> -
> -	if (!rdtgrp)
> -		return NULL;
> -
> -	rdtgroup_kn_get(rdtgrp, kn);
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	/* Was this group deleted while we waited? */
> -	if (rdtgrp->flags & RDT_DELETED)
> -		return NULL;
> -
> -	return rdtgrp;
> -}
> -
> -void rdtgroup_kn_unlock(struct kernfs_node *kn)
> -{
> -	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
> -
> -	if (!rdtgrp)
> -		return;
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -
> -	rdtgroup_kn_put(rdtgrp, kn);
> -}
> -
> -static int mkdir_mondata_all(struct kernfs_node *parent_kn,
> -			     struct rdtgroup *prgrp,
> -			     struct kernfs_node **mon_data_kn);
> -
> -static void rdt_disable_ctx(void)
> -{
> -	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
> -	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
> -	set_mba_sc(false);
> -
> -	resctrl_debug = false;
> -}
> -
> -static int rdt_enable_ctx(struct rdt_fs_context *ctx)
> -{
> -	int ret = 0;
> -
> -	if (ctx->enable_cdpl2) {
> -		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
> -		if (ret)
> -			goto out_done;
> -	}
> -
> -	if (ctx->enable_cdpl3) {
> -		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
> -		if (ret)
> -			goto out_cdpl2;
> -	}
> -
> -	if (ctx->enable_mba_mbps) {
> -		ret = set_mba_sc(true);
> -		if (ret)
> -			goto out_cdpl3;
> -	}
> -
> -	if (ctx->enable_debug)
> -		resctrl_debug = true;
> -
> -	return 0;
> -
> -out_cdpl3:
> -	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
> -out_cdpl2:
> -	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
> -out_done:
> -	return ret;
> -}
> -
> -static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
> -{
> -	struct resctrl_schema *s;
> -	const char *suffix = "";
> -	int ret, cl;
> -
> -	s = kzalloc(sizeof(*s), GFP_KERNEL);
> -	if (!s)
> -		return -ENOMEM;
> -
> -	s->res = r;
> -	s->num_closid = resctrl_arch_get_num_closid(r);
> -	if (resctrl_arch_get_cdp_enabled(r->rid))
> -		s->num_closid /= 2;
> -
> -	s->conf_type = type;
> -	switch (type) {
> -	case CDP_CODE:
> -		suffix = "CODE";
> -		break;
> -	case CDP_DATA:
> -		suffix = "DATA";
> -		break;
> -	case CDP_NONE:
> -		suffix = "";
> -		break;
> -	}
> -
> -	ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
> -	if (ret >= sizeof(s->name)) {
> -		kfree(s);
> -		return -EINVAL;
> -	}
> -
> -	cl = strlen(s->name);
> -
> -	/*
> -	 * If CDP is supported by this resource, but not enabled,
> -	 * include the suffix. This ensures the tabular format of the
> -	 * schemata file does not change between mounts of the filesystem.
> -	 */
> -	if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
> -		cl += 4;
> -
> -	if (cl > max_name_width)
> -		max_name_width = cl;
> -
> -	/*
> -	 * Choose a width for the resource data based on the resource that has
> -	 * the widest name and cbm.
> -	 */
> -	max_data_width = max(max_data_width, r->data_width);
> -
> -	INIT_LIST_HEAD(&s->list);
> -	list_add(&s->list, &resctrl_schema_all);
> -
> -	return 0;
> -}
> -
> -static int schemata_list_create(void)
> -{
> -	enum resctrl_res_level i;
> -	struct rdt_resource *r;
> -	int ret = 0;
> -
> -	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> -		r = resctrl_arch_get_resource(i);
> -		if (!r->alloc_capable)
> -			continue;
> -
> -		if (resctrl_arch_get_cdp_enabled(r->rid)) {
> -			ret = schemata_list_add(r, CDP_CODE);
> -			if (ret)
> -				break;
> -
> -			ret = schemata_list_add(r, CDP_DATA);
> -		} else {
> -			ret = schemata_list_add(r, CDP_NONE);
> -		}
> -
> -		if (ret)
> -			break;
> -	}
> -
> -	return ret;
> -}
> -
> -static void schemata_list_destroy(void)
> -{
> -	struct resctrl_schema *s, *tmp;
> -
> -	list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
> -		list_del(&s->list);
> -		kfree(s);
> -	}
> -}
> -
> -static int rdt_get_tree(struct fs_context *fc)
> -{
> -	struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> -	struct rdt_fs_context *ctx = rdt_fc2context(fc);
> -	unsigned long flags = RFTYPE_CTRL_BASE;
> -	struct rdt_domain *dom;
> -	int ret;
> -
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -	/*
> -	 * resctrl file system can only be mounted once.
> -	 */
> -	if (resctrl_mounted) {
> -		ret = -EBUSY;
> -		goto out;
> -	}
> -
> -	ret = rdtgroup_setup_root(ctx);
> -	if (ret)
> -		goto out;
> -
> -	ret = rdt_enable_ctx(ctx);
> -	if (ret)
> -		goto out_root;
> -
> -	ret = schemata_list_create();
> -	if (ret) {
> -		schemata_list_destroy();
> -		goto out_ctx;
> -	}
> -
> -	closid_init();
> -
> -	if (resctrl_arch_mon_capable())
> -		flags |= RFTYPE_MON;
> -
> -	ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
> -	if (ret)
> -		goto out_schemata_free;
> -
> -	kernfs_activate(rdtgroup_default.kn);
> -
> -	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
> -	if (ret < 0)
> -		goto out_schemata_free;
> -
> -	if (resctrl_arch_mon_capable()) {
> -		ret = mongroup_create_dir(rdtgroup_default.kn,
> -					  &rdtgroup_default, "mon_groups",
> -					  &kn_mongrp);
> -		if (ret < 0)
> -			goto out_info;
> -
> -		ret = mkdir_mondata_all(rdtgroup_default.kn,
> -					&rdtgroup_default, &kn_mondata);
> -		if (ret < 0)
> -			goto out_mongrp;
> -		rdtgroup_default.mon.mon_data_kn = kn_mondata;
> -	}
> -
> -	ret = rdt_pseudo_lock_init();
> -	if (ret)
> -		goto out_mondata;
> -
> -	ret = kernfs_get_tree(fc);
> -	if (ret < 0)
> -		goto out_psl;
> -
> -	if (resctrl_arch_alloc_capable())
> -		resctrl_arch_enable_alloc();
> -	if (resctrl_arch_mon_capable())
> -		resctrl_arch_enable_mon();
> -
> -	if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
> -		resctrl_mounted = true;
> -
> -	if (resctrl_is_mbm_enabled()) {
> -		list_for_each_entry(dom, &l3->domains, list)
> -			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
> -						   RESCTRL_PICK_ANY_CPU);
> -	}
> -
> -	goto out;
> -
> -out_psl:
> -	rdt_pseudo_lock_release();
> -out_mondata:
> -	if (resctrl_arch_mon_capable())
> -		kernfs_remove(kn_mondata);
> -out_mongrp:
> -	if (resctrl_arch_mon_capable())
> -		kernfs_remove(kn_mongrp);
> -out_info:
> -	kernfs_remove(kn_info);
> -out_schemata_free:
> -	schemata_list_destroy();
> -out_ctx:
> -	rdt_disable_ctx();
> -out_root:
> -	rdtgroup_destroy_root();
> -out:
> -	rdt_last_cmd_clear();
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -	return ret;
> -}
> -
> -enum rdt_param {
> -	Opt_cdp,
> -	Opt_cdpl2,
> -	Opt_mba_mbps,
> -	Opt_debug,
> -	nr__rdt_params
> -};
> -
> -static const struct fs_parameter_spec rdt_fs_parameters[] = {
> -	fsparam_flag("cdp",		Opt_cdp),
> -	fsparam_flag("cdpl2",		Opt_cdpl2),
> -	fsparam_flag("mba_MBps",	Opt_mba_mbps),
> -	fsparam_flag("debug",		Opt_debug),
> -	{}
> -};
> -
> -static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
> -{
> -	struct rdt_fs_context *ctx = rdt_fc2context(fc);
> -	struct fs_parse_result result;
> -	int opt;
> -
> -	opt = fs_parse(fc, rdt_fs_parameters, param, &result);
> -	if (opt < 0)
> -		return opt;
> -
> -	switch (opt) {
> -	case Opt_cdp:
> -		ctx->enable_cdpl3 = true;
> -		return 0;
> -	case Opt_cdpl2:
> -		ctx->enable_cdpl2 = true;
> -		return 0;
> -	case Opt_mba_mbps:
> -		if (!supports_mba_mbps())
> -			return -EINVAL;
> -		ctx->enable_mba_mbps = true;
> -		return 0;
> -	case Opt_debug:
> -		ctx->enable_debug = true;
> -		return 0;
> -	}
> -
> -	return -EINVAL;
> -}
> -
> -static void rdt_fs_context_free(struct fs_context *fc)
> -{
> -	struct rdt_fs_context *ctx = rdt_fc2context(fc);
> -
> -	kernfs_free_fs_context(fc);
> -	kfree(ctx);
> -}
> -
> -static const struct fs_context_operations rdt_fs_context_ops = {
> -	.free		= rdt_fs_context_free,
> -	.parse_param	= rdt_parse_param,
> -	.get_tree	= rdt_get_tree,
> -};
> -
> -static int rdt_init_fs_context(struct fs_context *fc)
> -{
> -	struct rdt_fs_context *ctx;
> -
> -	ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
> -	if (!ctx)
> -		return -ENOMEM;
> -
> -	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
> -	fc->fs_private = &ctx->kfc;
> -	fc->ops = &rdt_fs_context_ops;
> -	put_user_ns(fc->user_ns);
> -	fc->user_ns = get_user_ns(&init_user_ns);
> -	fc->global = true;
> -	return 0;
> -}
> -
>  static int reset_all_ctrls(struct rdt_resource *r)
>  {
>  	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
> @@ -2922,1333 +258,3 @@ void resctrl_arch_reset_resources(void)
>  	for_each_capable_rdt_resource(r)
>  		reset_all_ctrls(r);
>  }
> -
> -/*
> - * Move tasks from one group to the other. If @from is NULL, then all tasks
> - * in the system are moved unconditionally (used for teardown).
> - *
> - * If @mask is not NULL the cpus on which moved tasks are running are set
> - * in that mask so the update smp function call is restricted to affected
> - * cpus.
> - */
> -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
> -				 struct cpumask *mask)
> -{
> -	struct task_struct *p, *t;
> -
> -	read_lock(&tasklist_lock);
> -	for_each_process_thread(p, t) {
> -		if (!from || is_closid_match(t, from) ||
> -		    is_rmid_match(t, from)) {
> -			resctrl_arch_set_closid_rmid(t, to->closid,
> -						     to->mon.rmid);
> -
> -			/*
> -			 * Order the closid/rmid stores above before the loads
> -			 * in task_curr(). This pairs with the full barrier
> -			 * between the rq->curr update and
> -			 * resctrl_arch_sched_in() during context switch.
> -			 */
> -			smp_mb();
> -
> -			/*
> -			 * If the task is on a CPU, set the CPU in the mask.
> -			 * The detection is inaccurate as tasks might move or
> -			 * schedule before the smp function call takes place.
> -			 * In such a case the function call is pointless, but
> -			 * there is no other side effect.
> -			 */
> -			if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
> -				cpumask_set_cpu(task_cpu(t), mask);
> -		}
> -	}
> -	read_unlock(&tasklist_lock);
> -}
> -
> -static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
> -{
> -	struct rdtgroup *sentry, *stmp;
> -	struct list_head *head;
> -
> -	head = &rdtgrp->mon.crdtgrp_list;
> -	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
> -		free_rmid(sentry->closid, sentry->mon.rmid);
> -		list_del(&sentry->mon.crdtgrp_list);
> -
> -		if (atomic_read(&sentry->waitcount) != 0)
> -			sentry->flags = RDT_DELETED;
> -		else
> -			rdtgroup_remove(sentry);
> -	}
> -}
> -
> -/*
> - * Forcibly remove all of subdirectories under root.
> - */
> -static void rmdir_all_sub(void)
> -{
> -	struct rdtgroup *rdtgrp, *tmp;
> -
> -	/* Move all tasks to the default resource group */
> -	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
> -
> -	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
> -		/* Free any child rmids */
> -		free_all_child_rdtgrp(rdtgrp);
> -
> -		/* Remove each rdtgroup other than root */
> -		if (rdtgrp == &rdtgroup_default)
> -			continue;
> -
> -		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> -		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
> -			rdtgroup_pseudo_lock_remove(rdtgrp);
> -
> -		/*
> -		 * Give any CPUs back to the default group. We cannot copy
> -		 * cpu_online_mask because a CPU might have executed the
> -		 * offline callback already, but is still marked online.
> -		 */
> -		cpumask_or(&rdtgroup_default.cpu_mask,
> -			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
> -
> -		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> -
> -		kernfs_remove(rdtgrp->kn);
> -		list_del(&rdtgrp->rdtgroup_list);
> -
> -		if (atomic_read(&rdtgrp->waitcount) != 0)
> -			rdtgrp->flags = RDT_DELETED;
> -		else
> -			rdtgroup_remove(rdtgrp);
> -	}
> -	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
> -	update_closid_rmid(cpu_online_mask, &rdtgroup_default);
> -
> -	kernfs_remove(kn_info);
> -	kernfs_remove(kn_mongrp);
> -	kernfs_remove(kn_mondata);
> -}
> -
> -static void rdt_kill_sb(struct super_block *sb)
> -{
> -	cpus_read_lock();
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	rdt_disable_ctx();
> -
> -	/* Put everything back to default values. */
> -	resctrl_arch_reset_resources();
> -
> -	rmdir_all_sub();
> -	rdt_pseudo_lock_release();
> -	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
> -	schemata_list_destroy();
> -	rdtgroup_destroy_root();
> -	if (resctrl_arch_alloc_capable())
> -		resctrl_arch_disable_alloc();
> -	if (resctrl_arch_mon_capable())
> -		resctrl_arch_disable_mon();
> -	resctrl_mounted = false;
> -	kernfs_kill_sb(sb);
> -	mutex_unlock(&rdtgroup_mutex);
> -	cpus_read_unlock();
> -}
> -
> -static struct file_system_type rdt_fs_type = {
> -	.name			= "resctrl",
> -	.init_fs_context	= rdt_init_fs_context,
> -	.parameters		= rdt_fs_parameters,
> -	.kill_sb		= rdt_kill_sb,
> -};
> -
> -static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
> -		       void *priv)
> -{
> -	struct kernfs_node *kn;
> -	int ret = 0;
> -
> -	kn = __kernfs_create_file(parent_kn, name, 0444,
> -				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
> -				  &kf_mondata_ops, priv, NULL, NULL);
> -	if (IS_ERR(kn))
> -		return PTR_ERR(kn);
> -
> -	ret = rdtgroup_kn_set_ugid(kn);
> -	if (ret) {
> -		kernfs_remove(kn);
> -		return ret;
> -	}
> -
> -	return ret;
> -}
> -
> -/*
> - * Remove all subdirectories of mon_data of ctrl_mon groups
> - * and monitor groups with given domain id.
> - */
> -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
> -					   unsigned int dom_id)
> -{
> -	struct rdtgroup *prgrp, *crgrp;
> -	char name[32];
> -
> -	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
> -		sprintf(name, "mon_%s_%02d", r->name, dom_id);
> -		kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
> -
> -		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
> -			kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
> -	}
> -}
> -
> -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
> -				struct rdt_domain *d,
> -				struct rdt_resource *r, struct rdtgroup *prgrp)
> -{
> -	union mon_data_bits priv;
> -	struct kernfs_node *kn;
> -	struct mon_evt *mevt;
> -	struct rmid_read rr;
> -	char name[32];
> -	int ret;
> -
> -	sprintf(name, "mon_%s_%02d", r->name, d->id);
> -	/* create the directory */
> -	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
> -	if (IS_ERR(kn))
> -		return PTR_ERR(kn);
> -
> -	ret = rdtgroup_kn_set_ugid(kn);
> -	if (ret)
> -		goto out_destroy;
> -
> -	if (WARN_ON(list_empty(&r->evt_list))) {
> -		ret = -EPERM;
> -		goto out_destroy;
> -	}
> -
> -	priv.u.rid = r->rid;
> -	priv.u.domid = d->id;
> -	list_for_each_entry(mevt, &r->evt_list, list) {
> -		priv.u.evtid = mevt->evtid;
> -		ret = mon_addfile(kn, mevt->name, priv.priv);
> -		if (ret)
> -			goto out_destroy;
> -
> -		if (resctrl_is_mbm_event(mevt->evtid))
> -			mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
> -	}
> -	kernfs_activate(kn);
> -	return 0;
> -
> -out_destroy:
> -	kernfs_remove(kn);
> -	return ret;
> -}
> -
> -/*
> - * Add all subdirectories of mon_data for "ctrl_mon" groups
> - * and "monitor" groups with given domain id.
> - */
> -static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
> -					   struct rdt_domain *d)
> -{
> -	struct kernfs_node *parent_kn;
> -	struct rdtgroup *prgrp, *crgrp;
> -	struct list_head *head;
> -
> -	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
> -		parent_kn = prgrp->mon.mon_data_kn;
> -		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
> -
> -		head = &prgrp->mon.crdtgrp_list;
> -		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
> -			parent_kn = crgrp->mon.mon_data_kn;
> -			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
> -		}
> -	}
> -}
> -
> -static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
> -				       struct rdt_resource *r,
> -				       struct rdtgroup *prgrp)
> -{
> -	struct rdt_domain *dom;
> -	int ret;
> -
> -	/* Walking r->domains, ensure it can't race with cpuhp */
> -	lockdep_assert_cpus_held();
> -
> -	list_for_each_entry(dom, &r->domains, list) {
> -		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
> -		if (ret)
> -			return ret;
> -	}
> -
> -	return 0;
> -}
> -
> -/*
> - * This creates a directory mon_data which contains the monitored data.
> - *
> - * mon_data has one directory for each domain which are named
> - * in the format mon_<domain_name>_<domain_id>. For example, a mon_data
> - * directory with an L3 domain looks as below:
> - * ./mon_data:
> - * mon_L3_00
> - * mon_L3_01
> - * mon_L3_02
> - * ...
> - *
> - * Each domain directory has one file per event:
> - * ./mon_L3_00/:
> - * llc_occupancy
> - *
> - */
> -static int mkdir_mondata_all(struct kernfs_node *parent_kn,
> -			     struct rdtgroup *prgrp,
> -			     struct kernfs_node **dest_kn)
> -{
> -	enum resctrl_res_level i;
> -	struct rdt_resource *r;
> -	struct kernfs_node *kn;
> -	int ret;
> -
> -	/*
> -	 * Create the mon_data directory first.
> -	 */
> -	ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
> -	if (ret)
> -		return ret;
> -
> -	if (dest_kn)
> -		*dest_kn = kn;
> -
> -	/*
> -	 * Create the subdirectories for each domain. Note that all events
> -	 * in a domain like L3 are grouped into a resource whose domain is L3
> -	 */
> -	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> -		r = resctrl_arch_get_resource(i);
> -		if (!r->mon_capable)
> -			continue;
> -
> -		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
> -		if (ret)
> -			goto out_destroy;
> -	}
> -
> -	return 0;
> -
> -out_destroy:
> -	kernfs_remove(kn);
> -	return ret;
> -}
> -
> -/**
> - * cbm_ensure_valid - Enforce validity on provided CBM
> - * @_val:	Candidate CBM
> - * @r:		RDT resource to which the CBM belongs
> - *
> - * The provided CBM represents all cache portions available for use. This
> - * may be represented by a bitmap that does not consist of contiguous ones
> - * and thus be an invalid CBM.
> - * Here the provided CBM is forced to be a valid CBM by only considering
> - * the first set of contiguous bits as valid and clearing all other bits.
> - * The intention here is to provide a valid default CBM with which a new
> - * resource group is initialized. The user can follow this with a
> - * modification to the CBM if the default does not satisfy the
> - * requirements.
> - */
> -static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
> -{
> -	unsigned int cbm_len = r->cache.cbm_len;
> -	unsigned long first_bit, zero_bit;
> -	unsigned long val = _val;
> -
> -	if (!val)
> -		return 0;
> -
> -	first_bit = find_first_bit(&val, cbm_len);
> -	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
> -
> -	/* Clear any remaining bits to ensure contiguous region */
> -	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
> -	return (u32)val;
> -}
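
As an aside for anyone less familiar with the CBM handling: the fixup above
keeps only the lowest run of contiguous 1s. A minimal user-space sketch of
the same bit logic, using plain C instead of the kernel bitmap helpers and
hypothetical values (cbm_len = 8, assumed < 32; __builtin_ctz needs GCC or
clang):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Keep only the lowest run of contiguous 1s, like cbm_ensure_valid(). */
static uint32_t first_contiguous_run(uint32_t val, unsigned int cbm_len)
{
	unsigned int zero_bit;

	if (!val)
		return 0;

	zero_bit = __builtin_ctz(val);		/* lowest set bit */
	while (zero_bit < cbm_len && (val & (1u << zero_bit)))
		zero_bit++;			/* first clear bit above the run */

	/* clear everything from zero_bit upward */
	return val & ((1u << zero_bit) - 1u);
}

int main(void)
{
	/* 0b101100 -> keep bits 2-3 only -> 0b001100 */
	assert(first_contiguous_run(0x2c, 8) == 0x0c);
	printf("0x2c -> 0x%x\n", first_contiguous_run(0x2c, 8));
	return 0;
}

So a newly created group starts out with the first contiguous chunk of the
usable bits, and the user can widen or move it later via the schemata file.
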
> -
> -/*
> - * Initialize cache resources per RDT domain
> - *
> - * Set the RDT domain up to start off with all usable allocations. That is,
> - * all shareable and unused bits. All-zero CBM is invalid.
> - */
> -static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
> -				 u32 closid)
> -{
> -	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
> -	enum resctrl_conf_type t = s->conf_type;
> -	struct resctrl_staged_config *cfg;
> -	struct rdt_resource *r = s->res;
> -	u32 used_b = 0, unused_b = 0;
> -	unsigned long tmp_cbm;
> -	enum rdtgrp_mode mode;
> -	u32 peer_ctl, ctrl_val;
> -	int i;
> -
> -	cfg = &d->staged_config[t];
> -	cfg->have_new_ctrl = false;
> -	cfg->new_ctrl = r->cache.shareable_bits;
> -	used_b = r->cache.shareable_bits;
> -	for (i = 0; i < closids_supported(); i++) {
> -		if (closid_allocated(i) && i != closid) {
> -			mode = rdtgroup_mode_by_closid(i);
> -			if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
> -				/*
> -				 * ctrl values for locksetup aren't relevant
> -				 * until the schemata is written, and the mode
> -				 * becomes RDT_MODE_PSEUDO_LOCKED.
> -				 */
> -				continue;
> -			/*
> -			 * If CDP is active include peer domain's
> -			 * usage to ensure there is no overlap
> -			 * with an exclusive group.
> -			 */
> -			if (resctrl_arch_get_cdp_enabled(r->rid))
> -				peer_ctl = resctrl_arch_get_config(r, d, i,
> -								   peer_type);
> -			else
> -				peer_ctl = 0;
> -			ctrl_val = resctrl_arch_get_config(r, d, i,
> -							   s->conf_type);
> -			used_b |= ctrl_val | peer_ctl;
> -			if (mode == RDT_MODE_SHAREABLE)
> -				cfg->new_ctrl |= ctrl_val | peer_ctl;
> -		}
> -	}
> -	if (d->plr && d->plr->cbm > 0)
> -		used_b |= d->plr->cbm;
> -	unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
> -	unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
> -	cfg->new_ctrl |= unused_b;
> -	/*
> -	 * Force the initial CBM to be valid, user can
> -	 * modify the CBM based on system availability.
> -	 */
> -	cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
> -	/*
> -	 * Assign the u32 CBM to an unsigned long to ensure that
> -	 * bitmap_weight() does not access out-of-bound memory.
> -	 */
> -	tmp_cbm = cfg->new_ctrl;
> -	if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
> -		rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
> -		return -ENOSPC;
> -	}
> -	cfg->have_new_ctrl = true;
> -
> -	return 0;
> -}
> -
> -/*
> - * Initialize cache resources with default values.
> - *
> - * A new RDT group is being created on an allocation capable (CAT)
> - * supporting system. Set this group up to start off with all usable
> - * allocations.
> - *
> - * If there are no more shareable bits available on any domain then
> - * the entire allocation will fail.
> - */
> -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
> -{
> -	struct rdt_domain *d;
> -	int ret;
> -
> -	list_for_each_entry(d, &s->res->domains, list) {
> -		ret = __init_one_rdt_domain(d, s, closid);
> -		if (ret < 0)
> -			return ret;
> -	}
> -
> -	return 0;
> -}
> -
> -/* Initialize MBA resource with default values. */
> -static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
> -{
> -	struct resctrl_staged_config *cfg;
> -	struct rdt_domain *d;
> -
> -	list_for_each_entry(d, &r->domains, list) {
> -		if (is_mba_sc(r)) {
> -			d->mbps_val[closid] = MBA_MAX_MBPS;
> -			continue;
> -		}
> -
> -		cfg = &d->staged_config[CDP_NONE];
> -		cfg->new_ctrl = r->default_ctrl;
> -		cfg->have_new_ctrl = true;
> -	}
> -}
> -
> -/* Initialize the RDT group's allocations. */
> -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
> -{
> -	struct resctrl_schema *s;
> -	struct rdt_resource *r;
> -	int ret = 0;
> -
> -	rdt_staged_configs_clear();
> -
> -	list_for_each_entry(s, &resctrl_schema_all, list) {
> -		r = s->res;
> -		if (r->rid == RDT_RESOURCE_MBA ||
> -		    r->rid == RDT_RESOURCE_SMBA) {
> -			rdtgroup_init_mba(r, rdtgrp->closid);
> -			if (is_mba_sc(r))
> -				continue;
> -		} else {
> -			ret = rdtgroup_init_cat(s, rdtgrp->closid);
> -			if (ret < 0)
> -				goto out;
> -		}
> -
> -		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
> -		if (ret < 0) {
> -			rdt_last_cmd_puts("Failed to initialize allocations\n");
> -			goto out;
> -		}
> -
> -	}
> -
> -	rdtgrp->mode = RDT_MODE_SHAREABLE;
> -
> -out:
> -	rdt_staged_configs_clear();
> -	return ret;
> -}
> -
> -static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
> -{
> -	int ret;
> -
> -	if (!resctrl_arch_mon_capable())
> -		return 0;
> -
> -	ret = alloc_rmid(rdtgrp->closid);
> -	if (ret < 0) {
> -		rdt_last_cmd_puts("Out of RMIDs\n");
> -		return ret;
> -	}
> -	rdtgrp->mon.rmid = ret;
> -
> -	ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
> -	if (ret) {
> -		rdt_last_cmd_puts("kernfs subdir error\n");
> -		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> -		return ret;
> -	}
> -
> -	return 0;
> -}
> -
> -static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
> -{
> -	if (resctrl_arch_mon_capable())
> -		free_rmid(rgrp->closid, rgrp->mon.rmid);
> -}
> -
> -static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
> -			     const char *name, umode_t mode,
> -			     enum rdt_group_type rtype, struct rdtgroup **r)
> -{
> -	struct rdtgroup *prdtgrp, *rdtgrp;
> -	unsigned long files = 0;
> -	struct kernfs_node *kn;
> -	int ret;
> -
> -	prdtgrp = rdtgroup_kn_lock_live(parent_kn);
> -	if (!prdtgrp) {
> -		ret = -ENODEV;
> -		goto out_unlock;
> -	}
> -
> -	if (rtype == RDTMON_GROUP &&
> -	    (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> -	     prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
> -		ret = -EINVAL;
> -		rdt_last_cmd_puts("Pseudo-locking in progress\n");
> -		goto out_unlock;
> -	}
> -
> -	/* allocate the rdtgroup. */
> -	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
> -	if (!rdtgrp) {
> -		ret = -ENOSPC;
> -		rdt_last_cmd_puts("Kernel out of memory\n");
> -		goto out_unlock;
> -	}
> -	*r = rdtgrp;
> -	rdtgrp->mon.parent = prdtgrp;
> -	rdtgrp->type = rtype;
> -	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
> -
> -	/* kernfs creates the directory for rdtgrp */
> -	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
> -	if (IS_ERR(kn)) {
> -		ret = PTR_ERR(kn);
> -		rdt_last_cmd_puts("kernfs create error\n");
> -		goto out_free_rgrp;
> -	}
> -	rdtgrp->kn = kn;
> -
> -	/*
> -	 * kernfs_remove() will drop the reference count on "kn" which
> -	 * will free it. But we still need it to stick around for the
> -	 * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
> -	 * which will be dropped by kernfs_put() in rdtgroup_remove().
> -	 */
> -	kernfs_get(kn);
> -
> -	ret = rdtgroup_kn_set_ugid(kn);
> -	if (ret) {
> -		rdt_last_cmd_puts("kernfs perm error\n");
> -		goto out_destroy;
> -	}
> -
> -	if (rtype == RDTCTRL_GROUP) {
> -		files = RFTYPE_BASE | RFTYPE_CTRL;
> -		if (resctrl_arch_mon_capable())
> -			files |= RFTYPE_MON;
> -	} else {
> -		files = RFTYPE_BASE | RFTYPE_MON;
> -	}
> -
> -	ret = rdtgroup_add_files(kn, files);
> -	if (ret) {
> -		rdt_last_cmd_puts("kernfs fill error\n");
> -		goto out_destroy;
> -	}
> -
> -	/*
> -	 * The caller unlocks the parent_kn upon success.
> -	 */
> -	return 0;
> -
> -out_destroy:
> -	kernfs_put(rdtgrp->kn);
> -	kernfs_remove(rdtgrp->kn);
> -out_free_rgrp:
> -	kfree(rdtgrp);
> -out_unlock:
> -	rdtgroup_kn_unlock(parent_kn);
> -	return ret;
> -}
> -
> -static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
> -{
> -	kernfs_remove(rgrp->kn);
> -	rdtgroup_remove(rgrp);
> -}
> -
> -/*
> - * Create a monitor group under "mon_groups" directory of a control
> - * and monitor group(ctrl_mon). This is a resource group
> - * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
> - */
> -static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
> -			      const char *name, umode_t mode)
> -{
> -	struct rdtgroup *rdtgrp, *prgrp;
> -	int ret;
> -
> -	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
> -	if (ret)
> -		return ret;
> -
> -	prgrp = rdtgrp->mon.parent;
> -	rdtgrp->closid = prgrp->closid;
> -
> -	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
> -	if (ret) {
> -		mkdir_rdt_prepare_clean(rdtgrp);
> -		goto out_unlock;
> -	}
> -
> -	kernfs_activate(rdtgrp->kn);
> -
> -	/*
> -	 * Add the rdtgrp to the list of rdtgrps the parent
> -	 * ctrl_mon group has to track.
> -	 */
> -	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
> -
> -out_unlock:
> -	rdtgroup_kn_unlock(parent_kn);
> -	return ret;
> -}
> -
> -/*
> - * These are rdtgroups created under the root directory. Can be used
> - * to allocate and monitor resources.
> - */
> -static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
> -				   const char *name, umode_t mode)
> -{
> -	struct rdtgroup *rdtgrp;
> -	struct kernfs_node *kn;
> -	u32 closid;
> -	int ret;
> -
> -	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
> -	if (ret)
> -		return ret;
> -
> -	kn = rdtgrp->kn;
> -	ret = closid_alloc();
> -	if (ret < 0) {
> -		rdt_last_cmd_puts("Out of CLOSIDs\n");
> -		goto out_common_fail;
> -	}
> -	closid = ret;
> -	ret = 0;
> -
> -	rdtgrp->closid = closid;
> -
> -	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
> -	if (ret)
> -		goto out_closid_free;
> -
> -	kernfs_activate(rdtgrp->kn);
> -
> -	ret = rdtgroup_init_alloc(rdtgrp);
> -	if (ret < 0)
> -		goto out_rmid_free;
> -
> -	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
> -
> -	if (resctrl_arch_mon_capable()) {
> -		/*
> -		 * Create an empty mon_groups directory to hold the subset
> -		 * of tasks and cpus to monitor.
> -		 */
> -		ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
> -		if (ret) {
> -			rdt_last_cmd_puts("kernfs subdir error\n");
> -			goto out_del_list;
> -		}
> -	}
> -
> -	goto out_unlock;
> -
> -out_del_list:
> -	list_del(&rdtgrp->rdtgroup_list);
> -out_rmid_free:
> -	mkdir_rdt_prepare_rmid_free(rdtgrp);
> -out_closid_free:
> -	closid_free(closid);
> -out_common_fail:
> -	mkdir_rdt_prepare_clean(rdtgrp);
> -out_unlock:
> -	rdtgroup_kn_unlock(parent_kn);
> -	return ret;
> -}
> -
> -/*
> - * We allow creating mon groups only within a directory called "mon_groups"
> - * which is present in every ctrl_mon group. Check if this is a valid
> - * "mon_groups" directory.
> - *
> - * 1. The directory should be named "mon_groups".
> - * 2. The mon group itself should "not" be named "mon_groups".
> - *   This makes sure the "mon_groups" directory always has a ctrl_mon group
> - *   as its parent.
> - */
> -static bool is_mon_groups(struct kernfs_node *kn, const char *name)
> -{
> -	return (!strcmp(kn->name, "mon_groups") &&
> -		strcmp(name, "mon_groups"));
> -}
> -
> -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
> -			  umode_t mode)
> -{
> -	/* Do not accept '\n' to avoid an unparsable situation. */
> -	if (strchr(name, '\n'))
> -		return -EINVAL;
> -
> -	/*
> -	 * If the parent directory is the root directory and RDT
> -	 * allocation is supported, add a control and monitoring
> -	 * subdirectory
> -	 */
> -	if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
> -		return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
> -
> -	/*
> -	 * If RDT monitoring is supported and the parent directory is a valid
> -	 * "mon_groups" directory, add a monitoring subdirectory.
> -	 */
> -	if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
> -		return rdtgroup_mkdir_mon(parent_kn, name, mode);
> -
> -	return -EPERM;
> -}
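
For reviewers following along from the documentation side: the two branches
above mirror the documented mkdir rules, i.e. a directory made directly
under the mount point becomes a CTRL_MON group, and a directory made under
an existing group's mon_groups directory becomes a MON group. A small
user-space sketch (paths hypothetical; assumes resctrl is mounted at
/sys/fs/resctrl):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
	/* CTRL_MON group: directory directly under the resctrl root */
	if (mkdir("/sys/fs/resctrl/grp0", 0755))
		perror("mkdir grp0");

	/* MON group: directory under the parent group's mon_groups */
	if (mkdir("/sys/fs/resctrl/grp0/mon_groups/mon0", 0755))
		perror("mkdir mon0");

	return 0;
}

Anything else (for instance a mkdir directly inside grp0) falls through to
the -EPERM at the end.
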
> -
> -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> -{
> -	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
> -	u32 closid, rmid;
> -	int cpu;
> -
> -	/* Give any tasks back to the parent group */
> -	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
> -
> -	/* Update per cpu rmid of the moved CPUs first */
> -	closid = rdtgrp->closid;
> -	rmid = prdtgrp->mon.rmid;
> -	for_each_cpu(cpu, &rdtgrp->cpu_mask)
> -		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
> -
> -	/*
> -	 * Update the MSR on moved CPUs and CPUs which have a moved
> -	 * task running on them.
> -	 */
> -	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
> -	update_closid_rmid(tmpmask, NULL);
> -
> -	rdtgrp->flags = RDT_DELETED;
> -	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> -
> -	/*
> -	 * Remove the rdtgrp from the parent ctrl_mon group's list
> -	 */
> -	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
> -	list_del(&rdtgrp->mon.crdtgrp_list);
> -
> -	kernfs_remove(rdtgrp->kn);
> -
> -	return 0;
> -}
> -
> -static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
> -{
> -	rdtgrp->flags = RDT_DELETED;
> -	list_del(&rdtgrp->rdtgroup_list);
> -
> -	kernfs_remove(rdtgrp->kn);
> -	return 0;
> -}
> -
> -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> -{
> -	u32 closid, rmid;
> -	int cpu;
> -
> -	/* Give any tasks back to the default group */
> -	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
> -
> -	/* Give any CPUs back to the default group */
> -	cpumask_or(&rdtgroup_default.cpu_mask,
> -		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
> -
> -	/* Update per cpu closid and rmid of the moved CPUs first */
> -	closid = rdtgroup_default.closid;
> -	rmid = rdtgroup_default.mon.rmid;
> -	for_each_cpu(cpu, &rdtgrp->cpu_mask)
> -		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
> -
> -	/*
> -	 * Update the MSR on moved CPUs and CPUs which have a moved
> -	 * task running on them.
> -	 */
> -	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
> -	update_closid_rmid(tmpmask, NULL);
> -
> -	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> -	closid_free(rdtgrp->closid);
> -
> -	rdtgroup_ctrl_remove(rdtgrp);
> -
> -	/*
> -	 * Free all the child monitor group rmids.
> -	 */
> -	free_all_child_rdtgrp(rdtgrp);
> -
> -	return 0;
> -}
> -
> -static int rdtgroup_rmdir(struct kernfs_node *kn)
> -{
> -	struct kernfs_node *parent_kn = kn->parent;
> -	struct rdtgroup *rdtgrp;
> -	cpumask_var_t tmpmask;
> -	int ret = 0;
> -
> -	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
> -		return -ENOMEM;
> -
> -	rdtgrp = rdtgroup_kn_lock_live(kn);
> -	if (!rdtgrp) {
> -		ret = -EPERM;
> -		goto out;
> -	}
> -
> -	/*
> -	 * If the rdtgroup is a ctrl_mon group and parent directory
> -	 * is the root directory, remove the ctrl_mon group.
> -	 *
> -	 * If the rdtgroup is a mon group and parent directory
> -	 * is a valid "mon_groups" directory, remove the mon group.
> -	 */
> -	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
> -	    rdtgrp != &rdtgroup_default) {
> -		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> -		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> -			ret = rdtgroup_ctrl_remove(rdtgrp);
> -		} else {
> -			ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
> -		}
> -	} else if (rdtgrp->type == RDTMON_GROUP &&
> -		 is_mon_groups(parent_kn, kn->name)) {
> -		ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
> -	} else {
> -		ret = -EPERM;
> -	}
> -
> -out:
> -	rdtgroup_kn_unlock(kn);
> -	free_cpumask_var(tmpmask);
> -	return ret;
> -}
> -
> -/**
> - * mongrp_reparent() - replace parent CTRL_MON group of a MON group
> - * @rdtgrp:		the MON group whose parent should be replaced
> - * @new_prdtgrp:	replacement parent CTRL_MON group for @rdtgrp
> - * @cpus:		cpumask provided by the caller for use during this call
> - *
> - * Replaces the parent CTRL_MON group for a MON group, resulting in all member
> - * tasks' CLOSID immediately changing to that of the new parent group.
> - * Monitoring data for the group is unaffected by this operation.
> - */
> -static void mongrp_reparent(struct rdtgroup *rdtgrp,
> -			    struct rdtgroup *new_prdtgrp,
> -			    cpumask_var_t cpus)
> -{
> -	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
> -
> -	WARN_ON(rdtgrp->type != RDTMON_GROUP);
> -	WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
> -
> -	/* Nothing to do when simply renaming a MON group. */
> -	if (prdtgrp == new_prdtgrp)
> -		return;
> -
> -	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
> -	list_move_tail(&rdtgrp->mon.crdtgrp_list,
> -		       &new_prdtgrp->mon.crdtgrp_list);
> -
> -	rdtgrp->mon.parent = new_prdtgrp;
> -	rdtgrp->closid = new_prdtgrp->closid;
> -
> -	/* Propagate updated closid to all tasks in this group. */
> -	rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
> -
> -	update_closid_rmid(cpus, NULL);
> -}
> -
> -static int rdtgroup_rename(struct kernfs_node *kn,
> -			   struct kernfs_node *new_parent, const char *new_name)
> -{
> -	struct rdtgroup *new_prdtgrp;
> -	struct rdtgroup *rdtgrp;
> -	cpumask_var_t tmpmask;
> -	int ret;
> -
> -	rdtgrp = kernfs_to_rdtgroup(kn);
> -	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
> -	if (!rdtgrp || !new_prdtgrp)
> -		return -ENOENT;
> -
> -	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
> -	rdtgroup_kn_get(rdtgrp, kn);
> -	rdtgroup_kn_get(new_prdtgrp, new_parent);
> -
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	rdt_last_cmd_clear();
> -
> -	/*
> -	 * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
> -	 * either kernfs_node is a file.
> -	 */
> -	if (kernfs_type(kn) != KERNFS_DIR ||
> -	    kernfs_type(new_parent) != KERNFS_DIR) {
> -		rdt_last_cmd_puts("Source and destination must be directories");
> -		ret = -EPERM;
> -		goto out;
> -	}
> -
> -	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
> -		ret = -ENOENT;
> -		goto out;
> -	}
> -
> -	if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
> -	    !is_mon_groups(kn->parent, kn->name)) {
> -		rdt_last_cmd_puts("Source must be a MON group\n");
> -		ret = -EPERM;
> -		goto out;
> -	}
> -
> -	if (!is_mon_groups(new_parent, new_name)) {
> -		rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
> -		ret = -EPERM;
> -		goto out;
> -	}
> -
> -	/*
> -	 * If the MON group is monitoring CPUs, the CPUs must be assigned to the
> -	 * current parent CTRL_MON group and therefore cannot be assigned to
> -	 * the new parent, making the move illegal.
> -	 */
> -	if (!cpumask_empty(&rdtgrp->cpu_mask) &&
> -	    rdtgrp->mon.parent != new_prdtgrp) {
> -		rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
> -		ret = -EPERM;
> -		goto out;
> -	}
> -
> -	/*
> -	 * Allocate the cpumask for use in mongrp_reparent() to avoid the
> -	 * possibility of failing to allocate it after kernfs_rename() has
> -	 * succeeded.
> -	 */
> -	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -
> -	/*
> -	 * Perform all input validation and allocations needed to ensure
> -	 * mongrp_reparent() will succeed before calling kernfs_rename(),
> -	 * otherwise it would be necessary to revert this call if
> -	 * mongrp_reparent() failed.
> -	 */
> -	ret = kernfs_rename(kn, new_parent, new_name);
> -	if (!ret)
> -		mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
> -
> -	free_cpumask_var(tmpmask);
> -
> -out:
> -	mutex_unlock(&rdtgroup_mutex);
> -	rdtgroup_kn_put(rdtgrp, kn);
> -	rdtgroup_kn_put(new_prdtgrp, new_parent);
> -	return ret;
> -}
> -
> -static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
> -{
> -	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
> -		seq_puts(seq, ",cdp");
> -
> -	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
> -		seq_puts(seq, ",cdpl2");
> -
> -	if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
> -		seq_puts(seq, ",mba_MBps");
> -
> -	if (resctrl_debug)
> -		seq_puts(seq, ",debug");
> -
> -	return 0;
> -}
> -
> -static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
> -	.mkdir		= rdtgroup_mkdir,
> -	.rmdir		= rdtgroup_rmdir,
> -	.rename		= rdtgroup_rename,
> -	.show_options	= rdtgroup_show_options,
> -};
> -
> -static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
> -{
> -	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
> -				      KERNFS_ROOT_CREATE_DEACTIVATED |
> -				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
> -				      &rdtgroup_default);
> -	if (IS_ERR(rdt_root))
> -		return PTR_ERR(rdt_root);
> -
> -	ctx->kfc.root = rdt_root;
> -	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
> -
> -	return 0;
> -}
> -
> -static void rdtgroup_destroy_root(void)
> -{
> -	kernfs_destroy_root(rdt_root);
> -	rdtgroup_default.kn = NULL;
> -}
> -
> -static void rdtgroup_setup_default(void)
> -{
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
> -	rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
> -	rdtgroup_default.type = RDTCTRL_GROUP;
> -	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
> -
> -	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
> -
> -	mutex_unlock(&rdtgroup_mutex);
> -}
> -
> -static void domain_destroy_mon_state(struct rdt_domain *d)
> -{
> -	bitmap_free(d->rmid_busy_llc);
> -	kfree(d->mbm_total);
> -	kfree(d->mbm_local);
> -}
> -
> -void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
> -{
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
> -		mba_sc_domain_destroy(r, d);
> -
> -	if (!r->mon_capable)
> -		goto out_unlock;
> -
> -	/*
> -	 * If resctrl is mounted, remove all the
> -	 * per domain monitor data directories.
> -	 */
> -	if (resctrl_mounted && resctrl_arch_mon_capable())
> -		rmdir_mondata_subdir_allrdtgrp(r, d->id);
> -
> -	if (resctrl_is_mbm_enabled())
> -		cancel_delayed_work(&d->mbm_over);
> -	if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
> -		/*
> -		 * When a package is going down, forcefully
> -		 * decrement rmid->ebusy. There is no way to know
> -		 * that the L3 was flushed and hence may lead to
> -		 * incorrect counts in rare scenarios, but leaving
> -		 * the RMID as busy creates RMID leaks if the
> -		 * package never comes back.
> -		 */
> -		__check_limbo(d, true);
> -		cancel_delayed_work(&d->cqm_limbo);
> -	}
> -
> -	domain_destroy_mon_state(d);
> -
> -out_unlock:
> -	mutex_unlock(&rdtgroup_mutex);
> -}
> -
> -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
> -{
> -	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> -	size_t tsize;
> -
> -	if (resctrl_arch_is_llc_occupancy_enabled()) {
> -		d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
> -		if (!d->rmid_busy_llc)
> -			return -ENOMEM;
> -	}
> -	if (resctrl_arch_is_mbm_total_enabled()) {
> -		tsize = sizeof(*d->mbm_total);
> -		d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
> -		if (!d->mbm_total) {
> -			bitmap_free(d->rmid_busy_llc);
> -			return -ENOMEM;
> -		}
> -	}
> -	if (resctrl_arch_is_mbm_local_enabled()) {
> -		tsize = sizeof(*d->mbm_local);
> -		d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
> -		if (!d->mbm_local) {
> -			bitmap_free(d->rmid_busy_llc);
> -			kfree(d->mbm_total);
> -			return -ENOMEM;
> -		}
> -	}
> -
> -	return 0;
> -}
> -
> -int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
> -{
> -	int err = 0;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -
> -	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
> -		/* RDT_RESOURCE_MBA is never mon_capable */
> -		err = mba_sc_domain_allocate(r, d);
> -		goto out_unlock;
> -	}
> -
> -	if (!r->mon_capable)
> -		goto out_unlock;
> -
> -	err = domain_setup_mon_state(r, d);
> -	if (err)
> -		goto out_unlock;
> -
> -	if (resctrl_is_mbm_enabled()) {
> -		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
> -		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
> -					   RESCTRL_PICK_ANY_CPU);
> -	}
> -
> -	if (resctrl_arch_is_llc_occupancy_enabled())
> -		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
> -
> -	/*
> -	 * If the filesystem is not mounted then only the default resource group
> -	 * exists. Creation of its directories is deferred until mount time
> -	 * by rdt_get_tree() calling mkdir_mondata_all().
> -	 * If resctrl is mounted, add per domain monitor data directories.
> -	 */
> -	if (resctrl_mounted && resctrl_arch_mon_capable())
> -		mkdir_mondata_subdir_allrdtgrp(r, d);
> -
> -out_unlock:
> -	mutex_unlock(&rdtgroup_mutex);
> -
> -	return err;
> -}
> -
> -void resctrl_online_cpu(unsigned int cpu)
> -{
> -	mutex_lock(&rdtgroup_mutex);
> -	/* The CPU is set in default rdtgroup after online. */
> -	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
> -	mutex_unlock(&rdtgroup_mutex);
> -}
> -
> -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
> -{
> -	struct rdtgroup *cr;
> -
> -	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
> -		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
> -			break;
> -	}
> -}
> -
> -void resctrl_offline_cpu(unsigned int cpu)
> -{
> -	struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> -	struct rdtgroup *rdtgrp;
> -	struct rdt_domain *d;
> -
> -	mutex_lock(&rdtgroup_mutex);
> -	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> -		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
> -			clear_childcpus(rdtgrp, cpu);
> -			break;
> -		}
> -	}
> -
> -	if (!l3->mon_capable)
> -		goto out_unlock;
> -
> -	d = resctrl_get_domain_from_cpu(cpu, l3);
> -	if (d) {
> -		if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
> -			cancel_delayed_work(&d->mbm_over);
> -			mbm_setup_overflow_handler(d, 0, cpu);
> -		}
> -		if (resctrl_arch_is_llc_occupancy_enabled() &&
> -		    cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
> -			cancel_delayed_work(&d->cqm_limbo);
> -			cqm_setup_limbo_handler(d, 0, cpu);
> -		}
> -	}
> -
> -out_unlock:
> -	mutex_unlock(&rdtgroup_mutex);
> -}
> -
> -/*
> - * resctrl_init - resctrl filesystem initialization
> - *
> - * Set up the resctrl filesystem: set up the root, create the mount point,
> - * register the resctrl filesystem, and initialize files under the root directory.
> - *
> - * Return: 0 on success or -errno
> - */
> -int resctrl_init(void)
> -{
> -	int ret = 0;
> -
> -	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
> -		     sizeof(last_cmd_status_buf));
> -
> -	rdtgroup_setup_default();
> -
> -	thread_throttle_mode_init();
> -
> -	ret = resctrl_mon_resource_init();
> -	if (ret)
> -		return ret;
> -
> -	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
> -	if (ret)
> -		return ret;
> -
> -	ret = register_filesystem(&rdt_fs_type);
> -	if (ret)
> -		goto cleanup_mountpoint;
> -
> -	/*
> -	 * Adding the resctrl debugfs directory here may not be ideal since
> -	 * it would let the resctrl debugfs directory appear on the debugfs
> -	 * filesystem before the resctrl filesystem is mounted.
> -	 * It may also be ok since that would enable debugging of RDT before
> -	 * resctrl is mounted.
> -	 * The reason why the debugfs directory is created here and not in
> -	 * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
> -	 * during the debugfs directory creation also &sb->s_type->i_mutex_key
> -	 * (the lockdep class of inode->i_rwsem). Other filesystem
> -	 * interactions (eg. SyS_getdents) have the lock ordering:
> -	 * &sb->s_type->i_mutex_key --> &mm->mmap_lock
> -	 * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
> -	 * is taken, thus creating dependency:
> -	 * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
> -	 * issues considering the other two lock dependencies.
> -	 * By creating the debugfs directory here we avoid a dependency
> -	 * that may cause deadlock (even though file operations cannot
> -	 * occur until the filesystem is mounted, but I do not know how to
> -	 * tell lockdep that).
> -	 */
> -	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
> -
> -	return 0;
> -
> -cleanup_mountpoint:
> -	sysfs_remove_mount_point(fs_kobj, "resctrl");
> -
> -	return ret;
> -}
> -
> -void resctrl_exit(void)
> -{
> -	debugfs_remove_recursive(debugfs_resctrl);
> -	unregister_filesystem(&rdt_fs_type);
> -	sysfs_remove_mount_point(fs_kobj, "resctrl");
> -
> -	resctrl_mon_resource_exit();
> -}
> diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
> index e69de29bb2d1..a8f2dd66ede3 100644
> --- a/fs/resctrl/ctrlmondata.c
> +++ b/fs/resctrl/ctrlmondata.c
> @@ -0,0 +1,527 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Resource Director Technology(RDT)
> + * - Cache Allocation code.
> + *
> + * Copyright (C) 2016 Intel Corporation
> + *
> + * Authors:
> + *    Fenghua Yu <fenghua.yu@...el.com>
> + *    Tony Luck <tony.luck@...el.com>
> + *
> + * More information about RDT can be found in the Intel (R) x86 Architecture
> + * Software Developer Manual June 2016, volume 3, section 17.17.
> + */
> +
> +#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
> +
> +#include <linux/cpu.h>
> +#include <linux/kernfs.h>
> +#include <linux/seq_file.h>
> +#include <linux/slab.h>
> +#include "internal.h"
> +
> +struct rdt_parse_data {
> +	struct rdtgroup		*rdtgrp;
> +	char			*buf;
> +};
> +
> +typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
> +			       struct resctrl_schema *s,
> +			       struct rdt_domain *d);
> +
> +/*
> + * Check whether the MBA bandwidth percentage value is correct. The value is
> + * checked against the minimum and maximum bandwidth values specified by the
> + * hardware. The allocated bandwidth percentage is rounded to the next
> + * control step available on the hardware.
> + */
> +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
> +{
> +	unsigned long bw;
> +	int ret;
> +
> +	/*
> +	 * Only linear delay values are supported for current Intel SKUs.
> +	 */
> +	if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
> +		rdt_last_cmd_puts("No support for non-linear MB domains\n");
> +		return false;
> +	}
> +
> +	ret = kstrtoul(buf, 10, &bw);
> +	if (ret) {
> +		rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
> +		return false;
> +	}
> +
> +	if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
> +	    !is_mba_sc(r)) {
> +		rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
> +				    r->membw.min_bw, r->default_ctrl);
> +		return false;
> +	}
> +
> +	*data = roundup(bw, (unsigned long)r->membw.bw_gran);
> +	return true;
> +}
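
To make the rounding above concrete: with a hypothetical bw_gran of 10, a
request of 35 is rounded up to the next control step, 40. A standalone
sketch of the same arithmetic:

#include <stdio.h>

/* Same behaviour as the kernel's roundup() for positive integers. */
static unsigned long roundup_step(unsigned long x, unsigned long step)
{
	return ((x + step - 1) / step) * step;
}

int main(void)
{
	unsigned long bw_gran = 10;	/* hypothetical r->membw.bw_gran */

	printf("35 -> %lu\n", roundup_step(35, bw_gran));	/* prints 40 */
	return 0;
}
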
> +
> +static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
> +		    struct rdt_domain *d)
> +{
> +	struct resctrl_staged_config *cfg;
> +	u32 closid = data->rdtgrp->closid;
> +	struct rdt_resource *r = s->res;
> +	unsigned long bw_val;
> +
> +	cfg = &d->staged_config[s->conf_type];
> +	if (cfg->have_new_ctrl) {
> +		rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
> +		return -EINVAL;
> +	}
> +
> +	if (!bw_validate(data->buf, &bw_val, r))
> +		return -EINVAL;
> +
> +	if (is_mba_sc(r)) {
> +		d->mbps_val[closid] = bw_val;
> +		return 0;
> +	}
> +
> +	cfg->new_ctrl = bw_val;
> +	cfg->have_new_ctrl = true;
> +
> +	return 0;
> +}
> +
> +/*
> + * Check whether a cache bit mask is valid.
> + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
> + *   - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
> + *   - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
> + *
> + * Haswell does not support a non-contiguous 1s value and additionally
> + * requires at least two bits set.
> + * AMD allows non-contiguous bitmasks.
> + */
> +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
> +{
> +	unsigned long first_bit, zero_bit, val;
> +	unsigned int cbm_len = r->cache.cbm_len;
> +	int ret;
> +
> +	ret = kstrtoul(buf, 16, &val);
> +	if (ret) {
> +		rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
> +		return false;
> +	}
> +
> +	if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
> +		rdt_last_cmd_puts("Mask out of range\n");
> +		return false;
> +	}
> +
> +	first_bit = find_first_bit(&val, cbm_len);
> +	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
> +
> +	/* Are non-contiguous bitmasks allowed? */
> +	if (!r->cache.arch_has_sparse_bitmasks &&
> +	    (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
> +		rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
> +		return false;
> +	}
> +
> +	if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
> +		rdt_last_cmd_printf("Need at least %d bits in the mask\n",
> +				    r->cache.min_cbm_bits);
> +		return false;
> +	}
> +
> +	*data = val;
> +	return true;
> +}
> +
> +/*
> + * Read one cache bit mask (hex). Check that it is valid for the current
> + * resource type.
> + */
> +static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
> +		     struct rdt_domain *d)
> +{
> +	struct rdtgroup *rdtgrp = data->rdtgrp;
> +	struct resctrl_staged_config *cfg;
> +	struct rdt_resource *r = s->res;
> +	u32 cbm_val;
> +
> +	cfg = &d->staged_config[s->conf_type];
> +	if (cfg->have_new_ctrl) {
> +		rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Cannot set up more than one pseudo-locked region in a cache
> +	 * hierarchy.
> +	 */
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
> +	    rdtgroup_pseudo_locked_in_hierarchy(d)) {
> +		rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!cbm_validate(data->buf, &cbm_val, r))
> +		return -EINVAL;
> +
> +	if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
> +	     rdtgrp->mode == RDT_MODE_SHAREABLE) &&
> +	    rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
> +		rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * The CBM may not overlap with the CBM of another closid if
> +	 * either is exclusive.
> +	 */
> +	if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
> +		rdt_last_cmd_puts("Overlaps with exclusive group\n");
> +		return -EINVAL;
> +	}
> +
> +	if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
> +		if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
> +		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +			rdt_last_cmd_puts("Overlaps with other group\n");
> +			return -EINVAL;
> +		}
> +	}
> +
> +	cfg->new_ctrl = cbm_val;
> +	cfg->have_new_ctrl = true;
> +
> +	return 0;
> +}
> +
> +static ctrlval_parser_t *get_parser(struct rdt_resource *res)
> +{
> +	if (res->fflags & RFTYPE_RES_CACHE)
> +		return &parse_cbm;
> +	else
> +		return &parse_bw;
> +}
> +
> +/*
> + * For each domain in this resource we expect to find a series of:
> + *	id=mask
> + * separated by ";". The "id" is in decimal, and must match one of
> + * the "id"s for this resource.
> + */
> +static int parse_line(char *line, struct resctrl_schema *s,
> +		      struct rdtgroup *rdtgrp)
> +{
> +	ctrlval_parser_t *parse_ctrlval = get_parser(s->res);
> +	enum resctrl_conf_type t = s->conf_type;
> +	struct resctrl_staged_config *cfg;
> +	struct rdt_resource *r = s->res;
> +	struct rdt_parse_data data;
> +	char *dom = NULL, *id;
> +	struct rdt_domain *d;
> +	unsigned long dom_id;
> +
> +	/* Walking r->domains, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
> +	    (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
> +		rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
> +		return -EINVAL;
> +	}
> +
> +next:
> +	if (!line || line[0] == '\0')
> +		return 0;
> +	dom = strsep(&line, ";");
> +	id = strsep(&dom, "=");
> +	if (!dom || kstrtoul(id, 10, &dom_id)) {
> +		rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
> +		return -EINVAL;
> +	}
> +	dom = strim(dom);
> +	list_for_each_entry(d, &r->domains, list) {
> +		if (d->id == dom_id) {
> +			data.buf = dom;
> +			data.rdtgrp = rdtgrp;
> +			if (parse_ctrlval(&data, s, d))
> +				return -EINVAL;
> +			if (rdtgrp->mode ==  RDT_MODE_PSEUDO_LOCKSETUP) {
> +				cfg = &d->staged_config[t];
> +				/*
> +				 * We are in pseudo-locking setup mode and
> +				 * just parsed a valid CBM that should be
> +				 * pseudo-locked. Only one locked region is
> +				 * allowed per resource group and domain, so
> +				 * just do the required initialization for a
> +				 * single region and return.
> +				 */
> +				rdtgrp->plr->s = s;
> +				rdtgrp->plr->d = d;
> +				rdtgrp->plr->cbm = cfg->new_ctrl;
> +				d->plr = rdtgrp->plr;
> +				return 0;
> +			}
> +			goto next;
> +		}
> +	}
> +	return -EINVAL;
> +}
> +
> +static int rdtgroup_parse_resource(char *resname, char *tok,
> +				   struct rdtgroup *rdtgrp)
> +{
> +	struct resctrl_schema *s;
> +
> +	list_for_each_entry(s, &resctrl_schema_all, list) {
> +		if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
> +			return parse_line(tok, s, rdtgrp);
> +	}
> +	rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
> +	return -EINVAL;
> +}
> +
> +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
> +				char *buf, size_t nbytes, loff_t off)
> +{
> +	struct resctrl_schema *s;
> +	struct rdtgroup *rdtgrp;
> +	struct rdt_resource *r;
> +	char *tok, *resname;
> +	int ret = 0;
> +
> +	/* Valid input requires a trailing newline */
> +	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> +		return -EINVAL;
> +	buf[nbytes - 1] = '\0';
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (!rdtgrp) {
> +		rdtgroup_kn_unlock(of->kn);
> +		return -ENOENT;
> +	}
> +	rdt_last_cmd_clear();
> +
> +	/*
> +	 * No changes to pseudo-locked region allowed. It has to be removed
> +	 * and re-created instead.
> +	 */
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> +		ret = -EINVAL;
> +		rdt_last_cmd_puts("Resource group is pseudo-locked\n");
> +		goto out;
> +	}
> +
> +	rdt_staged_configs_clear();
> +
> +	while ((tok = strsep(&buf, "\n")) != NULL) {
> +		resname = strim(strsep(&tok, ":"));
> +		if (!tok) {
> +			rdt_last_cmd_puts("Missing ':'\n");
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +		if (tok[0] == '\0') {
> +			rdt_last_cmd_printf("Missing '%s' value\n", resname);
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +		ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	list_for_each_entry(s, &resctrl_schema_all, list) {
> +		r = s->res;
> +
> +		/*
> +		 * Writes to mba_sc resources update the software controller,
> +		 * not the control MSR.
> +		 */
> +		if (is_mba_sc(r))
> +			continue;
> +
> +		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +		/*
> +		 * If pseudo-locking fails we keep the resource group in
> +		 * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
> +		 * active and updated for just the domain the pseudo-locked
> +		 * region was requested for.
> +		 */
> +		ret = rdtgroup_pseudo_lock_create(rdtgrp);
> +	}
> +
> +out:
> +	rdt_staged_configs_clear();
> +	rdtgroup_kn_unlock(of->kn);
> +	return ret ?: nbytes;
> +}
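
For context, the write handler above takes one "resource:id=val;id=val" line
per resource, and the whole buffer must end in a newline. A user-space
sketch of driving it (group path and control values are hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *line = "L3:0=ff;1=f0\nMB:0=50;1=70\n";
	int fd = open("/sys/fs/resctrl/grp0/schemata", O_WRONLY);

	if (fd < 0 || write(fd, line, strlen(line)) < 0)
		perror("schemata");
	if (fd >= 0)
		close(fd);
	return 0;
}

Dropping the trailing newline gets rejected with -EINVAL by the check at the
top of the function.
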
> +
> +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
> +{
> +	struct rdt_resource *r = schema->res;
> +	struct rdt_domain *dom;
> +	bool sep = false;
> +	u32 ctrl_val;
> +
> +	/* Walking r->domains, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +	seq_printf(s, "%*s:", max_name_width, schema->name);
> +	list_for_each_entry(dom, &r->domains, list) {
> +		if (sep)
> +			seq_puts(s, ";");
> +
> +		if (is_mba_sc(r))
> +			ctrl_val = dom->mbps_val[closid];
> +		else
> +			ctrl_val = resctrl_arch_get_config(r, dom, closid,
> +							   schema->conf_type);
> +
> +		seq_printf(s, r->format_str, dom->id, max_data_width,
> +			   ctrl_val);
> +		sep = true;
> +	}
> +	seq_puts(s, "\n");
> +}
> +
> +int rdtgroup_schemata_show(struct kernfs_open_file *of,
> +			   struct seq_file *s, void *v)
> +{
> +	struct resctrl_schema *schema;
> +	struct rdtgroup *rdtgrp;
> +	int ret = 0;
> +	u32 closid;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (rdtgrp) {
> +		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +			list_for_each_entry(schema, &resctrl_schema_all, list) {
> +				seq_printf(s, "%s:uninitialized\n", schema->name);
> +			}
> +		} else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> +			if (!rdtgrp->plr->d) {
> +				rdt_last_cmd_clear();
> +				rdt_last_cmd_puts("Cache domain offline\n");
> +				ret = -ENODEV;
> +			} else {
> +				seq_printf(s, "%s:%d=%x\n",
> +					   rdtgrp->plr->s->res->name,
> +					   rdtgrp->plr->d->id,
> +					   rdtgrp->plr->cbm);
> +			}
> +		} else {
> +			closid = rdtgrp->closid;
> +			list_for_each_entry(schema, &resctrl_schema_all, list) {
> +				if (closid < schema->num_closid)
> +					show_doms(s, schema, closid);
> +			}
> +		}
> +	} else {
> +		ret = -ENOENT;
> +	}
> +	rdtgroup_kn_unlock(of->kn);
> +	return ret;
> +}
> +
> +static int smp_mon_event_count(void *arg)
> +{
> +	mon_event_count(arg);
> +
> +	return 0;
> +}
> +
> +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> +		    struct rdt_domain *d, struct rdtgroup *rdtgrp,
> +		    int evtid, int first)
> +{
> +	int cpu;
> +
> +	/* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +	/*
> +	 * Setup the parameters to pass to mon_event_count() to read the data.
> +	 */
> +	rr->rgrp = rdtgrp;
> +	rr->evtid = evtid;
> +	rr->r = r;
> +	rr->d = d;
> +	rr->val = 0;
> +	rr->first = first;
> +	rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
> +	if (IS_ERR(rr->arch_mon_ctx)) {
> +		rr->err = -EINVAL;
> +		return;
> +	}
> +
> +	cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU);
> +
> +	/*
> +	 * cpumask_any_housekeeping() prefers housekeeping CPUs, but
> +	 * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
> +	 * MPAM's resctrl_arch_rmid_read() is unable to read the
> +	 * counters on some platforms if it's called in IRQ context.
> +	 */
> +	if (tick_nohz_full_cpu(cpu))
> +		smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
> +	else
> +		smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
> +
> +	resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
> +}
> +
> +int rdtgroup_mondata_show(struct seq_file *m, void *arg)
> +{
> +	struct kernfs_open_file *of = m->private;
> +	u32 resid, evtid, domid;
> +	struct rdtgroup *rdtgrp;
> +	struct rdt_resource *r;
> +	union mon_data_bits md;
> +	struct rdt_domain *d;
> +	struct rmid_read rr;
> +	int ret = 0;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (!rdtgrp) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	md.priv = of->kn->priv;
> +	resid = md.u.rid;
> +	domid = md.u.domid;
> +	evtid = md.u.evtid;
> +
> +	r = resctrl_arch_get_resource(resid);
> +	d = resctrl_arch_find_domain(r, domid);
> +	if (IS_ERR_OR_NULL(d)) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	mon_event_read(&rr, r, d, rdtgrp, evtid, false);
> +
> +	if (rr.err == -EIO)
> +		seq_puts(m, "Error\n");
> +	else if (rr.err == -EINVAL)
> +		seq_puts(m, "Unavailable\n");
> +	else
> +		seq_printf(m, "%llu\n", rr.val);
> +
> +out:
> +	rdtgroup_kn_unlock(of->kn);
> +	return ret;
> +}
> diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
> index e69de29bb2d1..f73267762a87 100644
> --- a/fs/resctrl/internal.h
> +++ b/fs/resctrl/internal.h
> @@ -0,0 +1,340 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _FS_RESCTRL_INTERNAL_H
> +#define _FS_RESCTRL_INTERNAL_H
> +
> +#include <linux/resctrl.h>
> +#include <linux/sched.h>
> +#include <linux/kernfs.h>
> +#include <linux/fs_context.h>
> +#include <linux/jump_label.h>
> +#include <linux/tick.h>
> +
> +#include <asm/resctrl.h>
> +
> +/**
> + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
> + *			        aren't marked nohz_full
> + * @mask:	The mask to pick a CPU from.
> + * @exclude_cpu: The CPU to avoid picking.
> + *
> + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
> + * CPUs that don't use nohz_full, these are preferred. Pass
> + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
> + *
> + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
> + */
> +static inline unsigned int
> +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
> +{
> +	unsigned int cpu, hk_cpu;
> +
> +	if (exclude_cpu == RESCTRL_PICK_ANY_CPU)
> +		cpu = cpumask_any(mask);
> +	else
> +		cpu = cpumask_any_but(mask, exclude_cpu);
> +
> +	if (!IS_ENABLED(CONFIG_NO_HZ_FULL))
> +		return cpu;
> +
> +	/* If the CPU picked isn't marked nohz_full nothing more needs doing. */
> +	if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu))
> +		return cpu;
> +
> +	/* Try to find a CPU that isn't nohz_full to use in preference */
> +	hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
> +	if (hk_cpu == exclude_cpu)
> +		hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);
> +
> +	if (hk_cpu < nr_cpu_ids)
> +		cpu = hk_cpu;
> +
> +	return cpu;
> +}
> +
> +struct rdt_fs_context {
> +	struct kernfs_fs_context	kfc;
> +	bool				enable_cdpl2;
> +	bool				enable_cdpl3;
> +	bool				enable_mba_mbps;
> +	bool				enable_debug;
> +};
> +
> +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
> +{
> +	struct kernfs_fs_context *kfc = fc->fs_private;
> +
> +	return container_of(kfc, struct rdt_fs_context, kfc);
> +}
> +
> +/**
> + * struct mon_evt - Entry in the event list of a resource
> + * @evtid:		event id
> + * @name:		name of the event
> + * @configurable:	true if the event is configurable
> + * @list:		entry in &rdt_resource->evt_list
> + */
> +struct mon_evt {
> +	enum resctrl_event_id	evtid;
> +	char			*name;
> +	bool			configurable;
> +	struct list_head	list;
> +};
> +
> +/**
> + * union mon_data_bits - Monitoring details for each event file
> + * @priv:              Used to store monitoring event data in @u
> + *                     as kernfs private data
> + * @rid:               Resource id associated with the event file
> + * @evtid:             Event id associated with the event file
> + * @domid:             The domain to which the event file belongs
> + * @u:                 Name of the bit fields struct
> + */
> +union mon_data_bits {
> +	void *priv;
> +	struct {
> +		unsigned int rid		: 10;
> +		enum resctrl_event_id evtid	: 8;
> +		unsigned int domid		: 14;
> +	} u;
> +};
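
A quick sanity check on the encoding above: the three bit-fields
(10 + 8 + 14 = 32 bits) fit in a single pointer-sized word, which is what
lets the whole thing travel as kernfs private data. A compile-and-run
sketch (the enum value is a stand-in, not the real resctrl event id; enum
bit-fields need GCC or clang):

#include <assert.h>
#include <stdio.h>

enum resctrl_event_id { EXAMPLE_EVENT_ID = 1 };	/* stand-in only */

union mon_data_bits {
	void *priv;
	struct {
		unsigned int rid		: 10;
		enum resctrl_event_id evtid	: 8;
		unsigned int domid		: 14;
	} u;
};

int main(void)
{
	union mon_data_bits md = { .priv = NULL };

	md.u.rid = 2;		/* hypothetical resource id */
	md.u.evtid = EXAMPLE_EVENT_ID;
	md.u.domid = 5;		/* hypothetical domain id */

	assert(sizeof(md.u) <= sizeof(void *));
	printf("rid=%u evtid=%u domid=%u\n",
	       (unsigned)md.u.rid, (unsigned)md.u.evtid,
	       (unsigned)md.u.domid);
	return 0;
}
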
> +
> +struct rmid_read {
> +	struct rdtgroup		*rgrp;
> +	struct rdt_resource	*r;
> +	struct rdt_domain	*d;
> +	enum resctrl_event_id	evtid;
> +	bool			first;
> +	int			err;
> +	u64			val;
> +	void			*arch_mon_ctx;
> +};
> +
> +extern struct list_head resctrl_schema_all;
> +extern bool resctrl_mounted;
> +
> +enum rdt_group_type {
> +	RDTCTRL_GROUP = 0,
> +	RDTMON_GROUP,
> +	RDT_NUM_GROUP,
> +};
> +
> +/**
> + * enum rdtgrp_mode - Mode of a RDT resource group
> + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
> + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
> + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
> + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
> + *                          allowed AND the allocations are Cache Pseudo-Locked
> + * @RDT_NUM_MODES: Total number of modes
> + *
> + * The mode of a resource group enables control over the allowed overlap
> + * between allocations associated with different resource groups (classes
> + * of service). User is able to modify the mode of a resource group by
> + * writing to the "mode" resctrl file associated with the resource group.
> + *
> + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
> + * writing the appropriate text to the "mode" file. A resource group enters
> + * "pseudo-locked" mode after the schemata is written while the resource
> + * group is in "pseudo-locksetup" mode.
> + */
> +enum rdtgrp_mode {
> +	RDT_MODE_SHAREABLE = 0,
> +	RDT_MODE_EXCLUSIVE,
> +	RDT_MODE_PSEUDO_LOCKSETUP,
> +	RDT_MODE_PSEUDO_LOCKED,
> +
> +	/* Must be last */
> +	RDT_NUM_MODES,
> +};
> +
> +/**
> + * struct mongroup - store mon group's data in resctrl fs.
> + * @mon_data_kn:		kernfs node for the mon_data directory
> + * @parent:			parent rdtgrp
> + * @crdtgrp_list:		child rdtgroup node list
> + * @rmid:			rmid for this rdtgroup
> + */
> +struct mongroup {
> +	struct kernfs_node	*mon_data_kn;
> +	struct rdtgroup		*parent;
> +	struct list_head	crdtgrp_list;
> +	u32			rmid;
> +};
> +
> +/**
> + * struct rdtgroup - store rdtgroup's data in resctrl file system.
> + * @kn:				kernfs node
> + * @rdtgroup_list:		linked list for all rdtgroups
> + * @closid:			closid for this rdtgroup
> + * @cpu_mask:			CPUs assigned to this rdtgroup
> + * @flags:			status bits
> + * @waitcount:			how many cpus expect to find this
> + *				group when they acquire rdtgroup_mutex
> + * @type:			indicates type of this rdtgroup - either
> + *				monitor only or ctrl_mon group
> + * @mon:			mongroup related data
> + * @mode:			mode of resource group
> + * @plr:			pseudo-locked region
> + */
> +struct rdtgroup {
> +	struct kernfs_node		*kn;
> +	struct list_head		rdtgroup_list;
> +	u32				closid;
> +	struct cpumask			cpu_mask;
> +	int				flags;
> +	atomic_t			waitcount;
> +	enum rdt_group_type		type;
> +	struct mongroup			mon;
> +	enum rdtgrp_mode		mode;
> +	struct pseudo_lock_region	*plr;
> +};
> +
> +/* List of all resource groups */
> +extern struct list_head rdt_all_groups;
> +
> +extern int max_name_width, max_data_width;
> +
> +/**
> + * struct rftype - describe each file in the resctrl file system
> + * @name:	File name
> + * @mode:	Access mode
> + * @kf_ops:	File operations
> + * @flags:	File specific RFTYPE_FLAGS_* flags
> + * @fflags:	File specific RFTYPE_* flags
> + * @seq_show:	Show content of the file
> + * @write:	Write to the file
> + */
> +struct rftype {
> +	char			*name;
> +	umode_t			mode;
> +	const struct kernfs_ops	*kf_ops;
> +	unsigned long		flags;
> +	unsigned long		fflags;
> +
> +	int (*seq_show)(struct kernfs_open_file *of,
> +			struct seq_file *sf, void *v);
> +	/*
> +	 * write() is the generic write callback which maps directly to
> +	 * kernfs write operation and overrides all other operations.
> +	 * Maximum write size is determined by ->max_write_len.
> +	 */
> +	ssize_t (*write)(struct kernfs_open_file *of,
> +			 char *buf, size_t nbytes, loff_t off);
> +};
> +
> +/**
> + * struct mbm_state - status for each MBM counter in each domain
> + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
> + * @prev_bw:	The most recent bandwidth in MBps
> + */
> +struct mbm_state {
> +	u64	prev_bw_bytes;
> +	u32	prev_bw;
> +};
> +
> +static inline bool is_mba_sc(struct rdt_resource *r)
> +{
> +	if (!r)
> +		r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> +
> +	/*
> +	 * The software controller support is only applicable to MBA resource.
> +	 * Make sure to check for resource type.
> +	 */
> +	if (r->rid != RDT_RESOURCE_MBA)
> +		return false;
> +
> +	return r->membw.mba_sc;
> +}
> +
> +extern struct mutex rdtgroup_mutex;
> +extern struct rdtgroup rdtgroup_default;
> +extern struct dentry *debugfs_resctrl;
> +
> +void rdt_last_cmd_clear(void);
> +void rdt_last_cmd_puts(const char *s);
> +__printf(1, 2)
> +void rdt_last_cmd_printf(const char *fmt, ...);
> +
> +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
> +void rdtgroup_kn_unlock(struct kernfs_node *kn);
> +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
> +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
> +			     umode_t mask);
> +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
> +				char *buf, size_t nbytes, loff_t off);
> +int rdtgroup_schemata_show(struct kernfs_open_file *of,
> +			   struct seq_file *s, void *v);
> +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
> +			   unsigned long cbm, int closid, bool exclusive);
> +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
> +				  unsigned long cbm);
> +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
> +int rdtgroup_tasks_assigned(struct rdtgroup *r);
> +int closids_supported(void);
> +void closid_free(int closid);
> +int alloc_rmid(u32 closid);
> +void free_rmid(u32 closid, u32 rmid);
> +void resctrl_mon_resource_exit(void);
> +void mon_event_count(void *info);
> +int rdtgroup_mondata_show(struct seq_file *m, void *arg);
> +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> +		    struct rdt_domain *d, struct rdtgroup *rdtgrp,
> +		    int evtid, int first);
> +int resctrl_mon_resource_init(void);
> +void mbm_setup_overflow_handler(struct rdt_domain *dom,
> +				unsigned long delay_ms,
> +				int exclude_cpu);
> +void mbm_handle_overflow(struct work_struct *work);
> +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
> +			     int exclude_cpu);
> +void cqm_handle_limbo(struct work_struct *work);
> +bool has_busy_rmid(struct rdt_domain *d);
> +void __check_limbo(struct rdt_domain *d, bool force_free);
> +void mbm_config_rftype_init(const char *config);
> +void rdt_staged_configs_clear(void);
> +bool closid_allocated(unsigned int closid);
> +int resctrl_find_cleanest_closid(void);
> +
> +#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
> +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
> +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
> +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
> +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
> +int rdt_pseudo_lock_init(void);
> +void rdt_pseudo_lock_release(void);
> +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
> +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
> +#else
> +static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
> +{
> +	return false;
> +}
> +
> +static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
> +{
> +	return false;
> +}
> +
> +static inline int rdt_pseudo_lock_init(void) { return 0; }
> +static inline void rdt_pseudo_lock_release(void) { }
> +static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
> +#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
> +
> +#endif /* _FS_RESCTRL_INTERNAL_H */
> diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
> index e69de29bb2d1..06f660dfd929 100644
> --- a/fs/resctrl/monitor.c
> +++ b/fs/resctrl/monitor.c
> @@ -0,0 +1,843 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Resource Director Technology(RDT)
> + * - Monitoring code
> + *
> + * Copyright (C) 2017 Intel Corporation
> + *
> + * Author:
> + *    Vikas Shivappa <vikas.shivappa@...el.com>
> + *
> + * This replaces the perf-based cqm.c, but reuses a lot of code and
> + * data structures originally from Peter Zijlstra and Matt Fleming.
> + *
> + * More information about RDT can be found in the Intel (R) x86 Architecture
> + * Software Developer Manual, June 2016, volume 3, section 17.17.
> + */
> +
> +#include <linux/cpu.h>
> +#include <linux/module.h>
> +#include <linux/sizes.h>
> +#include <linux/slab.h>
> +#include "internal.h"
> +
> +/*
> + * struct rmid_entry - dirty tracking for all RMIDs.
> + * @closid:	The CLOSID for this entry.
> + * @rmid:	The RMID for this entry.
> + * @busy:	The number of domains with cached data using this RMID.
> + * @list:	Member of the rmid_free_lru list when busy == 0.
> + *
> + * Depending on the architecture the correct monitor is accessed using
> + * both @closid and @rmid, or @rmid only.
> + *
> + * Take the rdtgroup_mutex when accessing.
> + */
> +struct rmid_entry {
> +	u32				closid;
> +	u32				rmid;
> +	int				busy;
> +	struct list_head		list;
> +};
> +
> +/*
> + * @rmid_free_lru - A least recently used list of free RMIDs.
> + *     These RMIDs are guaranteed to have an occupancy less than the
> + *     threshold occupancy.
> + */
> +static LIST_HEAD(rmid_free_lru);
> +
> +/*
> + * @closid_num_dirty_rmid - The number of dirty RMID each CLOSID has.
> + *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
> + *     Indexed by CLOSID. Protected by rdtgroup_mutex.
> + */
> +static u32 *closid_num_dirty_rmid;
> +
> +/*
> + * @rmid_limbo_count - count of currently unused but (potentially)
> + *     dirty RMIDs.
> + *     This counts RMIDs that no one is currently using but that
> + *     may have an occupancy value > resctrl_rmid_realloc_threshold. The user
> + *     can change the threshold occupancy value.
> + */
> +static unsigned int rmid_limbo_count;
> +
> +/*
> + * @rmid_ptrs - Array of rmid_entry structures used by the limbo and free lists.
> + */
> +static struct rmid_entry	*rmid_ptrs;
> +
> +/*
> + * This is the threshold cache occupancy in bytes at which we will consider an
> + * RMID available for re-allocation.
> + */
> +unsigned int resctrl_rmid_realloc_threshold;
> +
> +/*
> + * This is the maximum value for the reallocation threshold, in bytes.
> + */
> +unsigned int resctrl_rmid_realloc_limit;
> +
> +/*
> + * x86 and arm64 differ in their handling of monitoring.
> + * x86's RMID are independent numbers, there is only one source of traffic
> + * with an RMID value of '1'.
> + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
> + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
> + * value is no longer unique.
> + * To account for this, resctrl uses an index. On x86 this is just the RMID,
> + * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
> + *
> + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
> + * must accept an attempt to read every index.
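> + *
> + * For example (illustrative only - the exact layout is up to
> + * resctrl_arch_rmid_idx_encode()): x86 would map CLOSID 3, RMID 7 to
> + * index 7, while arm64 might map it to something like (3 << 16) | 7 so
> + * that the CLOSID becomes part of the index.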
> + */
> +static inline struct rmid_entry *__rmid_entry(u32 idx)
> +{
> +	struct rmid_entry *entry;
> +	u32 closid, rmid;
> +
> +	entry = &rmid_ptrs[idx];
> +	resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
> +
> +	WARN_ON_ONCE(entry->closid != closid);
> +	WARN_ON_ONCE(entry->rmid != rmid);
> +
> +	return entry;
> +}
> +
> +static void limbo_release_entry(struct rmid_entry *entry)
> +{
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	rmid_limbo_count--;
> +	list_add_tail(&entry->list, &rmid_free_lru);
> +
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> +		closid_num_dirty_rmid[entry->closid]--;
> +}
> +
> +/*
> + * Check the RMIDs that are marked as busy for this domain. If the
> + * reported LLC occupancy is below the threshold, clear the busy bit and
> + * decrement the count. If the busy count gets to zero on an RMID, we
> + * free the RMID.
> + */
> +void __check_limbo(struct rdt_domain *d, bool force_free)
> +{
> +	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> +	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> +	struct rmid_entry *entry;
> +	u32 idx, cur_idx = 1;
> +	void *arch_mon_ctx;
> +	bool rmid_dirty;
> +	u64 val = 0;
> +
> +	arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
> +	if (IS_ERR(arch_mon_ctx)) {
> +		pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> +				    PTR_ERR(arch_mon_ctx));
> +		return;
> +	}
> +
> +	/*
> +	 * Skip RMID 0 and start from RMID 1: for every RMID marked as
> +	 * busy, check whether its occupancy is below the threshold. If it
> +	 * is, decrement the RMID's busy counter and move the RMID to the
> +	 * free list when the counter reaches 0.
> +	 */
> +	for (;;) {
> +		idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
> +		if (idx >= idx_limit)
> +			break;
> +
> +		entry = __rmid_entry(idx);
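> +		/*
> +		 * If the counter cannot be read, err on the side of keeping
> +		 * the RMID in limbo by treating it as dirty.
> +		 */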
> +		if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
> +					   QOS_L3_OCCUP_EVENT_ID, &val,
> +					   arch_mon_ctx)) {
> +			rmid_dirty = true;
> +		} else {
> +			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
> +		}
> +
> +		if (force_free || !rmid_dirty) {
> +			clear_bit(idx, d->rmid_busy_llc);
> +			if (!--entry->busy)
> +				limbo_release_entry(entry);
> +		}
> +		cur_idx = idx + 1;
> +	}
> +
> +	resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
> +}
> +
> +bool has_busy_rmid(struct rdt_domain *d)
> +{
> +	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> +
> +	return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
> +}
> +
> +static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
> +{
> +	struct rmid_entry *itr;
> +	u32 itr_idx, cmp_idx;
> +
> +	if (list_empty(&rmid_free_lru))
> +		return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
> +
> +	list_for_each_entry(itr, &rmid_free_lru, list) {
> +		/*
> +		 * Get the index of this free RMID, and the index it would need
> +		 * to be if it were used with this CLOSID.
> +		 * If the CLOSID is irrelevant on this architecture, the two
> +		 * index values are always the same on every entry and thus the
> +		 * very first entry will be returned.
> +		 */
> +		itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
> +		cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
> +
> +		if (itr_idx == cmp_idx)
> +			return itr;
> +	}
> +
> +	return ERR_PTR(-ENOSPC);
> +}
> +
> +/**
> + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
> + *                                  RMID are clean, or the CLOSID that has
> + *                                  the most clean RMID.
> + *
> + * MPAM's equivalent of RMID is per-CLOSID, meaning a freshly allocated CLOSID
> + * may not be able to allocate clean RMIDs. To avoid this the allocator will
> + * choose the CLOSID with the most clean RMID.
> + *
> + * When the CLOSID and RMID are independent numbers, the first free CLOSID will
> + * be returned.
> + */
> +int resctrl_find_cleanest_closid(void)
> +{
> +	u32 cleanest_closid = ~0;
> +	int i = 0;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> +		return -EIO;
> +
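> +	/*
> +	 * Prefer a free CLOSID with no dirty RMID; otherwise remember the
> +	 * free CLOSID with the fewest dirty RMID seen so far.
> +	 */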
> +	for (i = 0; i < closids_supported(); i++) {
> +		int num_dirty;
> +
> +		if (closid_allocated(i))
> +			continue;
> +
> +		num_dirty = closid_num_dirty_rmid[i];
> +		if (num_dirty == 0)
> +			return i;
> +
> +		if (cleanest_closid == ~0)
> +			cleanest_closid = i;
> +
> +		if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
> +			cleanest_closid = i;
> +	}
> +
> +	if (cleanest_closid == ~0)
> +		return -ENOSPC;
> +
> +	return cleanest_closid;
> +}
> +
> +/*
> + * For MPAM the RMID value is not unique, and has to be considered with
> + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
> + * allows all domains to be managed by a single free list.
> + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
> + */
> +int alloc_rmid(u32 closid)
> +{
> +	struct rmid_entry *entry;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	entry = resctrl_find_free_rmid(closid);
> +	if (IS_ERR(entry))
> +		return PTR_ERR(entry);
> +
> +	list_del(&entry->list);
> +	return entry->rmid;
> +}
> +
> +static void add_rmid_to_limbo(struct rmid_entry *entry)
> +{
> +	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> +	struct rdt_domain *d;
> +	u32 idx;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	/* Walking r->domains, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
> +
> +	entry->busy = 0;
> +	list_for_each_entry(d, &r->domains, list) {
> +		/*
> +		 * For the first limbo RMID in the domain,
> +		 * set up the limbo worker.
> +		 */
> +		if (!has_busy_rmid(d))
> +			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
> +						RESCTRL_PICK_ANY_CPU);
> +		set_bit(idx, d->rmid_busy_llc);
> +		entry->busy++;
> +	}
> +
> +	rmid_limbo_count++;
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> +		closid_num_dirty_rmid[entry->closid]++;
> +}
> +
> +void free_rmid(u32 closid, u32 rmid)
> +{
> +	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> +	struct rmid_entry *entry;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	/*
> +	 * Do not allow the default rmid to be freed. Comparing by index
> +	 * allows architectures that ignore the closid parameter to avoid an
> +	 * unnecessary check.
> +	 */
> +	if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
> +						RESCTRL_RESERVED_RMID))
> +		return;
> +
> +	entry = __rmid_entry(idx);
> +
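> +	/*
> +	 * Only LLC occupancy monitoring can still report stale data for a
> +	 * freed RMID, so only then does the RMID need to pass through limbo
> +	 * before it can be reused.
> +	 */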
> +	if (resctrl_arch_is_llc_occupancy_enabled())
> +		add_rmid_to_limbo(entry);
> +	else
> +		list_add_tail(&entry->list, &rmid_free_lru);
> +}
> +
> +static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
> +				       u32 rmid, enum resctrl_event_id evtid)
> +{
> +	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> +
> +	switch (evtid) {
> +	case QOS_L3_MBM_TOTAL_EVENT_ID:
> +		return &d->mbm_total[idx];
> +	case QOS_L3_MBM_LOCAL_EVENT_ID:
> +		return &d->mbm_local[idx];
> +	default:
> +		return NULL;
> +	}
> +}
> +
> +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
> +{
> +	struct mbm_state *m;
> +	u64 tval = 0;
> +
> +	if (rr->first) {
> +		resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
> +		m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
> +		if (m)
> +			memset(m, 0, sizeof(struct mbm_state));
> +		return 0;
> +	}
> +
> +	rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
> +					 &tval, rr->arch_mon_ctx);
> +	if (rr->err)
> +		return rr->err;
> +
> +	rr->val += tval;
> +
> +	return 0;
> +}
> +
> +/*
> + * mbm_bw_count() - Update bw count from values previously read by
> + *		    __mon_event_count().
> + * @closid:	The closid used to identify the cached mbm_state.
> + * @rmid:	The rmid used to identify the cached mbm_state.
> + * @rr:		The struct rmid_read populated by __mon_event_count().
> + *
> + * Supporting function to calculate the memory bandwidth
> + * in MBps. The bytes value previously read by
> + * __mon_event_count() is compared with the value from the previous
> + * invocation. This must be called once per second to maintain values in MBps.
> + */
> +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
> +{
> +	u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> +	struct mbm_state *m = &rr->d->mbm_local[idx];
> +	u64 cur_bw, bytes, cur_bytes;
> +
> +	cur_bytes = rr->val;
> +	bytes = cur_bytes - m->prev_bw_bytes;
> +	m->prev_bw_bytes = cur_bytes;
> +
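> +	/*
> +	 * Illustrative example: if 512MiB were counted since the previous
> +	 * one-second poll, bytes == 512 * SZ_1M and cur_bw == 512 MBps.
> +	 */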
> +	cur_bw = bytes / SZ_1M;
> +
> +	m->prev_bw = cur_bw;
> +}
> +
> +/*
> + * This is scheduled by mon_event_read() to read the CQM/MBM counters
> + * on a domain.
> + */
> +void mon_event_count(void *info)
> +{
> +	struct rdtgroup *rdtgrp, *entry;
> +	struct rmid_read *rr = info;
> +	struct list_head *head;
> +	int ret;
> +
> +	rdtgrp = rr->rgrp;
> +
> +	ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
> +
> +	/*
> +	 * For Ctrl groups read data from child monitor groups and
> +	 * add them together. Count events which are read successfully;
> +	 * discard the reads that report errors.
> +	 */
> +	head = &rdtgrp->mon.crdtgrp_list;
> +
> +	if (rdtgrp->type == RDTCTRL_GROUP) {
> +		list_for_each_entry(entry, head, mon.crdtgrp_list) {
> +			if (__mon_event_count(entry->closid, entry->mon.rmid,
> +					      rr) == 0)
> +				ret = 0;
> +		}
> +	}
> +
> +	/*
> +	 * __mon_event_count() calls for newly created monitor groups may
> +	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
> +	 * Discard error if any of the monitor event reads succeeded.
> +	 */
> +	if (ret == 0)
> +		rr->err = 0;
> +}
> +
> +/*
> + * Feedback loop for MBA software controller (mba_sc)
> + *
> + * mba_sc is a feedback loop where we periodically read MBM counters and
> + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
> + * that:
> + *
> + *   current bandwidth(cur_bw) < user specified bandwidth(user_bw)
> + *
> + * This uses the MBM counters to measure the bandwidth and MBA throttle
> + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
> + * fact that resctrl rdtgroups have both monitoring and control.
> + *
> + * The frequency of the checks is 1s and we just tag along the MBM overflow
> + * timer. Having a 1s interval makes the calculation of bandwidth simpler.
> + *
> + * Although MBA's goal is to restrict the bandwidth to a maximum, there may
> + * be a need to increase the bandwidth to avoid unnecessarily restricting
> + * the L2 <-> L3 traffic.
> + *
> + * Since MBA controls the L2 external bandwidth whereas MBM measures the
> + * L3 external bandwidth, the following sequence could lead to such a
> + * situation.
> + *
> + * Consider an rdtgroup which had high L3 <-> memory traffic in its initial
> + * phases: mba_sc kicks in and reduces the bandwidth percentage values, but
> + * after some time the rdtgroup has mostly L2 <-> L3 traffic.
> + *
> + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
> + * throttle MSRs already have low percentage values.  To avoid
> + * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
> + */
> +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
> +{
> +	u32 closid, rmid, cur_msr_val, new_msr_val;
> +	struct mbm_state *pmbm_data, *cmbm_data;
> +	struct rdt_resource *r_mba;
> +	struct rdt_domain *dom_mba;
> +	u32 cur_bw, user_bw, idx;
> +	struct list_head *head;
> +	struct rdtgroup *entry;
> +
> +	if (!resctrl_arch_is_mbm_local_enabled())
> +		return;
> +
> +	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> +
> +	closid = rgrp->closid;
> +	rmid = rgrp->mon.rmid;
> +	idx = resctrl_arch_rmid_idx_encode(closid, rmid);
> +	pmbm_data = &dom_mbm->mbm_local[idx];
> +
> +	dom_mba = resctrl_get_domain_from_cpu(smp_processor_id(), r_mba);
> +	if (!dom_mba) {
> +		pr_warn_once("Failure to get domain for MBA update\n");
> +		return;
> +	}
> +
> +	cur_bw = pmbm_data->prev_bw;
> +	user_bw = dom_mba->mbps_val[closid];
> +
> +	/* MBA resource doesn't support CDP */
> +	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
> +
> +	/*
> +	 * For Ctrl groups read data from child monitor groups.
> +	 */
> +	head = &rgrp->mon.crdtgrp_list;
> +	list_for_each_entry(entry, head, mon.crdtgrp_list) {
> +		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
> +		cur_bw += cmbm_data->prev_bw;
> +	}
> +
> +	/*
> +	 * Scale up/down the bandwidth linearly for the ctrl group.  The
> +	 * bandwidth step is the bandwidth granularity specified by the
> +	 * hardware.
> +	 * Always increase throttling if current bandwidth is above the
> +	 * target set by user.
> +	 * But avoid thrashing up and down on every poll by checking
> +	 * whether a decrease in throttling is likely to push the group
> +	 * back over target. E.g. if currently throttling to 30% of bandwidth
> +	 * on a system with 10% granularity steps, check whether moving to
> +	 * 40% would go past the limit by multiplying current bandwidth by
> +	 * "(30 + 10) / 30".
> +	 */
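> +	/*
> +	 * Illustrative numbers, assuming min_bw == bw_gran == 10: with
> +	 * user_bw = 1000 MBps, cur_bw = 1200 MBps and cur_msr_val = 30 the
> +	 * first branch tightens throttling to 20%. With cur_bw = 700 MBps
> +	 * the second branch relaxes to 40% only because 700 * (30 + 10) / 30
> +	 * (roughly 933 MBps) is still below user_bw.
> +	 */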
> +	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
> +		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
> +	} else if (cur_msr_val < MAX_MBA_BW &&
> +		   (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
> +		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
> +	} else {
> +		return;
> +	}
> +
> +	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
> +}
> +
> +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
> +		       u32 closid, u32 rmid)
> +{
> +	struct rmid_read rr;
> +
> +	rr.first = false;
> +	rr.r = r;
> +	rr.d = d;
> +
> +	/*
> +	 * This is protected from concurrent reads from userspace, as both
> +	 * the userspace read path and this code hold the global mutex.
> +	 */
> +	if (resctrl_arch_is_mbm_total_enabled()) {
> +		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
> +		rr.val = 0;
> +		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> +		if (IS_ERR(rr.arch_mon_ctx)) {
> +			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> +					    PTR_ERR(rr.arch_mon_ctx));
> +			return;
> +		}
> +
> +		__mon_event_count(closid, rmid, &rr);
> +
> +		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> +	}
> +	if (resctrl_arch_is_mbm_local_enabled()) {
> +		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
> +		rr.val = 0;
> +		rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
> +		if (IS_ERR(rr.arch_mon_ctx)) {
> +			pr_warn_ratelimited("Failed to allocate monitor context: %ld",
> +					    PTR_ERR(rr.arch_mon_ctx));
> +			return;
> +		}
> +
> +		__mon_event_count(closid, rmid, &rr);
> +
> +		/*
> +		 * Call the MBA software controller only for the
> +		 * control groups and when the user has enabled
> +		 * the software controller explicitly.
> +		 */
> +		if (is_mba_sc(NULL))
> +			mbm_bw_count(closid, rmid, &rr);
> +
> +		resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
> +	}
> +}
> +
> +/*
> + * Handler to scan the limbo list and move RMIDs whose occupancy is
> + * below the reallocation threshold back to the free list.
> + */
> +void cqm_handle_limbo(struct work_struct *work)
> +{
> +	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
> +	struct rdt_domain *d;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	d = container_of(work, struct rdt_domain, cqm_limbo.work);
> +
> +	__check_limbo(d, false);
> +
> +	if (has_busy_rmid(d)) {
> +		d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
> +							   RESCTRL_PICK_ANY_CPU);
> +		schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
> +					 delay);
> +	}
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +}
> +
> +/**
> + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
> + *                             domain.
> + * @dom:           The domain the limbo handler should run for.
> + * @delay_ms:      How far in the future the handler should run.
> + * @exclude_cpu:   Which CPU the handler should not run on,
> + *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
> + */
> +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
> +			     int exclude_cpu)
> +{
> +	unsigned long delay = msecs_to_jiffies(delay_ms);
> +	int cpu;
> +
> +	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
> +	dom->cqm_work_cpu = cpu;
> +
> +	if (cpu < nr_cpu_ids)
> +		schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
> +}
> +
> +void mbm_handle_overflow(struct work_struct *work)
> +{
> +	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
> +	struct rdtgroup *prgrp, *crgrp;
> +	struct list_head *head;
> +	struct rdt_resource *r;
> +	struct rdt_domain *d;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	/*
> +	 * If the filesystem has been unmounted this work no longer needs to
> +	 * run.
> +	 */
> +	if (!resctrl_mounted || !resctrl_arch_mon_capable())
> +		goto out_unlock;
> +
> +	r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> +	d = container_of(work, struct rdt_domain, mbm_over.work);
> +
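> +	/* Update the counters of every control group and its monitor groups. */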
> +	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
> +		mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
> +
> +		head = &prgrp->mon.crdtgrp_list;
> +		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
> +			mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
> +
> +		if (is_mba_sc(NULL))
> +			update_mba_bw(prgrp, d);
> +	}
> +
> +	/*
> +	 * Re-check for housekeeping CPUs. This allows the overflow handler to
> +	 * move off a nohz_full CPU quickly.
> +	 */
> +	d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
> +						   RESCTRL_PICK_ANY_CPU);
> +	schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
> +
> +out_unlock:
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +}
> +
> +/**
> + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
> + *                                domain.
> + * @dom:           The domain the overflow handler should run for.
> + * @delay_ms:      How far in the future the handler should run.
> + * @exclude_cpu:   Which CPU the handler should not run on,
> + *		   RESCTRL_PICK_ANY_CPU to pick any CPU.
> + */
> +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
> +				int exclude_cpu)
> +{
> +	unsigned long delay = msecs_to_jiffies(delay_ms);
> +	int cpu;
> +
> +	/*
> +	 * When a domain comes online there is no guarantee the filesystem is
> +	 * mounted. If not, there is no need to catch counter overflow.
> +	 */
> +	if (!resctrl_mounted || !resctrl_arch_mon_capable())
> +		return;
> +	cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
> +	dom->mbm_work_cpu = cpu;
> +
> +	if (cpu < nr_cpu_ids)
> +		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
> +}
> +
> +static int dom_data_init(struct rdt_resource *r)
> +{
> +	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> +	u32 num_closid = resctrl_arch_get_num_closid(r);
> +	struct rmid_entry *entry = NULL;
> +	int err = 0, i;
> +	u32 idx;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> +		u32 *tmp;
> +
> +		/*
> +		 * If the architecture hasn't provided a sanitised value here,
> +		 * this may result in larger arrays than necessary. Resctrl will
> +		 * use a smaller system wide value based on the resources in
> +		 * use.
> +		 */
> +		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
> +		if (!tmp) {
> +			err = -ENOMEM;
> +			goto out_unlock;
> +		}
> +
> +		closid_num_dirty_rmid = tmp;
> +	}
> +
> +	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
> +	if (!rmid_ptrs) {
> +		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> +			kfree(closid_num_dirty_rmid);
> +			closid_num_dirty_rmid = NULL;
> +		}
> +		err = -ENOMEM;
> +		goto out_unlock;
> +	}
> +
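> +	/* Seed the free list with an entry for every index. */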
> +	for (i = 0; i < idx_limit; i++) {
> +		entry = &rmid_ptrs[i];
> +		INIT_LIST_HEAD(&entry->list);
> +
> +		resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
> +		list_add_tail(&entry->list, &rmid_free_lru);
> +	}
> +
> +	/*
> +	 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
> +	 * are always allocated. These are used for the rdtgroup_default
> +	 * control group, which will be set up later in rdtgroup_init().
> +	 */
> +	idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
> +					   RESCTRL_RESERVED_RMID);
> +	entry = __rmid_entry(idx);
> +	list_del(&entry->list);
> +
> +out_unlock:
> +	mutex_unlock(&rdtgroup_mutex);
> +
> +	return err;
> +}
> +
> +static void dom_data_exit(struct rdt_resource *r)
> +{
> +	if (!r->mon_capable)
> +		return;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> +		kfree(closid_num_dirty_rmid);
> +		closid_num_dirty_rmid = NULL;
> +	}
> +
> +	kfree(rmid_ptrs);
> +	rmid_ptrs = NULL;
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +}
> +
> +static struct mon_evt llc_occupancy_event = {
> +	.name		= "llc_occupancy",
> +	.evtid		= QOS_L3_OCCUP_EVENT_ID,
> +};
> +
> +static struct mon_evt mbm_total_event = {
> +	.name		= "mbm_total_bytes",
> +	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
> +};
> +
> +static struct mon_evt mbm_local_event = {
> +	.name		= "mbm_local_bytes",
> +	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
> +};
> +
> +/*
> + * Initialize the event list for the resource.
> + *
> + * Note that MBM events are also part of RDT_RESOURCE_L3 resource
> + * because as per the SDM the total and local memory bandwidth
> + * are enumerated as part of L3 monitoring.
> + */
> +static void l3_mon_evt_init(struct rdt_resource *r)
> +{
> +	INIT_LIST_HEAD(&r->evt_list);
> +
> +	if (resctrl_arch_is_llc_occupancy_enabled())
> +		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
> +	if (resctrl_arch_is_mbm_total_enabled())
> +		list_add_tail(&mbm_total_event.list, &r->evt_list);
> +	if (resctrl_arch_is_mbm_local_enabled())
> +		list_add_tail(&mbm_local_event.list, &r->evt_list);
> +}
> +
> +int resctrl_mon_resource_init(void)
> +{
> +	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> +	int ret;
> +
> +	if (!r->mon_capable)
> +		return 0;
> +
> +	ret = dom_data_init(r);
> +	if (ret)
> +		return ret;
> +
> +	l3_mon_evt_init(r);
> +
> +	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
> +		mbm_total_event.configurable = true;
> +		mbm_config_rftype_init("mbm_total_bytes_config");
> +	}
> +	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
> +		mbm_local_event.configurable = true;
> +		mbm_config_rftype_init("mbm_local_bytes_config");
> +	}
> +
> +	return 0;
> +}
> +
> +void resctrl_mon_resource_exit(void)
> +{
> +	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> +
> +	dom_data_exit(r);
> +}
> diff --git a/fs/resctrl/psuedo_lock.c b/fs/resctrl/psuedo_lock.c
> index e69de29bb2d1..077c2abb6edd 100644
> --- a/fs/resctrl/psuedo_lock.c
> +++ b/fs/resctrl/psuedo_lock.c
> @@ -0,0 +1,1122 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Resource Director Technology (RDT)
> + *
> + * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
> + *
> + * Copyright (C) 2018 Intel Corporation
> + *
> + * Author: Reinette Chatre <reinette.chatre@...el.com>
> + */
> +
> +#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
> +
> +#include <linux/cacheinfo.h>
> +#include <linux/cpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/debugfs.h>
> +#include <linux/kthread.h>
> +#include <linux/mman.h>
> +#include <linux/perf_event.h>
> +#include <linux/pm_qos.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +
> +#include <asm/cacheflush.h>
> +#include <asm/resctrl.h>
> +#include <asm/perf_event.h>
> +
> +#include "internal.h"
> +
> +/*
> + * Major number assigned to and shared by all devices exposing
> + * pseudo-locked regions.
> + */
> +static unsigned int pseudo_lock_major;
> +static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
> +
> +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
> +{
> +	const struct rdtgroup *rdtgrp;
> +
> +	rdtgrp = dev_get_drvdata(dev);
> +	if (mode)
> +		*mode = 0600;
> +	return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
> +}
> +
> +static const struct class pseudo_lock_class = {
> +	.name = "pseudo_lock",
> +	.devnode = pseudo_lock_devnode,
> +};
> +
> +/**
> + * pseudo_lock_minor_get - Obtain available minor number
> + * @minor: Pointer to where new minor number will be stored
> + *
> + * A bitmask is used to track available minor numbers. Here the next free
> + * minor number is marked as unavailable and returned.
> + *
> + * Return: 0 on success, <0 on failure.
> + */
> +static int pseudo_lock_minor_get(unsigned int *minor)
> +{
> +	unsigned long first_bit;
> +
> +	first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
> +
> +	if (first_bit == MINORBITS)
> +		return -ENOSPC;
> +
> +	__clear_bit(first_bit, &pseudo_lock_minor_avail);
> +	*minor = first_bit;
> +
> +	return 0;
> +}
> +
> +/**
> + * pseudo_lock_minor_release - Return minor number to available
> + * @minor: The minor number made available
> + */
> +static void pseudo_lock_minor_release(unsigned int minor)
> +{
> +	__set_bit(minor, &pseudo_lock_minor_avail);
> +}
> +
> +/**
> + * region_find_by_minor - Locate a pseudo-lock region by inode minor number
> + * @minor: The minor number of the device representing pseudo-locked region
> + *
> + * When the character device is accessed we need to determine which
> + * pseudo-locked region it belongs to. This is done by matching the minor
> + * number of the device to the pseudo-locked region that owns it.
> + *
> + * Minor numbers are assigned at the time a pseudo-locked region is associated
> + * with a cache instance.
> + *
> + * Return: On success return pointer to resource group owning the pseudo-locked
> + *         region, NULL on failure.
> + */
> +static struct rdtgroup *region_find_by_minor(unsigned int minor)
> +{
> +	struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
> +
> +	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> +		if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
> +			rdtgrp_match = rdtgrp;
> +			break;
> +		}
> +	}
> +	return rdtgrp_match;
> +}
> +
> +/**
> + * struct pseudo_lock_pm_req - A power management QoS request list entry
> + * @list:	Entry within the @pm_reqs list for a pseudo-locked region
> + * @req:	PM QoS request
> + */
> +struct pseudo_lock_pm_req {
> +	struct list_head list;
> +	struct dev_pm_qos_request req;
> +};
> +
> +static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
> +{
> +	struct pseudo_lock_pm_req *pm_req, *next;
> +
> +	list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
> +		dev_pm_qos_remove_request(&pm_req->req);
> +		list_del(&pm_req->list);
> +		kfree(pm_req);
> +	}
> +}
> +
> +/**
> + * pseudo_lock_cstates_constrain - Restrict cores from entering C6
> + * @plr: Pseudo-locked region
> + *
> + * To prevent the cache from being affected by power management, entering
> + * C6 has to be avoided. This is accomplished by requesting a latency
> + * requirement lower than lowest C6 exit latency of all supported
> + * platforms as found in the cpuidle state tables in the intel_idle driver.
> + * At this time it is possible to do so with a single latency requirement
> + * for all supported platforms.
> + *
> + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
> + * the ACPI latencies need to be considered while keeping in mind that C2
> + * may be set to map to deeper sleep states. In this case the latency
> + * requirement needs to prevent entering C2 also.
> + *
> + * Return: 0 on success, <0 on failure
> + */
> +static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
> +{
> +	struct pseudo_lock_pm_req *pm_req;
> +	int cpu;
> +	int ret;
> +
> +	for_each_cpu(cpu, &plr->d->cpu_mask) {
> +		pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
> +		if (!pm_req) {
> +			rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
> +			ret = -ENOMEM;
> +			goto out_err;
> +		}
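> +		/*
> +		 * The resume latency constraint is in microseconds: 30us is
> +		 * below the C6 exit latency of all supported platforms (see
> +		 * the comment above this function).
> +		 */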
> +		ret = dev_pm_qos_add_request(get_cpu_device(cpu),
> +					     &pm_req->req,
> +					     DEV_PM_QOS_RESUME_LATENCY,
> +					     30);
> +		if (ret < 0) {
> +			rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
> +					    cpu);
> +			kfree(pm_req);
> +			ret = -1;
> +			goto out_err;
> +		}
> +		list_add(&pm_req->list, &plr->pm_reqs);
> +	}
> +
> +	return 0;
> +
> +out_err:
> +	pseudo_lock_cstates_relax(plr);
> +	return ret;
> +}
> +
> +/**
> + * pseudo_lock_region_clear - Reset pseudo-lock region data
> + * @plr: pseudo-lock region
> + *
> + * All content of the pseudo-locked region is reset - any allocated
> + * memory is freed.
> + *
> + * Return: void
> + */
> +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
> +{
> +	plr->size = 0;
> +	plr->line_size = 0;
> +	kfree(plr->kmem);
> +	plr->kmem = NULL;
> +	plr->s = NULL;
> +	if (plr->d)
> +		plr->d->plr = NULL;
> +	plr->d = NULL;
> +	plr->cbm = 0;
> +	plr->debugfs_dir = NULL;
> +}
> +
> +/**
> + * pseudo_lock_region_init - Initialize pseudo-lock region information
> + * @plr: pseudo-lock region
> + *
> + * Called after the user provided a schemata to be pseudo-locked. On entry
> + * the &struct pseudo_lock_region has already been initialized from the
> + * schemata with the resource, domain, and capacity bitmask. Here the
> + * information required for pseudo-locking is deduced from this data and
> + * &struct pseudo_lock_region is initialized further. This information includes:
> + * - size in bytes of the region to be pseudo-locked
> + * - cache line size to know the stride with which data needs to be accessed
> + *   to be pseudo-locked
> + * - a cpu associated with the cache instance on which the pseudo-locking
> + *   flow can be executed
> + *
> + * Return: 0 on success, <0 on failure. Descriptive error will be written
> + * to last_cmd_status buffer.
> + */
> +static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
> +{
> +	struct cpu_cacheinfo *ci;
> +	int ret;
> +	int i;
> +
> +	/* Pick the first cpu we find that is associated with the cache. */
> +	plr->cpu = cpumask_first(&plr->d->cpu_mask);
> +
> +	if (!cpu_online(plr->cpu)) {
> +		rdt_last_cmd_printf("CPU %u associated with cache not online\n",
> +				    plr->cpu);
> +		ret = -ENODEV;
> +		goto out_region;
> +	}
> +
> +	ci = get_cpu_cacheinfo(plr->cpu);
> +
> +	plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
> +
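> +	/* Use the coherency line size of the cache level being pseudo-locked. */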
> +	for (i = 0; i < ci->num_leaves; i++) {
> +		if (ci->info_list[i].level == plr->s->res->cache_level) {
> +			plr->line_size = ci->info_list[i].coherency_line_size;
> +			return 0;
> +		}
> +	}
> +
> +	ret = -1;
> +	rdt_last_cmd_puts("Unable to determine cache line size\n");
> +out_region:
> +	pseudo_lock_region_clear(plr);
> +	return ret;
> +}
> +
> +/**
> + * pseudo_lock_init - Initialize a pseudo-lock region
> + * @rdtgrp: resource group to which new pseudo-locked region will belong
> + *
> + * A pseudo-locked region is associated with a resource group. When this
> + * association is created the pseudo-locked region is initialized. The
> + * details of the pseudo-locked region are not known at this time so only
> + * allocation is done and association established.
> + *
> + * Return: 0 on success, <0 on failure
> + */
> +static int pseudo_lock_init(struct rdtgroup *rdtgrp)
> +{
> +	struct pseudo_lock_region *plr;
> +
> +	plr = kzalloc(sizeof(*plr), GFP_KERNEL);
> +	if (!plr)
> +		return -ENOMEM;
> +
> +	init_waitqueue_head(&plr->lock_thread_wq);
> +	INIT_LIST_HEAD(&plr->pm_reqs);
> +	rdtgrp->plr = plr;
> +	return 0;
> +}
> +
> +/**
> + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
> + * @plr: pseudo-lock region
> + *
> + * Initialize the details required to set up the pseudo-locked region and
> + * allocate the contiguous memory that will be pseudo-locked to the cache.
> + *
> + * Return: 0 on success, <0 on failure.  Descriptive error will be written
> + * to last_cmd_status buffer.
> + */
> +static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
> +{
> +	int ret;
> +
> +	ret = pseudo_lock_region_init(plr);
> +	if (ret < 0)
> +		return ret;
> +
> +	/*
> +	 * We do not yet support contiguous regions larger than
> +	 * KMALLOC_MAX_SIZE.
> +	 */
> +	if (plr->size > KMALLOC_MAX_SIZE) {
> +		rdt_last_cmd_puts("Requested region exceeds maximum size\n");
> +		ret = -E2BIG;
> +		goto out_region;
> +	}
> +
> +	plr->kmem = kzalloc(plr->size, GFP_KERNEL);
> +	if (!plr->kmem) {
> +		rdt_last_cmd_puts("Unable to allocate memory\n");
> +		ret = -ENOMEM;
> +		goto out_region;
> +	}
> +
> +	ret = 0;
> +	goto out;
> +out_region:
> +	pseudo_lock_region_clear(plr);
> +out:
> +	return ret;
> +}
> +
> +/**
> + * pseudo_lock_free - Free a pseudo-locked region
> + * @rdtgrp: resource group to which pseudo-locked region belonged
> + *
> + * The pseudo-locked region's resources have already been released, or not
> + * yet created at this point. Now it can be freed and disassociated from the
> + * resource group.
> + *
> + * Return: void
> + */
> +static void pseudo_lock_free(struct rdtgroup *rdtgrp)
> +{
> +	pseudo_lock_region_clear(rdtgrp->plr);
> +	kfree(rdtgrp->plr);
> +	rdtgrp->plr = NULL;
> +}
> +
> +/**
> + * rdtgroup_monitor_in_progress - Test if monitoring in progress
> + * @rdtgrp: resource group being queried
> + *
> + * Return: 1 if monitor groups have been created for this resource
> + * group, 0 otherwise.
> + */
> +static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
> +{
> +	return !list_empty(&rdtgrp->mon.crdtgrp_list);
> +}
> +
> +/**
> + * rdtgroup_locksetup_user_restrict - Restrict user access to group
> + * @rdtgrp: resource group needing access restricted
> + *
> + * A resource group used for cache pseudo-locking cannot have cpus or tasks
> + * assigned to it. This is communicated to the user by restricting access
> + * to all the files that can be used to make such changes.
> + *
> + * Permissions restored with rdtgroup_locksetup_user_restore()
> + *
> + * Return: 0 on success, <0 on failure. If a failure occurs during the
> + * restriction of access an attempt will be made to restore permissions but
> + * the state of the mode of these files will be uncertain when a failure
> + * occurs.
> + */
> +static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
> +{
> +	int ret;
> +
> +	ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
> +	if (ret)
> +		return ret;
> +
> +	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
> +	if (ret)
> +		goto err_tasks;
> +
> +	ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
> +	if (ret)
> +		goto err_cpus;
> +
> +	if (resctrl_arch_mon_capable()) {
> +		ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
> +		if (ret)
> +			goto err_cpus_list;
> +	}
> +
> +	ret = 0;
> +	goto out;
> +
> +err_cpus_list:
> +	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
> +err_cpus:
> +	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
> +err_tasks:
> +	rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
> +out:
> +	return ret;
> +}
> +
> +/**
> + * rdtgroup_locksetup_user_restore - Restore user access to group
> + * @rdtgrp: resource group needing access restored
> + *
> + * Restore all file access previously removed using
> + * rdtgroup_locksetup_user_restrict()
> + *
> + * Return: 0 on success, <0 on failure.  If a failure occurs during the
> + * restoration of access an attempt will be made to restrict permissions
> + * again but the state of the mode of these files will be uncertain when
> + * a failure occurs.
> + */
> +static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
> +{
> +	int ret;
> +
> +	ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
> +	if (ret)
> +		return ret;
> +
> +	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
> +	if (ret)
> +		goto err_tasks;
> +
> +	ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
> +	if (ret)
> +		goto err_cpus;
> +
> +	if (resctrl_arch_mon_capable()) {
> +		ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
> +		if (ret)
> +			goto err_cpus_list;
> +	}
> +
> +	ret = 0;
> +	goto out;
> +
> +err_cpus_list:
> +	rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
> +err_cpus:
> +	rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
> +err_tasks:
> +	rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
> +out:
> +	return ret;
> +}
> +
> +/**
> + * rdtgroup_locksetup_enter - Resource group enters locksetup mode
> + * @rdtgrp: resource group requested to enter locksetup mode
> + *
> + * A resource group enters locksetup mode to reflect that it would be used
> + * to represent a pseudo-locked region and is in the process of being set
> + * up to do so. A resource group used for a pseudo-locked region loses the
> + * closid associated with it, so we cannot allow it to have any tasks or
> + * cpus assigned, nor permit tasks or cpus to be assigned in the future.
> + * Monitoring of a pseudo-locked region is not allowed either.
> + *
> + * The above and more restrictions on a pseudo-locked region are checked
> + * for and enforced before the resource group enters the locksetup mode.
> + *
> + * Returns: 0 if the resource group successfully entered locksetup mode, <0
> + * on failure. On failure the last_cmd_status buffer is updated with text to
> + * communicate details of failure to the user.
> + */
> +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
> +{
> +	int ret;
> +
> +	/*
> +	 * The default resource group can neither be removed nor lose the
> +	 * default closid associated with it.
> +	 */
> +	if (rdtgrp == &rdtgroup_default) {
> +		rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Cache Pseudo-locking not supported when CDP is enabled.
> +	 *
> +	 * Some things to consider if you would like to enable this
> +	 * support (using L3 CDP as example):
> +	 * - When CDP is enabled two separate resources are exposed,
> +	 *   L3DATA and L3CODE, but they are actually on the same cache.
> +	 *   The implication for pseudo-locking is that if a
> +	 *   pseudo-locked region is created on a domain of one
> +	 *   resource (eg. L3CODE), then a pseudo-locked region cannot
> +	 *   be created on that same domain of the other resource
> +	 *   (eg. L3DATA). This is because the creation of a
> +	 *   pseudo-locked region involves a call to wbinvd that will
> +	 *   affect all cache allocations on that particular domain.
> +	 * - Considering the previous, it may be possible to only
> +	 *   expose one of the CDP resources to pseudo-locking and
> +	 *   hide the other. For example, we could consider exposing only
> +	 *   L3DATA: since the L3 cache is unified it is still possible to
> +	 *   place instructions there and execute them.
> +	 * - If only one region is exposed to pseudo-locking we should
> +	 *   still keep in mind that availability of a portion of cache
> +	 *   for pseudo-locking should take into account both resources.
> +	 *   Similarly, if a pseudo-locked region is created in one
> +	 *   resource, the portion of cache used by it should be made
> +	 *   unavailable to all future allocations from both resources.
> +	 */
> +	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
> +	    resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
> +		rdt_last_cmd_puts("CDP enabled\n");
> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Not knowing the bits to disable prefetching implies that this
> +	 * platform does not support Cache Pseudo-Locking.
> +	 */
> +	if (resctrl_arch_get_prefetch_disable_bits() == 0) {
> +		rdt_last_cmd_puts("Pseudo-locking not supported\n");
> +		return -EINVAL;
> +	}
> +
> +	if (rdtgroup_monitor_in_progress(rdtgrp)) {
> +		rdt_last_cmd_puts("Monitoring in progress\n");
> +		return -EINVAL;
> +	}
> +
> +	if (rdtgroup_tasks_assigned(rdtgrp)) {
> +		rdt_last_cmd_puts("Tasks assigned to resource group\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
> +		rdt_last_cmd_puts("CPUs assigned to resource group\n");
> +		return -EINVAL;
> +	}
> +
> +	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
> +		rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
> +		return -EIO;
> +	}
> +
> +	ret = pseudo_lock_init(rdtgrp);
> +	if (ret) {
> +		rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
> +		goto out_release;
> +	}
> +
> +	/*
> +	 * If this system is capable of monitoring, an RMID would have been
> +	 * allocated when the control group was created. It is no longer
> +	 * needed now that this group will be used for pseudo-locking. This
> +	 * is safe to call on platforms not capable of monitoring.
> +	 */
> +	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> +
> +	ret = 0;
> +	goto out;
> +
> +out_release:
> +	rdtgroup_locksetup_user_restore(rdtgrp);
> +out:
> +	return ret;
> +}
> +
> +/**
> + * rdtgroup_locksetup_exit - resource group exits locksetup mode
> + * @rdtgrp: resource group
> + *
> + * When a resource group exits locksetup mode the earlier restrictions are
> + * lifted.
> + *
> + * Return: 0 on success, <0 on failure
> + */
> +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
> +{
> +	int ret;
> +
> +	if (resctrl_arch_mon_capable()) {
> +		ret = alloc_rmid(rdtgrp->closid);
> +		if (ret < 0) {
> +			rdt_last_cmd_puts("Out of RMIDs\n");
> +			return ret;
> +		}
> +		rdtgrp->mon.rmid = ret;
> +	}
> +
> +	ret = rdtgroup_locksetup_user_restore(rdtgrp);
> +	if (ret) {
> +		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> +		return ret;
> +	}
> +
> +	pseudo_lock_free(rdtgrp);
> +	return 0;
> +}
> +
> +/**
> + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
> + * @d: RDT domain
> + * @cbm: CBM to test
> + *
> + * @d represents a cache instance and @cbm a capacity bitmask that is
> + * considered for it. Determine if @cbm overlaps with any existing
> + * pseudo-locked region on @d.
> + *
> + * @cbm is unsigned long, even if only 32 bits are used, to make the
> + * bitmap functions work correctly.
> + *
> + * Return: true if @cbm overlaps with pseudo-locked region on @d, false
> + * otherwise.
> + */
> +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
> +{
> +	unsigned int cbm_len;
> +	unsigned long cbm_b;
> +
> +	if (d->plr) {
> +		cbm_len = d->plr->s->res->cache.cbm_len;
> +		cbm_b = d->plr->cbm;
> +		if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/**
> + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
> + * @d: RDT domain under test
> + *
> + * The setup of a pseudo-locked region affects all cache instances within
> + * the hierarchy of the region. It is thus essential to know if any
> + * pseudo-locked regions exist within a cache hierarchy to prevent any
> + * attempts to create new pseudo-locked regions in the same hierarchy.
> + *
> + * Return: true if a pseudo-locked region exists in the hierarchy of @d or
> + *         if it is not possible to test due to memory allocation issue,
> + *         false otherwise.
> + */
> +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
> +{
> +	cpumask_var_t cpu_with_psl;
> +	enum resctrl_res_level i;
> +	struct rdt_resource *r;
> +	struct rdt_domain *d_i;
> +	bool ret = false;
> +
> +	/* Walking r->domains, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +	if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
> +		return true;
> +
> +	/*
> +	 * First determine which cpus have pseudo-locked regions
> +	 * associated with them.
> +	 */
> +	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> +		r = resctrl_arch_get_resource(i);
> +		if (!r->alloc_capable)
> +			continue;
> +
> +		list_for_each_entry(d_i, &r->domains, list) {
> +			if (d_i->plr)
> +				cpumask_or(cpu_with_psl, cpu_with_psl,
> +					   &d_i->cpu_mask);
> +		}
> +	}
> +
> +	/*
> +	 * Next test if new pseudo-locked region would intersect with
> +	 * existing region.
> +	 */
> +	if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
> +		ret = true;
> +
> +	free_cpumask_var(cpu_with_psl);
> +	return ret;
> +}
> +
> +/**
> + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
> + * @rdtgrp: Resource group to which the pseudo-locked region belongs.
> + * @sel: Selector of which measurement to perform on a pseudo-locked region.
> + *
> + * The measurement of latency to access a pseudo-locked region should be
> + * done from a cpu that is associated with that pseudo-locked region.
> + * Determine which cpu is associated with this region and start a thread on
> + * that cpu to perform the measurement, and wait for that thread to complete.
> + *
> + * Return: 0 on success, <0 on failure
> + */
> +static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
> +{
> +	struct pseudo_lock_region *plr = rdtgrp->plr;
> +	struct task_struct *thread;
> +	unsigned int cpu;
> +	int ret = -1;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	if (rdtgrp->flags & RDT_DELETED) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	if (!plr->d) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	plr->thread_done = 0;
> +	cpu = cpumask_first(&plr->d->cpu_mask);
> +	if (!cpu_online(cpu)) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	plr->cpu = cpu;
> +
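> +	/*
> +	 * sel selects the measurement: 1 = memory access latency,
> +	 * 2 = L2 cache residency, 3 = L3 cache residency.
> +	 */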
> +	if (sel == 1)
> +		thread = kthread_create_on_node(resctrl_arch_measure_cycles_lat_fn,
> +						plr, cpu_to_node(cpu),
> +						"pseudo_lock_measure/%u",
> +						cpu);
> +	else if (sel == 2)
> +		thread = kthread_create_on_node(resctrl_arch_measure_l2_residency,
> +						plr, cpu_to_node(cpu),
> +						"pseudo_lock_measure/%u",
> +						cpu);
> +	else if (sel == 3)
> +		thread = kthread_create_on_node(resctrl_arch_measure_l3_residency,
> +						plr, cpu_to_node(cpu),
> +						"pseudo_lock_measure/%u",
> +						cpu);
> +	else
> +		goto out;
> +
> +	if (IS_ERR(thread)) {
> +		ret = PTR_ERR(thread);
> +		goto out;
> +	}
> +	kthread_bind(thread, cpu);
> +	wake_up_process(thread);
> +
> +	ret = wait_event_interruptible(plr->lock_thread_wq,
> +				       plr->thread_done == 1);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = 0;
> +
> +out:
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +	return ret;
> +}
> +
> +static ssize_t pseudo_lock_measure_trigger(struct file *file,
> +					   const char __user *user_buf,
> +					   size_t count, loff_t *ppos)
> +{
> +	struct rdtgroup *rdtgrp = file->private_data;
> +	size_t buf_size;
> +	char buf[32];
> +	int ret;
> +	int sel;
> +
> +	buf_size = min(count, (sizeof(buf) - 1));
> +	if (copy_from_user(buf, user_buf, buf_size))
> +		return -EFAULT;
> +
> +	buf[buf_size] = '\0';
> +	ret = kstrtoint(buf, 10, &sel);
> +	if (ret == 0) {
> +		if (sel != 1 && sel != 2 && sel != 3)
> +			return -EINVAL;
> +		ret = debugfs_file_get(file->f_path.dentry);
> +		if (ret)
> +			return ret;
> +		ret = pseudo_lock_measure_cycles(rdtgrp, sel);
> +		if (ret == 0)
> +			ret = count;
> +		debugfs_file_put(file->f_path.dentry);
> +	}
> +
> +	return ret;
> +}
> +
> +static const struct file_operations pseudo_measure_fops = {
> +	.write = pseudo_lock_measure_trigger,
> +	.open = simple_open,
> +	.llseek = default_llseek,
> +};
> +
> +/**
> + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
> + * @rdtgrp: resource group to which pseudo-lock region belongs
> + *
> + * Called when a resource group in the pseudo-locksetup mode receives a
> + * valid schemata that should be pseudo-locked. Since the resource group is
> + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
> + * allocated and initialized with the essential information. If a failure
> + * occurs the resource group remains in the pseudo-locksetup mode with the
> + * &struct pseudo_lock_region associated with it, but cleared of all
> + * information and ready for the user to re-attempt pseudo-locking by
> + * writing the schemata again.
> + *
> + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
> + * on failure. Descriptive error will be written to last_cmd_status buffer.
> + */
> +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
> +{
> +	struct pseudo_lock_region *plr = rdtgrp->plr;
> +	struct task_struct *thread;
> +	unsigned int new_minor;
> +	struct device *dev;
> +	int ret;
> +
> +	ret = pseudo_lock_region_alloc(plr);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = pseudo_lock_cstates_constrain(plr);
> +	if (ret < 0) {
> +		ret = -EINVAL;
> +		goto out_region;
> +	}
> +
> +	plr->thread_done = 0;
> +
> +	plr->closid = rdtgrp->closid;
> +	thread = kthread_create_on_node(resctrl_arch_pseudo_lock_fn, plr,
> +					cpu_to_node(plr->cpu),
> +					"pseudo_lock/%u", plr->cpu);
> +	if (IS_ERR(thread)) {
> +		ret = PTR_ERR(thread);
> +		rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
> +		goto out_cstates;
> +	}
> +
> +	kthread_bind(thread, plr->cpu);
> +	wake_up_process(thread);
> +
> +	ret = wait_event_interruptible(plr->lock_thread_wq,
> +				       plr->thread_done == 1);
> +	if (ret < 0) {
> +		/*
> +		 * If the thread does not get on the CPU for whatever
> +		 * reason and the process which sets up the region is
> +		 * interrupted then this will leave the thread in runnable
> +		 * state and once it gets on the CPU it will dereference
> +		 * the cleared, but not freed, plr struct resulting in an
> +		 * empty pseudo-locking loop.
> +		 */
> +		rdt_last_cmd_puts("Locking thread interrupted\n");
> +		goto out_cstates;
> +	}
> +
> +	ret = pseudo_lock_minor_get(&new_minor);
> +	if (ret < 0) {
> +		rdt_last_cmd_puts("Unable to obtain a new minor number\n");
> +		goto out_cstates;
> +	}
> +
> +	/*
> +	 * Unlock access but do not release the reference. The
> +	 * pseudo-locked region will still be here on return.
> +	 *
> +	 * The mutex has to be released temporarily to avoid a potential
> +	 * deadlock with the mm->mmap_lock which is obtained in the
> +	 * device_create() and debugfs_create_dir() callpath below as well as
> +	 * before the mmap() callback is called.
> +	 */
> +	mutex_unlock(&rdtgroup_mutex);
> +
> +	if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
> +		plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
> +						      debugfs_resctrl);
> +		if (!IS_ERR_OR_NULL(plr->debugfs_dir))
> +			debugfs_create_file("pseudo_lock_measure", 0200,
> +					    plr->debugfs_dir, rdtgrp,
> +					    &pseudo_measure_fops);
> +	}
> +
> +	dev = device_create(&pseudo_lock_class, NULL,
> +			    MKDEV(pseudo_lock_major, new_minor),
> +			    rdtgrp, "%s", rdtgrp->kn->name);
> +
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	if (IS_ERR(dev)) {
> +		ret = PTR_ERR(dev);
> +		rdt_last_cmd_printf("Failed to create character device: %d\n",
> +				    ret);
> +		goto out_debugfs;
> +	}
> +
> +	/* We released the mutex - check if group was removed while we did so */
> +	if (rdtgrp->flags & RDT_DELETED) {
> +		ret = -ENODEV;
> +		goto out_device;
> +	}
> +
> +	plr->minor = new_minor;
> +
> +	rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
> +	closid_free(rdtgrp->closid);
> +	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
> +	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
> +
> +	ret = 0;
> +	goto out;
> +
> +out_device:
> +	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
> +out_debugfs:
> +	debugfs_remove_recursive(plr->debugfs_dir);
> +	pseudo_lock_minor_release(new_minor);
> +out_cstates:
> +	pseudo_lock_cstates_relax(plr);
> +out_region:
> +	pseudo_lock_region_clear(plr);
> +out:
> +	return ret;
> +}
> +
> +/**
> + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
> + * @rdtgrp: resource group to which the pseudo-locked region belongs
> + *
> + * The removal of a pseudo-locked region can be initiated when the resource
> + * group is removed via a "rmdir" from userspace or by an unmount of the
> + * resctrl filesystem. On removal the resource group does
> + * not go back to pseudo-locksetup mode before it is removed, instead it is
> + * removed directly. There is thus asymmetry with the creation where the
> + * &struct pseudo_lock_region is removed here while it was not created in
> + * rdtgroup_pseudo_lock_create().
> + *
> + * Return: void
> + */
> +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
> +{
> +	struct pseudo_lock_region *plr = rdtgrp->plr;
> +
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +		/*
> +		 * Default group cannot be a pseudo-locked region so we can
> +		 * free closid here.
> +		 */
> +		closid_free(rdtgrp->closid);
> +		goto free;
> +	}
> +
> +	pseudo_lock_cstates_relax(plr);
> +	debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
> +	device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
> +	pseudo_lock_minor_release(plr->minor);
> +
> +free:
> +	pseudo_lock_free(rdtgrp);
> +}
> +
> +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
> +{
> +	struct rdtgroup *rdtgrp;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdtgrp = region_find_by_minor(iminor(inode));
> +	if (!rdtgrp) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -ENODEV;
> +	}
> +
> +	filp->private_data = rdtgrp;
> +	atomic_inc(&rdtgrp->waitcount);
> +	/* Perform a non-seekable open - llseek is not supported */
> +	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +
> +	return 0;
> +}
> +
> +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
> +{
> +	struct rdtgroup *rdtgrp;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +	rdtgrp = filp->private_data;
> +	WARN_ON(!rdtgrp);
> +	if (!rdtgrp) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -ENODEV;
> +	}
> +	filp->private_data = NULL;
> +	atomic_dec(&rdtgrp->waitcount);
> +	mutex_unlock(&rdtgroup_mutex);
> +	return 0;
> +}
> +
> +static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
> +{
> +	/* Not supported */
> +	return -EINVAL;
> +}
> +
> +static const struct vm_operations_struct pseudo_mmap_ops = {
> +	.mremap = pseudo_lock_dev_mremap,
> +};
> +
> +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +	unsigned long vsize = vma->vm_end - vma->vm_start;
> +	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
> +	struct pseudo_lock_region *plr;
> +	struct rdtgroup *rdtgrp;
> +	unsigned long physical;
> +	unsigned long psize;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdtgrp = filp->private_data;
> +	WARN_ON(!rdtgrp);
> +	if (!rdtgrp) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -ENODEV;
> +	}
> +
> +	plr = rdtgrp->plr;
> +
> +	if (!plr->d) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -ENODEV;
> +	}
> +
> +	/*
> +	 * Task is required to run with affinity to the cpus associated
> +	 * with the pseudo-locked region. If this is not the case the task
> +	 * may be scheduled elsewhere and invalidate entries in the
> +	 * pseudo-locked region.
> +	 */
> +	if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -EINVAL;
> +	}
> +
> +	physical = __pa(plr->kmem) >> PAGE_SHIFT;
> +	psize = plr->size - off;
> +
> +	if (off > plr->size) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -ENOSPC;
> +	}
> +
> +	/*
> +	 * Ensure changes are carried directly to the memory being mapped,
> +	 * do not allow copy-on-write mapping.
> +	 */
> +	if (!(vma->vm_flags & VM_SHARED)) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -EINVAL;
> +	}
> +
> +	if (vsize > psize) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -ENOSPC;
> +	}
> +
> +	memset(plr->kmem + off, 0, vsize);
> +
> +	if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
> +			    vsize, vma->vm_page_prot)) {
> +		mutex_unlock(&rdtgroup_mutex);
> +		return -EAGAIN;
> +	}
> +	vma->vm_ops = &pseudo_mmap_ops;
> +	mutex_unlock(&rdtgroup_mutex);
> +	return 0;
> +}
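
The mmap() handler above enforces three user-visible constraints: the
caller's CPU affinity must be a subset of the pseudo-locked region's CPUs,
the mapping must be MAP_SHARED, and it must fit inside the region. A minimal
user-space sketch of that sequence, assuming a pseudo-locked group named
"newlock" whose character device appears as /dev/pseudo_lock/newlock (both
names are examples only):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <sched.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int map_pseudo_locked_example(size_t size, int cpu)
  {
  	cpu_set_t cpus;
  	void *mem;
  	int fd;

  	/* Affinity must be a subset of the region's cache domain CPUs. */
  	CPU_ZERO(&cpus);
  	CPU_SET(cpu, &cpus);
  	if (sched_setaffinity(0, sizeof(cpus), &cpus))
  		return -1;

  	fd = open("/dev/pseudo_lock/newlock", O_RDWR);
  	if (fd < 0)
  		return -1;

  	/* MAP_SHARED is mandatory; private (COW) mappings are rejected. */
  	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  	close(fd);
  	return mem == MAP_FAILED ? -1 : 0;
  }
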
> +
> +static const struct file_operations pseudo_lock_dev_fops = {
> +	.owner =	THIS_MODULE,
> +	.llseek =	no_llseek,
> +	.read =		NULL,
> +	.write =	NULL,
> +	.open =		pseudo_lock_dev_open,
> +	.release =	pseudo_lock_dev_release,
> +	.mmap =		pseudo_lock_dev_mmap,
> +};
> +
> +int rdt_pseudo_lock_init(void)
> +{
> +	int ret;
> +
> +	ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
> +	if (ret < 0)
> +		return ret;
> +
> +	pseudo_lock_major = ret;
> +
> +	ret = class_register(&pseudo_lock_class);
> +	if (ret) {
> +		unregister_chrdev(pseudo_lock_major, "pseudo_lock");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +void rdt_pseudo_lock_release(void)
> +{
> +	class_unregister(&pseudo_lock_class);
> +	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
> +	pseudo_lock_major = 0;
> +}
> diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
> index e69de29bb2d1..936fc6e47386 100644
> --- a/fs/resctrl/rdtgroup.c
> +++ b/fs/resctrl/rdtgroup.c
> @@ -0,0 +1,4013 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * User interface for Resource Allocation in Resource Director Technology(RDT)
> + *
> + * Copyright (C) 2016 Intel Corporation
> + *
> + * Author: Fenghua Yu <fenghua.yu@...el.com>
> + *
> + * More information about RDT can be found in the Intel (R) x86 Architecture
> + * Software Developer Manual.
> + */
> +
> +#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
> +
> +#include <linux/cacheinfo.h>
> +#include <linux/cpu.h>
> +#include <linux/debugfs.h>
> +#include <linux/fs.h>
> +#include <linux/fs_parser.h>
> +#include <linux/sysfs.h>
> +#include <linux/kernfs.h>
> +#include <linux/seq_buf.h>
> +#include <linux/seq_file.h>
> +#include <linux/sched/signal.h>
> +#include <linux/sched/task.h>
> +#include <linux/slab.h>
> +#include <linux/task_work.h>
> +#include <linux/user_namespace.h>
> +
> +#include <uapi/linux/magic.h>
> +
> +#include <asm/resctrl.h>
> +#include "internal.h"
> +
> +/* Mutex to protect rdtgroup access. */
> +DEFINE_MUTEX(rdtgroup_mutex);
> +
> +static struct kernfs_root *rdt_root;
> +struct rdtgroup rdtgroup_default;
> +LIST_HEAD(rdt_all_groups);
> +
> +/* list of entries for the schemata file */
> +LIST_HEAD(resctrl_schema_all);
> +
> +/* The filesystem can only be mounted once. */
> +bool resctrl_mounted;
> +
> +/* Kernel fs node for "info" directory under root */
> +static struct kernfs_node *kn_info;
> +
> +/* Kernel fs node for "mon_groups" directory under root */
> +static struct kernfs_node *kn_mongrp;
> +
> +/* Kernel fs node for "mon_data" directory under root */
> +static struct kernfs_node *kn_mondata;
> +
> +/*
> + * Used to store the max resource name width and max resource data width
> + * to display the schemata in a tabular format
> + */
> +int max_name_width, max_data_width;
> +
> +static struct seq_buf last_cmd_status;
> +static char last_cmd_status_buf[512];
> +
> +static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
> +static void rdtgroup_destroy_root(void);
> +
> +struct dentry *debugfs_resctrl;
> +
> +static bool resctrl_debug;
> +
> +void rdt_last_cmd_clear(void)
> +{
> +	lockdep_assert_held(&rdtgroup_mutex);
> +	seq_buf_clear(&last_cmd_status);
> +}
> +
> +void rdt_last_cmd_puts(const char *s)
> +{
> +	lockdep_assert_held(&rdtgroup_mutex);
> +	seq_buf_puts(&last_cmd_status, s);
> +}
> +
> +void rdt_last_cmd_printf(const char *fmt, ...)
> +{
> +	va_list ap;
> +
> +	va_start(ap, fmt);
> +	lockdep_assert_held(&rdtgroup_mutex);
> +	seq_buf_vprintf(&last_cmd_status, fmt, ap);
> +	va_end(ap);
> +}
> +
> +void rdt_staged_configs_clear(void)
> +{
> +	enum resctrl_res_level i;
> +	struct rdt_resource *r;
> +	struct rdt_domain *dom;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> +		r = resctrl_arch_get_resource(i);
> +		if (!r->alloc_capable)
> +			continue;
> +
> +		list_for_each_entry(dom, &r->domains, list)
> +			memset(dom->staged_config, 0, sizeof(dom->staged_config));
> +	}
> +}
> +
> +static bool resctrl_is_mbm_enabled(void)
> +{
> +	return (resctrl_arch_is_mbm_total_enabled() ||
> +		resctrl_arch_is_mbm_local_enabled());
> +}
> +
> +static bool resctrl_is_mbm_event(int e)
> +{
> +	return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
> +		e <= QOS_L3_MBM_LOCAL_EVENT_ID);
> +}
> +
> +/*
> + * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
> + * we can keep a bitmap of free CLOSIDs in a single integer.
> + *
> + * Using a global CLOSID across all resources has some advantages and
> + * some drawbacks:
> + * + We can simply set current's closid to assign a task to a resource
> + *   group.
> + * + Context switch code can avoid extra memory references deciding which
> + *   CLOSID to load into the PQR_ASSOC MSR
> + * - We give up some options in configuring resource groups across multi-socket
> + *   systems.
> + * - Our choices on how to configure each resource become progressively more
> + *   limited as the number of resources grows.
> + */
> +static unsigned long closid_free_map;
> +static int closid_free_map_len;
> +
> +int closids_supported(void)
> +{
> +	return closid_free_map_len;
> +}
> +
> +static void closid_init(void)
> +{
> +	struct resctrl_schema *s;
> +	u32 rdt_min_closid = 32;
> +
> +	/* Compute rdt_min_closid across all resources */
> +	list_for_each_entry(s, &resctrl_schema_all, list)
> +		rdt_min_closid = min(rdt_min_closid, s->num_closid);
> +
> +	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
> +
> +	/* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
> +	__clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
> +	closid_free_map_len = rdt_min_closid;
> +}
> +
> +static int closid_alloc(void)
> +{
> +	int cleanest_closid;
> +	u32 closid;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
> +	    resctrl_arch_is_llc_occupancy_enabled()) {
> +		cleanest_closid = resctrl_find_cleanest_closid();
> +		if (cleanest_closid < 0)
> +			return cleanest_closid;
> +		closid = cleanest_closid;
> +	} else {
> +		closid = ffs(closid_free_map);
> +		if (closid == 0)
> +			return -ENOSPC;
> +		closid--;
> +	}
> +	__clear_bit(closid, &closid_free_map);
> +
> +	return closid;
> +}
> +
> +void closid_free(int closid)
> +{
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	__set_bit(closid, &closid_free_map);
> +}
> +
> +/**
> + * closid_allocated - test if provided closid is in use
> + * @closid: closid to be tested
> + *
> + * Return: true if @closid is currently associated with a resource group,
> + * false if @closid is free
> + */
> +bool closid_allocated(unsigned int closid)
> +{
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	return !test_bit(closid, &closid_free_map);
> +}
> +
> +/**
> + * rdtgroup_mode_by_closid - Return mode of resource group with closid
> + * @closid: closid of the resource group
> + *
> + * Each resource group is associated with a @closid. Here the mode
> + * of a resource group can be queried by searching for it using its closid.
> + *
> + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
> + */
> +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
> +{
> +	struct rdtgroup *rdtgrp;
> +
> +	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> +		if (rdtgrp->closid == closid)
> +			return rdtgrp->mode;
> +	}
> +
> +	return RDT_NUM_MODES;
> +}
> +
> +static const char * const rdt_mode_str[] = {
> +	[RDT_MODE_SHAREABLE]		= "shareable",
> +	[RDT_MODE_EXCLUSIVE]		= "exclusive",
> +	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
> +	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
> +};
> +
> +/**
> + * rdtgroup_mode_str - Return the string representation of mode
> + * @mode: the resource group mode as &enum rdtgrp_mode
> + *
> + * Return: string representation of valid mode, "unknown" otherwise
> + */
> +static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
> +{
> +	if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
> +		return "unknown";
> +
> +	return rdt_mode_str[mode];
> +}
> +
> +/* set uid and gid of rdtgroup dirs and files to that of the creator */
> +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
> +{
> +	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
> +				.ia_uid = current_fsuid(),
> +				.ia_gid = current_fsgid(), };
> +
> +	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
> +	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
> +		return 0;
> +
> +	return kernfs_setattr(kn, &iattr);
> +}
> +
> +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
> +{
> +	struct kernfs_node *kn;
> +	int ret;
> +
> +	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
> +				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
> +				  0, rft->kf_ops, rft, NULL, NULL);
> +	if (IS_ERR(kn))
> +		return PTR_ERR(kn);
> +
> +	ret = rdtgroup_kn_set_ugid(kn);
> +	if (ret) {
> +		kernfs_remove(kn);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
> +{
> +	struct kernfs_open_file *of = m->private;
> +	struct rftype *rft = of->kn->priv;
> +
> +	if (rft->seq_show)
> +		return rft->seq_show(of, m, arg);
> +	return 0;
> +}
> +
> +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
> +				   size_t nbytes, loff_t off)
> +{
> +	struct rftype *rft = of->kn->priv;
> +
> +	if (rft->write)
> +		return rft->write(of, buf, nbytes, off);
> +
> +	return -EINVAL;
> +}
> +
> +static const struct kernfs_ops rdtgroup_kf_single_ops = {
> +	.atomic_write_len	= PAGE_SIZE,
> +	.write			= rdtgroup_file_write,
> +	.seq_show		= rdtgroup_seqfile_show,
> +};
> +
> +static const struct kernfs_ops kf_mondata_ops = {
> +	.atomic_write_len	= PAGE_SIZE,
> +	.seq_show		= rdtgroup_mondata_show,
> +};
> +
> +static bool is_cpu_list(struct kernfs_open_file *of)
> +{
> +	struct rftype *rft = of->kn->priv;
> +
> +	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
> +}
> +
> +static int rdtgroup_cpus_show(struct kernfs_open_file *of,
> +			      struct seq_file *s, void *v)
> +{
> +	struct rdtgroup *rdtgrp;
> +	struct cpumask *mask;
> +	int ret = 0;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +
> +	if (rdtgrp) {
> +		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> +			if (!rdtgrp->plr->d) {
> +				rdt_last_cmd_clear();
> +				rdt_last_cmd_puts("Cache domain offline\n");
> +				ret = -ENODEV;
> +			} else {
> +				mask = &rdtgrp->plr->d->cpu_mask;
> +				seq_printf(s, is_cpu_list(of) ?
> +					   "%*pbl\n" : "%*pb\n",
> +					   cpumask_pr_args(mask));
> +			}
> +		} else {
> +			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
> +				   cpumask_pr_args(&rdtgrp->cpu_mask));
> +		}
> +	} else {
> +		ret = -ENOENT;
> +	}
> +	rdtgroup_kn_unlock(of->kn);
> +
> +	return ret;
> +}
> +
> +/*
> + * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
> + *
> + * Per task closids/rmids must have been set up before calling this function.
> + * @r may be NULL.
> + */
> +static void
> +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
> +{
> +	struct resctrl_cpu_sync defaults;
> +	struct resctrl_cpu_sync *defaults_p = NULL;
> +
> +	if (r) {
> +		defaults.closid = r->closid;
> +		defaults.rmid = r->mon.rmid;
> +		defaults_p = &defaults;
> +	}
> +
> +	on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_defaults, defaults_p,
> +			 1);
> +}
> +
> +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
> +			  cpumask_var_t tmpmask)
> +{
> +	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
> +	struct list_head *head;
> +
> +	/* Check whether cpus belong to parent ctrl group */
> +	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
> +	if (!cpumask_empty(tmpmask)) {
> +		rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
> +		return -EINVAL;
> +	}
> +
> +	/* Check whether cpus are dropped from this group */
> +	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
> +	if (!cpumask_empty(tmpmask)) {
> +		/* Give any dropped cpus to parent rdtgroup */
> +		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
> +		update_closid_rmid(tmpmask, prgrp);
> +	}
> +
> +	/*
> +	 * If we added cpus, remove them from previous group that owned them
> +	 * and update per-cpu rmid
> +	 */
> +	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
> +	if (!cpumask_empty(tmpmask)) {
> +		head = &prgrp->mon.crdtgrp_list;
> +		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
> +			if (crgrp == rdtgrp)
> +				continue;
> +			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
> +				       tmpmask);
> +		}
> +		update_closid_rmid(tmpmask, rdtgrp);
> +	}
> +
> +	/* Done pushing/pulling - update this group with new mask */
> +	cpumask_copy(&rdtgrp->cpu_mask, newmask);
> +
> +	return 0;
> +}
> +
> +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
> +{
> +	struct rdtgroup *crgrp;
> +
> +	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
> +	/* update the child mon group masks as well */
> +	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
> +		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
> +}
> +
> +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
> +			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
> +{
> +	struct rdtgroup *r, *crgrp;
> +	struct list_head *head;
> +
> +	/* Check whether cpus are dropped from this group */
> +	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
> +	if (!cpumask_empty(tmpmask)) {
> +		/* Can't drop from default group */
> +		if (rdtgrp == &rdtgroup_default) {
> +			rdt_last_cmd_puts("Can't drop CPUs from default group\n");
> +			return -EINVAL;
> +		}
> +
> +		/* Give any dropped cpus to rdtgroup_default */
> +		cpumask_or(&rdtgroup_default.cpu_mask,
> +			   &rdtgroup_default.cpu_mask, tmpmask);
> +		update_closid_rmid(tmpmask, &rdtgroup_default);
> +	}
> +
> +	/*
> +	 * If we added cpus, remove them from previous group and
> +	 * the prev group's child groups that owned them
> +	 * and update per-cpu closid/rmid.
> +	 */
> +	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
> +	if (!cpumask_empty(tmpmask)) {
> +		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
> +			if (r == rdtgrp)
> +				continue;
> +			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
> +			if (!cpumask_empty(tmpmask1))
> +				cpumask_rdtgrp_clear(r, tmpmask1);
> +		}
> +		update_closid_rmid(tmpmask, rdtgrp);
> +	}
> +
> +	/* Done pushing/pulling - update this group with new mask */
> +	cpumask_copy(&rdtgrp->cpu_mask, newmask);
> +
> +	/*
> +	 * Clear child mon group masks since there is a new parent mask
> +	 * now and update the rmid for the cpus the child lost.
> +	 */
> +	head = &rdtgrp->mon.crdtgrp_list;
> +	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
> +		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
> +		update_closid_rmid(tmpmask, rdtgrp);
> +		cpumask_clear(&crgrp->cpu_mask);
> +	}
> +
> +	return 0;
> +}
> +
> +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
> +				   char *buf, size_t nbytes, loff_t off)
> +{
> +	cpumask_var_t tmpmask, newmask, tmpmask1;
> +	struct rdtgroup *rdtgrp;
> +	int ret;
> +
> +	if (!buf)
> +		return -EINVAL;
> +
> +	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +		return -ENOMEM;
> +	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
> +		free_cpumask_var(tmpmask);
> +		return -ENOMEM;
> +	}
> +	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
> +		free_cpumask_var(tmpmask);
> +		free_cpumask_var(newmask);
> +		return -ENOMEM;
> +	}
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (!rdtgrp) {
> +		ret = -ENOENT;
> +		goto unlock;
> +	}
> +
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
> +	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +		ret = -EINVAL;
> +		rdt_last_cmd_puts("Pseudo-locking in progress\n");
> +		goto unlock;
> +	}
> +
> +	if (is_cpu_list(of))
> +		ret = cpulist_parse(buf, newmask);
> +	else
> +		ret = cpumask_parse(buf, newmask);
> +
> +	if (ret) {
> +		rdt_last_cmd_puts("Bad CPU list/mask\n");
> +		goto unlock;
> +	}
> +
> +	/* check that user didn't specify any offline cpus */
> +	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
> +	if (!cpumask_empty(tmpmask)) {
> +		ret = -EINVAL;
> +		rdt_last_cmd_puts("Can only assign online CPUs\n");
> +		goto unlock;
> +	}
> +
> +	if (rdtgrp->type == RDTCTRL_GROUP)
> +		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
> +	else if (rdtgrp->type == RDTMON_GROUP)
> +		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
> +	else
> +		ret = -EINVAL;
> +
> +unlock:
> +	rdtgroup_kn_unlock(of->kn);
> +	free_cpumask_var(tmpmask);
> +	free_cpumask_var(newmask);
> +	free_cpumask_var(tmpmask1);
> +
> +	return ret ?: nbytes;
> +}
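
The write handler above accepts either a hex mask ("cpus") or a CPU-list
string ("cpus_list"), and rejects offline CPUs. A small sketch assigning
CPUs 2-3 to a hypothetical group "p1" through its cpus_list file (the group
name and the /sys/fs/resctrl mount point are assumptions):

  #include <stdio.h>

  int assign_cpus_example(void)
  {
  	/* The equivalent write to the "cpus" file would be the hex mask "c". */
  	FILE *f = fopen("/sys/fs/resctrl/p1/cpus_list", "w");

  	if (!f)
  		return -1;
  	fprintf(f, "2-3\n");
  	return fclose(f);
  }
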
> +
> +/**
> + * rdtgroup_remove - the helper to remove resource group safely
> + * @rdtgrp: resource group to remove
> + *
> + * On resource group creation via a mkdir, an extra kernfs_node reference is
> + * taken to ensure that the rdtgroup structure remains accessible for the
> + * rdtgroup_kn_unlock() calls where it is removed.
> + *
> + * Drop the extra reference here, then free the rdtgroup structure.
> + *
> + * Return: void
> + */
> +static void rdtgroup_remove(struct rdtgroup *rdtgrp)
> +{
> +	kernfs_put(rdtgrp->kn);
> +	kfree(rdtgrp);
> +}
> +
> +static void _update_task_closid_rmid(void *task)
> +{
> +	/*
> +	 * If the task is still current on this CPU, update PQR_ASSOC MSR.
> +	 * Otherwise, the MSR is updated when the task is scheduled in.
> +	 */
> +	if (task == current)
> +		resctrl_arch_sched_in(task);
> +}
> +
> +static void update_task_closid_rmid(struct task_struct *t)
> +{
> +	if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
> +		smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
> +	else
> +		_update_task_closid_rmid(t);
> +}
> +
> +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
> +{
> +	u32 closid, rmid = rdtgrp->mon.rmid;
> +
> +	if (rdtgrp->type == RDTCTRL_GROUP)
> +		closid = rdtgrp->closid;
> +	else if (rdtgrp->type == RDTMON_GROUP)
> +		closid = rdtgrp->mon.parent->closid;
> +	else
> +		return false;
> +
> +	return resctrl_arch_match_closid(tsk, closid) &&
> +	       resctrl_arch_match_rmid(tsk, closid, rmid);
> +}
> +
> +static int __rdtgroup_move_task(struct task_struct *tsk,
> +				struct rdtgroup *rdtgrp)
> +{
> +	/* If the task is already in rdtgrp, no need to move the task. */
> +	if (task_in_rdtgroup(tsk, rdtgrp))
> +		return 0;
> +
> +	/*
> +	 * Set the task's closid/rmid before the PQR_ASSOC MSR is
> +	 * updated with them.
> +	 *
> +	 * For ctrl_mon groups, move both closid and rmid.
> +	 * For monitor groups, tasks can only be moved from
> +	 * their parent CTRL group.
> +	 */
> +	if (rdtgrp->type == RDTMON_GROUP &&
> +	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
> +		rdt_last_cmd_puts("Can't move task to different control group\n");
> +		return -EINVAL;
> +	}
> +
> +	if (rdtgrp->type == RDTMON_GROUP)
> +		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
> +					     rdtgrp->mon.rmid);
> +	else
> +		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
> +					     rdtgrp->mon.rmid);
> +
> +	/*
> +	 * Ensure the task's closid and rmid are written before determining if
> +	 * the task is current, which decides whether it will be interrupted.
> +	 * This pairs with the full barrier between the rq->curr update and
> +	 * resctrl_arch_sched_in() during context switch.
> +	 */
> +	smp_mb();
> +
> +	/*
> +	 * By now, the task's closid and rmid are set. If the task is current
> +	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
> +	 * group go into effect. If the task is not current, the MSR will be
> +	 * updated when the task is scheduled in.
> +	 */
> +	update_task_closid_rmid(tsk);
> +
> +	return 0;
> +}
> +
> +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
> +{
> +	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
> +		resctrl_arch_match_closid(t, r->closid));
> +}
> +
> +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
> +{
> +	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
> +		resctrl_arch_match_rmid(t, r->mon.parent->closid,
> +					r->mon.rmid));
> +}
> +
> +/**
> + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
> + * @r: Resource group
> + *
> + * Return: 1 if tasks have been assigned to @r, 0 otherwise
> + */
> +int rdtgroup_tasks_assigned(struct rdtgroup *r)
> +{
> +	struct task_struct *p, *t;
> +	int ret = 0;
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	rcu_read_lock();
> +	for_each_process_thread(p, t) {
> +		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +
> +static int rdtgroup_task_write_permission(struct task_struct *task,
> +					  struct kernfs_open_file *of)
> +{
> +	const struct cred *tcred = get_task_cred(task);
> +	const struct cred *cred = current_cred();
> +	int ret = 0;
> +
> +	/*
> +	 * Even if we're attaching all tasks in the thread group, we only
> +	 * need to check permissions on one of them.
> +	 */
> +	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
> +	    !uid_eq(cred->euid, tcred->uid) &&
> +	    !uid_eq(cred->euid, tcred->suid)) {
> +		rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
> +		ret = -EPERM;
> +	}
> +
> +	put_cred(tcred);
> +	return ret;
> +}
> +
> +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
> +			      struct kernfs_open_file *of)
> +{
> +	struct task_struct *tsk;
> +	int ret;
> +
> +	rcu_read_lock();
> +	if (pid) {
> +		tsk = find_task_by_vpid(pid);
> +		if (!tsk) {
> +			rcu_read_unlock();
> +			rdt_last_cmd_printf("No task %d\n", pid);
> +			return -ESRCH;
> +		}
> +	} else {
> +		tsk = current;
> +	}
> +
> +	get_task_struct(tsk);
> +	rcu_read_unlock();
> +
> +	ret = rdtgroup_task_write_permission(tsk, of);
> +	if (!ret)
> +		ret = __rdtgroup_move_task(tsk, rdtgrp);
> +
> +	put_task_struct(tsk);
> +	return ret;
> +}
> +
> +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
> +				    char *buf, size_t nbytes, loff_t off)
> +{
> +	struct rdtgroup *rdtgrp;
> +	char *pid_str;
> +	int ret = 0;
> +	pid_t pid;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (!rdtgrp) {
> +		rdtgroup_kn_unlock(of->kn);
> +		return -ENOENT;
> +	}
> +	rdt_last_cmd_clear();
> +
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
> +	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +		ret = -EINVAL;
> +		rdt_last_cmd_puts("Pseudo-locking in progress\n");
> +		goto unlock;
> +	}
> +
> +	while (buf && buf[0] != '\0' && buf[0] != '\n') {
> +		pid_str = strim(strsep(&buf, ","));
> +
> +		if (kstrtoint(pid_str, 0, &pid)) {
> +			rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		if (pid < 0) {
> +			rdt_last_cmd_printf("Invalid pid %d\n", pid);
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		ret = rdtgroup_move_task(pid, rdtgrp, of);
> +		if (ret) {
> +			rdt_last_cmd_printf("Error while processing task %d\n", pid);
> +			break;
> +		}
> +	}
> +
> +unlock:
> +	rdtgroup_kn_unlock(of->kn);
> +
> +	return ret ?: nbytes;
> +}
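
The parser above takes a comma-separated list of PIDs, and
rdtgroup_move_task() treats a PID of 0 as the writing task itself. A sketch
moving the caller into a hypothetical group "p1":

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int join_group_example(void)
  {
  	int fd = open("/sys/fs/resctrl/p1/tasks", O_WRONLY);

  	if (fd < 0)
  		return -1;
  	dprintf(fd, "0\n");	/* 0 means "current task" */
  	return close(fd);
  }
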
> +
> +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
> +{
> +	struct task_struct *p, *t;
> +	pid_t pid;
> +
> +	rcu_read_lock();
> +	for_each_process_thread(p, t) {
> +		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
> +			pid = task_pid_vnr(t);
> +			if (pid)
> +				seq_printf(s, "%d\n", pid);
> +		}
> +	}
> +	rcu_read_unlock();
> +}
> +
> +static int rdtgroup_tasks_show(struct kernfs_open_file *of,
> +			       struct seq_file *s, void *v)
> +{
> +	struct rdtgroup *rdtgrp;
> +	int ret = 0;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (rdtgrp)
> +		show_rdt_tasks(rdtgrp, s);
> +	else
> +		ret = -ENOENT;
> +	rdtgroup_kn_unlock(of->kn);
> +
> +	return ret;
> +}
> +
> +static int rdtgroup_closid_show(struct kernfs_open_file *of,
> +				struct seq_file *s, void *v)
> +{
> +	struct rdtgroup *rdtgrp;
> +	int ret = 0;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (rdtgrp)
> +		seq_printf(s, "%u\n", rdtgrp->closid);
> +	else
> +		ret = -ENOENT;
> +	rdtgroup_kn_unlock(of->kn);
> +
> +	return ret;
> +}
> +
> +static int rdtgroup_rmid_show(struct kernfs_open_file *of,
> +			      struct seq_file *s, void *v)
> +{
> +	struct rdtgroup *rdtgrp;
> +	int ret = 0;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (rdtgrp)
> +		seq_printf(s, "%u\n", rdtgrp->mon.rmid);
> +	else
> +		ret = -ENOENT;
> +	rdtgroup_kn_unlock(of->kn);
> +
> +	return ret;
> +}
> +
> +#ifdef CONFIG_PROC_CPU_RESCTRL
> +
> +/*
> + * A task can only be part of one resctrl control group and of one monitor
> + * group that is associated with that control group.
> + *
> + * 1)   res:
> + *      mon:
> + *
> + *    resctrl is not available.
> + *
> + * 2)   res:/
> + *      mon:
> + *
> + *    Task is part of the root resctrl control group, and it is not associated
> + *    to any monitor group.
> + *
> + * 3)  res:/
> + *     mon:mon0
> + *
> + *    Task is part of the root resctrl control group and monitor group mon0.
> + *
> + * 4)  res:group0
> + *     mon:
> + *
> + *    Task is part of resctrl control group group0, and it is not associated
> + *    to any monitor group.
> + *
> + * 5) res:group0
> + *    mon:mon1
> + *
> + *    Task is part of resctrl control group group0 and monitor group mon1.
> + */
> +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
> +		      struct pid *pid, struct task_struct *tsk)
> +{
> +	struct rdtgroup *rdtg;
> +	int ret = 0;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	/* Return empty if resctrl has not been mounted. */
> +	if (!resctrl_mounted) {
> +		seq_puts(s, "res:\nmon:\n");
> +		goto unlock;
> +	}
> +
> +	list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
> +		struct rdtgroup *crg;
> +
> +		/*
> +		 * Task information is only relevant for shareable
> +		 * and exclusive groups.
> +		 */
> +		if (rdtg->mode != RDT_MODE_SHAREABLE &&
> +		    rdtg->mode != RDT_MODE_EXCLUSIVE)
> +			continue;
> +
> +		if (!resctrl_arch_match_closid(tsk, rdtg->closid))
> +			continue;
> +
> +		seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
> +			   rdtg->kn->name);
> +		seq_puts(s, "mon:");
> +		list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
> +				    mon.crdtgrp_list) {
> +			if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
> +						     crg->mon.rmid))
> +				continue;
> +			seq_printf(s, "%s", crg->kn->name);
> +			break;
> +		}
> +		seq_putc(s, '\n');
> +		goto unlock;
> +	}
> +	/*
> +	 * The above search should succeed. Otherwise return
> +	 * with an error.
> +	 */
> +	ret = -ENOENT;
> +unlock:
> +	mutex_unlock(&rdtgroup_mutex);
> +
> +	return ret;
> +}
> +#endif
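
The record documented above is exposed as /proc/<pid>/cpu_resctrl_groups
when CONFIG_PROC_CPU_RESCTRL is set. A small sketch that just echoes the two
"res:"/"mon:" lines for a given PID (output such as "res:group0" /
"mon:mon1" in case 5 above):

  #include <stdio.h>

  int print_resctrl_groups(int pid)
  {
  	char path[64], line[256];
  	FILE *f;

  	snprintf(path, sizeof(path), "/proc/%d/cpu_resctrl_groups", pid);
  	f = fopen(path, "r");
  	if (!f)
  		return -1;
  	while (fgets(line, sizeof(line), f))
  		fputs(line, stdout);
  	return fclose(f);
  }
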
> +
> +static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
> +				    struct seq_file *seq, void *v)
> +{
> +	int len;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +	len = seq_buf_used(&last_cmd_status);
> +	if (len)
> +		seq_printf(seq, "%.*s", len, last_cmd_status_buf);
> +	else
> +		seq_puts(seq, "ok\n");
> +	mutex_unlock(&rdtgroup_mutex);
> +	return 0;
> +}
> +
> +static int rdt_num_closids_show(struct kernfs_open_file *of,
> +				struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +
> +	seq_printf(seq, "%u\n", s->num_closid);
> +	return 0;
> +}
> +
> +static int rdt_default_ctrl_show(struct kernfs_open_file *of,
> +			     struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	seq_printf(seq, "%x\n", r->default_ctrl);
> +	return 0;
> +}
> +
> +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
> +			     struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
> +	return 0;
> +}
> +
> +static int rdt_shareable_bits_show(struct kernfs_open_file *of,
> +				   struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	seq_printf(seq, "%x\n", r->cache.shareable_bits);
> +	return 0;
> +}
> +
> +/*
> + * rdt_bit_usage_show - Display current usage of resources
> + *
> + * A domain is a shared resource that can now be allocated differently. Here
> + * we display the current regions of the domain as an annotated bitmask.
> + * For each domain of this resource its allocation bitmask
> + * is annotated as below to indicate the current usage of the corresponding bit:
> + *   0 - currently unused
> + *   X - currently available for sharing and used by software and hardware
> + *   H - currently used by hardware only but available for software use
> + *   S - currently used and shareable by software only
> + *   E - currently used exclusively by one resource group
> + *   P - currently pseudo-locked by one resource group
> + */
> +static int rdt_bit_usage_show(struct kernfs_open_file *of,
> +			      struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	/*
> +	 * Use unsigned long even though only 32 bits are used to ensure
> +	 * test_bit() is used safely.
> +	 */
> +	unsigned long sw_shareable = 0, hw_shareable = 0;
> +	unsigned long exclusive = 0, pseudo_locked = 0;
> +	struct rdt_resource *r = s->res;
> +	struct rdt_domain *dom;
> +	int i, hwb, swb, excl, psl;
> +	enum rdtgrp_mode mode;
> +	bool sep = false;
> +	u32 ctrl_val;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +	hw_shareable = r->cache.shareable_bits;
> +	list_for_each_entry(dom, &r->domains, list) {
> +		if (sep)
> +			seq_putc(seq, ';');
> +		sw_shareable = 0;
> +		exclusive = 0;
> +		seq_printf(seq, "%d=", dom->id);
> +		for (i = 0; i < closids_supported(); i++) {
> +			if (!closid_allocated(i))
> +				continue;
> +			ctrl_val = resctrl_arch_get_config(r, dom, i,
> +							   s->conf_type);
> +			mode = rdtgroup_mode_by_closid(i);
> +			switch (mode) {
> +			case RDT_MODE_SHAREABLE:
> +				sw_shareable |= ctrl_val;
> +				break;
> +			case RDT_MODE_EXCLUSIVE:
> +				exclusive |= ctrl_val;
> +				break;
> +			case RDT_MODE_PSEUDO_LOCKSETUP:
> +			/*
> +			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
> +			 * here but not included since the CBM
> +			 * associated with this CLOSID in this mode
> +			 * is not initialized and no task or cpu can be
> +			 * assigned this CLOSID.
> +			 */
> +				break;
> +			case RDT_MODE_PSEUDO_LOCKED:
> +			case RDT_NUM_MODES:
> +				WARN(1,
> +				     "invalid mode for closid %d\n", i);
> +				break;
> +			}
> +		}
> +		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
> +			pseudo_locked = dom->plr ? dom->plr->cbm : 0;
> +			hwb = test_bit(i, &hw_shareable);
> +			swb = test_bit(i, &sw_shareable);
> +			excl = test_bit(i, &exclusive);
> +			psl = test_bit(i, &pseudo_locked);
> +			if (hwb && swb)
> +				seq_putc(seq, 'X');
> +			else if (hwb && !swb)
> +				seq_putc(seq, 'H');
> +			else if (!hwb && swb)
> +				seq_putc(seq, 'S');
> +			else if (excl)
> +				seq_putc(seq, 'E');
> +			else if (psl)
> +				seq_putc(seq, 'P');
> +			else /* Unused bits remain */
> +				seq_putc(seq, '0');
> +		}
> +		sep = true;
> +	}
> +	seq_putc(seq, '\n');
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +	return 0;
> +}
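
A worked example of the annotation, with made-up numbers: for an 11-bit CBM
where the hardware-shareable bits are the top two, the default (shareable)
group's CBM covers bits 4-10 and one exclusive group owns bits 0-3, the file
would print (most significant bit first):

  0=XXSSSSSEEEE

i.e. 'X' where hardware and software share the bits, 'S' for the
software-only shareable span and 'E' for the exclusively allocated span.
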
> +
> +static int rdt_min_bw_show(struct kernfs_open_file *of,
> +			     struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	seq_printf(seq, "%u\n", r->membw.min_bw);
> +	return 0;
> +}
> +
> +static int rdt_num_rmids_show(struct kernfs_open_file *of,
> +			      struct seq_file *seq, void *v)
> +{
> +	struct rdt_resource *r = of->kn->parent->priv;
> +
> +	seq_printf(seq, "%d\n", r->num_rmid);
> +
> +	return 0;
> +}
> +
> +static int rdt_mon_features_show(struct kernfs_open_file *of,
> +				 struct seq_file *seq, void *v)
> +{
> +	struct rdt_resource *r = of->kn->parent->priv;
> +	struct mon_evt *mevt;
> +
> +	list_for_each_entry(mevt, &r->evt_list, list) {
> +		seq_printf(seq, "%s\n", mevt->name);
> +		if (mevt->configurable)
> +			seq_printf(seq, "%s_config\n", mevt->name);
> +	}
> +
> +	return 0;
> +}
> +
> +static int rdt_bw_gran_show(struct kernfs_open_file *of,
> +			     struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	seq_printf(seq, "%u\n", r->membw.bw_gran);
> +	return 0;
> +}
> +
> +static int rdt_delay_linear_show(struct kernfs_open_file *of,
> +			     struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	seq_printf(seq, "%u\n", r->membw.delay_linear);
> +	return 0;
> +}
> +
> +static int max_threshold_occ_show(struct kernfs_open_file *of,
> +				  struct seq_file *seq, void *v)
> +{
> +	seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
> +
> +	return 0;
> +}
> +
> +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
> +					 struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
> +		seq_puts(seq, "per-thread\n");
> +	else
> +		seq_puts(seq, "max\n");
> +
> +	return 0;
> +}
> +
> +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
> +				       char *buf, size_t nbytes, loff_t off)
> +{
> +	unsigned int bytes;
> +	int ret;
> +
> +	ret = kstrtouint(buf, 0, &bytes);
> +	if (ret)
> +		return ret;
> +
> +	if (bytes > resctrl_rmid_realloc_limit)
> +		return -EINVAL;
> +
> +	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
> +
> +	return nbytes;
> +}
> +
> +/*
> + * rdtgroup_mode_show - Display mode of this resource group
> + */
> +static int rdtgroup_mode_show(struct kernfs_open_file *of,
> +			      struct seq_file *s, void *v)
> +{
> +	struct rdtgroup *rdtgrp;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (!rdtgrp) {
> +		rdtgroup_kn_unlock(of->kn);
> +		return -ENOENT;
> +	}
> +
> +	seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
> +
> +	rdtgroup_kn_unlock(of->kn);
> +	return 0;
> +}
> +
> +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
> +{
> +	switch (my_type) {
> +	case CDP_CODE:
> +		return CDP_DATA;
> +	case CDP_DATA:
> +		return CDP_CODE;
> +	default:
> +	case CDP_NONE:
> +		return CDP_NONE;
> +	}
> +}
> +
> +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
> +					struct seq_file *seq, void *v)
> +{
> +	struct resctrl_schema *s = of->kn->parent->priv;
> +	struct rdt_resource *r = s->res;
> +
> +	seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
> +
> +	return 0;
> +}
> +
> +/**
> + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with others
> + * @r: Resource to which domain instance @d belongs.
> + * @d: The domain instance for which @closid is being tested.
> + * @cbm: Capacity bitmask being tested.
> + * @closid: Intended closid for @cbm.
> + * @type: CDP type of @r.
> + * @exclusive: Only check if overlaps with exclusive resource groups
> + *
> + * Checks if provided @cbm intended to be used for @closid on domain
> + * @d overlaps with any other closids or other hardware usage associated
> + * with this domain. If @exclusive is true then only overlaps with
> + * resource groups in exclusive mode will be considered. If @exclusive
> + * is false then overlaps with any resource group or hardware entities
> + * will be considered.
> + *
> + * @cbm is unsigned long, even if only 32 bits are used, to make the
> + * bitmap functions work correctly.
> + *
> + * Return: false if CBM does not overlap, true if it does.
> + */
> +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
> +				    unsigned long cbm, int closid,
> +				    enum resctrl_conf_type type, bool exclusive)
> +{
> +	enum rdtgrp_mode mode;
> +	unsigned long ctrl_b;
> +	int i;
> +
> +	/* Check for any overlap with regions used by hardware directly */
> +	if (!exclusive) {
> +		ctrl_b = r->cache.shareable_bits;
> +		if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
> +			return true;
> +	}
> +
> +	/* Check for overlap with other resource groups */
> +	for (i = 0; i < closids_supported(); i++) {
> +		ctrl_b = resctrl_arch_get_config(r, d, i, type);
> +		mode = rdtgroup_mode_by_closid(i);
> +		if (closid_allocated(i) && i != closid &&
> +		    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
> +			if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
> +				if (exclusive) {
> +					if (mode == RDT_MODE_EXCLUSIVE)
> +						return true;
> +					continue;
> +				}
> +				return true;
> +			}
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +/**
> + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
> + * @s: Schema for the resource to which domain instance @d belongs.
> + * @d: The domain instance for which @closid is being tested.
> + * @cbm: Capacity bitmask being tested.
> + * @closid: Intended closid for @cbm.
> + * @exclusive: Only check if overlaps with exclusive resource groups
> + *
> + * Resources that can be allocated using a CBM can use the CBM to control
> + * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
> + * for overlap. Overlap test is not limited to the specific resource for
> + * which the CBM is intended though - when dealing with CDP resources that
> + * share the underlying hardware the overlap check should be performed on
> + * the CDP resource sharing the hardware also.
> + *
> + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
> + * overlap test.
> + *
> + * Return: true if CBM overlap detected, false if there is no overlap
> + */
> +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
> +			   unsigned long cbm, int closid, bool exclusive)
> +{
> +	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
> +	struct rdt_resource *r = s->res;
> +
> +	if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
> +				    exclusive))
> +		return true;
> +
> +	if (!resctrl_arch_get_cdp_enabled(r->rid))
> +		return false;
> +	return  __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
> +}
> +
> +/**
> + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
> + * @rdtgrp: Resource group identified through its closid.
> + *
> + * An exclusive resource group implies that there should be no sharing of
> + * its allocated resources. At the time this group is considered to be
> + * exclusive this test can determine if its current schemata supports this
> + * setting by testing for overlap with all other resource groups.
> + *
> + * Return: true if resource group can be exclusive, false if there is overlap
> + * with allocations of other resource groups and thus this resource group
> + * cannot be exclusive.
> + */
> +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
> +{
> +	int closid = rdtgrp->closid;
> +	struct resctrl_schema *s;
> +	struct rdt_resource *r;
> +	bool has_cache = false;
> +	struct rdt_domain *d;
> +	u32 ctrl;
> +
> +	/* Walking r->domains, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +	list_for_each_entry(s, &resctrl_schema_all, list) {
> +		r = s->res;
> +		if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
> +			continue;
> +		has_cache = true;
> +		list_for_each_entry(d, &r->domains, list) {
> +			ctrl = resctrl_arch_get_config(r, d, closid,
> +						       s->conf_type);
> +			if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
> +				rdt_last_cmd_puts("Schemata overlaps\n");
> +				return false;
> +			}
> +		}
> +	}
> +
> +	if (!has_cache) {
> +		rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +/*
> + * rdtgroup_mode_write - Modify the resource group's mode
> + */
> +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
> +				   char *buf, size_t nbytes, loff_t off)
> +{
> +	struct rdtgroup *rdtgrp;
> +	enum rdtgrp_mode mode;
> +	int ret = 0;
> +
> +	/* Valid input requires a trailing newline */
> +	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> +		return -EINVAL;
> +	buf[nbytes - 1] = '\0';
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (!rdtgrp) {
> +		rdtgroup_kn_unlock(of->kn);
> +		return -ENOENT;
> +	}
> +
> +	rdt_last_cmd_clear();
> +
> +	mode = rdtgrp->mode;
> +
> +	if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
> +	    (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
> +	    (!strcmp(buf, "pseudo-locksetup") &&
> +	     mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
> +	    (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
> +		goto out;
> +
> +	if (mode == RDT_MODE_PSEUDO_LOCKED) {
> +		rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!strcmp(buf, "shareable")) {
> +		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +			ret = rdtgroup_locksetup_exit(rdtgrp);
> +			if (ret)
> +				goto out;
> +		}
> +		rdtgrp->mode = RDT_MODE_SHAREABLE;
> +	} else if (!strcmp(buf, "exclusive")) {
> +		if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +			ret = rdtgroup_locksetup_exit(rdtgrp);
> +			if (ret)
> +				goto out;
> +		}
> +		rdtgrp->mode = RDT_MODE_EXCLUSIVE;
> +	} else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
> +		   !strcmp(buf, "pseudo-locksetup")) {
> +		ret = rdtgroup_locksetup_enter(rdtgrp);
> +		if (ret)
> +			goto out;
> +		rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
> +	} else {
> +		rdt_last_cmd_puts("Unknown or unsupported mode\n");
> +		ret = -EINVAL;
> +	}
> +
> +out:
> +	rdtgroup_kn_unlock(of->kn);
> +	return ret ?: nbytes;
> +}
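
The recognized mode strings are "shareable", "exclusive" and
"pseudo-locksetup" (writing the current mode back is a no-op), and valid
input must end in a newline. A tiny sketch switching a hypothetical group
"p1" to exclusive mode:

  #include <stdio.h>

  int set_mode_exclusive_example(void)
  {
  	FILE *f = fopen("/sys/fs/resctrl/p1/mode", "w");

  	if (!f)
  		return -1;
  	fprintf(f, "exclusive\n");	/* trailing '\n' is required */
  	return fclose(f);
  }
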
> +
> +/**
> + * rdtgroup_cbm_to_size - Translate CBM to size in bytes
> + * @r: RDT resource to which @d belongs.
> + * @d: RDT domain instance.
> + * @cbm: bitmask for which the size should be computed.
> + *
> + * The bitmask provided, associated with the RDT domain instance @d, will be
> + * translated into how many bytes it represents. The size in bytes is
> + * computed by first dividing the total cache size by the CBM length to
> + * determine how many bytes each bit in the bitmask represents. The result
> + * is multiplied with the number of bits set in the bitmask.
> + *
> + * @cbm is unsigned long, even if only 32 bits are used, to make the
> + * bitmap functions work correctly.
> + */
> +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
> +				  struct rdt_domain *d, unsigned long cbm)
> +{
> +	struct cpu_cacheinfo *ci;
> +	unsigned int size = 0;
> +	int num_b, i;
> +
> +	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
> +	ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
> +	for (i = 0; i < ci->num_leaves; i++) {
> +		if (ci->info_list[i].level == r->cache_level) {
> +			size = ci->info_list[i].size / r->cache.cbm_len * num_b;
> +			break;
> +		}
> +	}
> +
> +	return size;
> +}
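
To make the arithmetic concrete, a standalone sketch of the same size
computation with made-up numbers (32MB L3 cache, 11-bit CBM, 5 bits set in
the CBM):

  /* 33554432 / 11 * 5 = 15252010 bytes, i.e. roughly 14.5 MiB. */
  unsigned int cbm_to_size_example(void)
  {
  	unsigned int cache_bytes = 32u * 1024 * 1024;	/* assumed L3 size */
  	unsigned int cbm_len = 11;			/* assumed CBM length */
  	unsigned long cbm = 0x1f;			/* 5 contiguous bits set */

  	return cache_bytes / cbm_len * __builtin_popcountl(cbm);
  }
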
> +
> +/*
> + * rdtgroup_size_show - Display size in bytes of allocated regions
> + *
> + * The "size" file mirrors the layout of the "schemata" file, printing the
> + * size in bytes of each region instead of the capacity bitmask.
> + */
> +static int rdtgroup_size_show(struct kernfs_open_file *of,
> +			      struct seq_file *s, void *v)
> +{
> +	struct resctrl_schema *schema;
> +	enum resctrl_conf_type type;
> +	struct rdtgroup *rdtgrp;
> +	struct rdt_resource *r;
> +	struct rdt_domain *d;
> +	unsigned int size;
> +	int ret = 0;
> +	u32 closid;
> +	bool sep;
> +	u32 ctrl;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(of->kn);
> +	if (!rdtgrp) {
> +		rdtgroup_kn_unlock(of->kn);
> +		return -ENOENT;
> +	}
> +
> +	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> +		if (!rdtgrp->plr->d) {
> +			rdt_last_cmd_clear();
> +			rdt_last_cmd_puts("Cache domain offline\n");
> +			ret = -ENODEV;
> +		} else {
> +			seq_printf(s, "%*s:", max_name_width,
> +				   rdtgrp->plr->s->name);
> +			size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
> +						    rdtgrp->plr->d,
> +						    rdtgrp->plr->cbm);
> +			seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
> +		}
> +		goto out;
> +	}
> +
> +	closid = rdtgrp->closid;
> +
> +	list_for_each_entry(schema, &resctrl_schema_all, list) {
> +		r = schema->res;
> +		type = schema->conf_type;
> +		sep = false;
> +		seq_printf(s, "%*s:", max_name_width, schema->name);
> +		list_for_each_entry(d, &r->domains, list) {
> +			if (sep)
> +				seq_putc(s, ';');
> +			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
> +				size = 0;
> +			} else {
> +				if (is_mba_sc(r))
> +					ctrl = d->mbps_val[closid];
> +				else
> +					ctrl = resctrl_arch_get_config(r, d,
> +								       closid,
> +								       type);
> +				if (r->rid == RDT_RESOURCE_MBA ||
> +				    r->rid == RDT_RESOURCE_SMBA)
> +					size = ctrl;
> +				else
> +					size = rdtgroup_cbm_to_size(r, d, ctrl);
> +			}
> +			seq_printf(s, "%d=%u", d->id, size);
> +			sep = true;
> +		}
> +		seq_putc(s, '\n');
> +	}
> +
> +out:
> +	rdtgroup_kn_unlock(of->kn);
> +
> +	return ret;
> +}
> +
> +static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
> +{
> +	smp_call_function_any(&mon_info->d->cpu_mask,
> +			      resctrl_arch_mon_event_config_read, mon_info, 1);
> +}
> +
> +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
> +{
> +	struct resctrl_mon_config_info mon_info = {0};
> +	struct rdt_domain *dom;
> +	bool sep = false;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	list_for_each_entry(dom, &r->domains, list) {
> +		if (sep)
> +			seq_puts(s, ";");
> +
> +		memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
> +		mon_info.r = r;
> +		mon_info.d = dom;
> +		mon_info.evtid = evtid;
> +		mondata_config_read(&mon_info);
> +
> +		seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
> +		sep = true;
> +	}
> +	seq_puts(s, "\n");
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +
> +	return 0;
> +}
> +
> +static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
> +				       struct seq_file *seq, void *v)
> +{
> +	struct rdt_resource *r = of->kn->parent->priv;
> +
> +	mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
> +
> +	return 0;
> +}
> +
> +static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
> +				       struct seq_file *seq, void *v)
> +{
> +	struct rdt_resource *r = of->kn->parent->priv;
> +
> +	mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
> +
> +	return 0;
> +}
> +
> +static int mbm_config_write_domain(struct rdt_resource *r,
> +				   struct rdt_domain *d, u32 evtid, u32 val)
> +{
> +	struct resctrl_mon_config_info mon_info = {0};
> +
> +	/*
> +	 * Read the current config value first. If both are the same then
> +	 * no need to write it again.
> +	 */
> +	mon_info.r = r;
> +	mon_info.d = d;
> +	mon_info.evtid = evtid;
> +	mondata_config_read(&mon_info);
> +	if (mon_info.mon_config == val)
> +		return 0;
> +
> +	mon_info.mon_config = val;
> +
> +	/*
> +	 * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
> +	 * domain. The MSRs offset from MSR_IA32_EVT_CFG_BASE
> +	 * are scoped at the domain level. Writing any of these MSRs
> +	 * on one CPU is observed by all the CPUs in the domain.
> +	 */
> +	smp_call_function_any(&d->cpu_mask, resctrl_arch_mon_event_config_write,
> +			      &mon_info, 1);
> +	if (mon_info.err) {
> +		rdt_last_cmd_puts("Invalid event configuration\n");
> +		return mon_info.err;
> +	}
> +
> +	/*
> +	 * When an Event Configuration is changed, the bandwidth counters
> +	 * for all RMIDs and Events will be cleared by the hardware. The
> +	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
> +	 * every RMID on the next read of any event.
> +	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
> +	 * cleared while the RMID is tracked by the hardware. Clear the
> +	 * mbm_local and mbm_total counts for all the RMIDs.
> +	 */
> +	resctrl_arch_reset_rmid_all(r, d);
> +
> +	return 0;
> +}
> +
> +static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
> +{
> +	char *dom_str = NULL, *id_str;
> +	unsigned long dom_id, val;
> +	struct rdt_domain *d;
> +	int err;
> +
> +	/* Walking r->domains, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +next:
> +	if (!tok || tok[0] == '\0')
> +		return 0;
> +
> +	/* Start processing the strings for each domain */
> +	dom_str = strim(strsep(&tok, ";"));
> +	id_str = strsep(&dom_str, "=");
> +
> +	if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
> +		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
> +		rdt_last_cmd_puts("Non-numeric event configuration value\n");
> +		return -EINVAL;
> +	}
> +
> +	/* Value from user cannot be more than the supported set of events */
> +	if ((val & r->mbm_cfg_mask) != val) {
> +		rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
> +				    r->mbm_cfg_mask);
> +		return -EINVAL;
> +	}
> +
> +	list_for_each_entry(d, &r->domains, list) {
> +		if (d->id == dom_id) {
> +			err = mbm_config_write_domain(r, d, evtid, val);
> +			if (err)
> +				return err;
> +			goto next;
> +		}
> +	}
> +
> +	return -EINVAL;
> +}
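
The accepted syntax is a semicolon-separated list of "<domain id>=<hex
value>" pairs, with the value restricted to bits inside mbm_cfg_mask. A
sketch writing an arbitrary illustrative value (0x33) for domains 0 and 1;
the info/L3_MON path is an assumption about where the file is exposed:

  #include <stdio.h>

  int set_mbm_total_config_example(void)
  {
  	FILE *f = fopen("/sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config", "w");

  	if (!f)
  		return -1;
  	/* Bits outside mbm_cfg_mask are rejected with -EINVAL. */
  	fprintf(f, "0=0x33;1=0x33\n");
  	return fclose(f);
  }
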
> +
> +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
> +					    char *buf, size_t nbytes,
> +					    loff_t off)
> +{
> +	struct rdt_resource *r = of->kn->parent->priv;
> +	int ret;
> +
> +	/* Valid input requires a trailing newline */
> +	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> +		return -EINVAL;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdt_last_cmd_clear();
> +
> +	buf[nbytes - 1] = '\0';
> +
> +	ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +
> +	return ret ?: nbytes;
> +}
> +
> +static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
> +					    char *buf, size_t nbytes,
> +					    loff_t off)
> +{
> +	struct rdt_resource *r = of->kn->parent->priv;
> +	int ret;
> +
> +	/* Valid input requires a trailing newline */
> +	if (nbytes == 0 || buf[nbytes - 1] != '\n')
> +		return -EINVAL;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdt_last_cmd_clear();
> +
> +	buf[nbytes - 1] = '\0';
> +
> +	ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +
> +	return ret ?: nbytes;
> +}
> +
> +/* rdtgroup information files for one cache resource. */
> +static struct rftype res_common_files[] = {
> +	{
> +		.name		= "last_cmd_status",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_last_cmd_status_show,
> +		.fflags		= RFTYPE_TOP_INFO,
> +	},
> +	{
> +		.name		= "num_closids",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_num_closids_show,
> +		.fflags		= RFTYPE_CTRL_INFO,
> +	},
> +	{
> +		.name		= "mon_features",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_mon_features_show,
> +		.fflags		= RFTYPE_MON_INFO,
> +	},
> +	{
> +		.name		= "num_rmids",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_num_rmids_show,
> +		.fflags		= RFTYPE_MON_INFO,
> +	},
> +	{
> +		.name		= "cbm_mask",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_default_ctrl_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> +	},
> +	{
> +		.name		= "min_cbm_bits",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_min_cbm_bits_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> +	},
> +	{
> +		.name		= "shareable_bits",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_shareable_bits_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> +	},
> +	{
> +		.name		= "bit_usage",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_bit_usage_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> +	},
> +	{
> +		.name		= "min_bandwidth",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_min_bw_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
> +	},
> +	{
> +		.name		= "bandwidth_gran",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_bw_gran_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
> +	},
> +	{
> +		.name		= "delay_linear",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_delay_linear_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
> +	},
> +	/*
> +	 * It is platform specific which (if any) capabilities are provided
> +	 * by thread_throttle_mode. Defer "fflags" initialization to
> +	 * platform discovery.
> +	 */
> +	{
> +		.name		= "thread_throttle_mode",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_thread_throttle_mode_show,
> +	},
> +	{
> +		.name		= "max_threshold_occupancy",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.write		= max_threshold_occ_write,
> +		.seq_show	= max_threshold_occ_show,
> +		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
> +	},
> +	{
> +		.name		= "mbm_total_bytes_config",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= mbm_total_bytes_config_show,
> +		.write		= mbm_total_bytes_config_write,
> +	},
> +	{
> +		.name		= "mbm_local_bytes_config",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= mbm_local_bytes_config_show,
> +		.write		= mbm_local_bytes_config_write,
> +	},
> +	{
> +		.name		= "cpus",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.write		= rdtgroup_cpus_write,
> +		.seq_show	= rdtgroup_cpus_show,
> +		.fflags		= RFTYPE_BASE,
> +	},
> +	{
> +		.name		= "cpus_list",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.write		= rdtgroup_cpus_write,
> +		.seq_show	= rdtgroup_cpus_show,
> +		.flags		= RFTYPE_FLAGS_CPUS_LIST,
> +		.fflags		= RFTYPE_BASE,
> +	},
> +	{
> +		.name		= "tasks",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.write		= rdtgroup_tasks_write,
> +		.seq_show	= rdtgroup_tasks_show,
> +		.fflags		= RFTYPE_BASE,
> +	},
> +	{
> +		.name		= "mon_hw_id",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdtgroup_rmid_show,
> +		.fflags		= RFTYPE_MON_BASE | RFTYPE_DEBUG,
> +	},
> +	{
> +		.name		= "schemata",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.write		= rdtgroup_schemata_write,
> +		.seq_show	= rdtgroup_schemata_show,
> +		.fflags		= RFTYPE_CTRL_BASE,
> +	},
> +	{
> +		.name		= "mode",
> +		.mode		= 0644,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.write		= rdtgroup_mode_write,
> +		.seq_show	= rdtgroup_mode_show,
> +		.fflags		= RFTYPE_CTRL_BASE,
> +	},
> +	{
> +		.name		= "size",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdtgroup_size_show,
> +		.fflags		= RFTYPE_CTRL_BASE,
> +	},
> +	{
> +		.name		= "sparse_masks",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdt_has_sparse_bitmasks_show,
> +		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
> +	},
> +	{
> +		.name		= "ctrl_hw_id",
> +		.mode		= 0444,
> +		.kf_ops		= &rdtgroup_kf_single_ops,
> +		.seq_show	= rdtgroup_closid_show,
> +		.fflags		= RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
> +	},
> +
> +};
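
Since "cpus" and "cpus_list" above share the same handlers and differ only by the RFTYPE_FLAGS_CPUS_LIST flag, a quick user-space sketch of assigning CPUs via the list form; the group name and CPU range are examples, and the cpulist syntax ("2-3,6") is assumed:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/resctrl/grp1/cpus_list";
	const char *cpus = "2-3\n";		/* cpulist format, e.g. "2-3,6" */
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, cpus, strlen(cpus)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
	return 0;
}
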
> +
> +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
> +{
> +	struct rftype *rfts, *rft;
> +	int ret, len;
> +
> +	rfts = res_common_files;
> +	len = ARRAY_SIZE(res_common_files);
> +
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	if (resctrl_debug)
> +		fflags |= RFTYPE_DEBUG;
> +
> +	for (rft = rfts; rft < rfts + len; rft++) {
> +		if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
> +			ret = rdtgroup_add_file(kn, rft);
> +			if (ret)
> +				goto error;
> +		}
> +	}
> +
> +	return 0;
> +error:
> +	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
> +	while (--rft >= rfts) {
> +		if ((fflags & rft->fflags) == rft->fflags)
> +			kernfs_remove_by_name(kn, rft->name);
> +	}
> +	return ret;
> +}
> +
> +static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
> +{
> +	struct rftype *rfts, *rft;
> +	int len;
> +
> +	rfts = res_common_files;
> +	len = ARRAY_SIZE(res_common_files);
> +
> +	for (rft = rfts; rft < rfts + len; rft++) {
> +		if (!strcmp(rft->name, name))
> +			return rft;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void thread_throttle_mode_init(void)
> +{
> +	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> +	struct rftype *rft;
> +
> +	if (!r->alloc_capable ||
> +	    r->membw.throttle_mode == THREAD_THROTTLE_UNDEFINED)
> +		return;
> +
> +	rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
> +	if (!rft)
> +		return;
> +
> +	rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
> +}
> +
> +void mbm_config_rftype_init(const char *config)
> +{
> +	struct rftype *rft;
> +
> +	rft = rdtgroup_get_rftype_by_name(config);
> +	if (rft)
> +		rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
> +}
> +
> +/**
> + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
> + * @r: The resource group with which the file is associated.
> + * @name: Name of the file
> + *
> + * The permissions of the named resctrl file, directory, or link are
> + * modified to not allow read, write, or execute by any user.
> + *
> + * WARNING: This function is intended to communicate to the user that the
> + * resctrl file has been locked down - that it is not relevant to the
> + * particular state the system finds itself in. It should not be relied
> + * on to protect from user access because after the file's permissions
> + * are restricted the user can still change the permissions using chmod
> + * from the command line.
> + *
> + * Return: 0 on success, <0 on failure.
> + */
> +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
> +{
> +	struct iattr iattr = {.ia_valid = ATTR_MODE,};
> +	struct kernfs_node *kn;
> +	int ret = 0;
> +
> +	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
> +	if (!kn)
> +		return -ENOENT;
> +
> +	switch (kernfs_type(kn)) {
> +	case KERNFS_DIR:
> +		iattr.ia_mode = S_IFDIR;
> +		break;
> +	case KERNFS_FILE:
> +		iattr.ia_mode = S_IFREG;
> +		break;
> +	case KERNFS_LINK:
> +		iattr.ia_mode = S_IFLNK;
> +		break;
> +	}
> +
> +	ret = kernfs_setattr(kn, &iattr);
> +	kernfs_put(kn);
> +	return ret;
> +}
> +
> +/**
> + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
> + * @r: The resource group with which the file is associated.
> + * @name: Name of the file
> + * @mask: Mask of permissions that should be restored
> + *
> + * Restore the permissions of the named file. If @name is a directory the
> + * permissions of its parent will be used.
> + *
> + * Return: 0 on success, <0 on failure.
> + */
> +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
> +			     umode_t mask)
> +{
> +	struct iattr iattr = {.ia_valid = ATTR_MODE,};
> +	struct kernfs_node *kn, *parent;
> +	struct rftype *rfts, *rft;
> +	int ret, len;
> +
> +	rfts = res_common_files;
> +	len = ARRAY_SIZE(res_common_files);
> +
> +	for (rft = rfts; rft < rfts + len; rft++) {
> +		if (!strcmp(rft->name, name))
> +			iattr.ia_mode = rft->mode & mask;
> +	}
> +
> +	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
> +	if (!kn)
> +		return -ENOENT;
> +
> +	switch (kernfs_type(kn)) {
> +	case KERNFS_DIR:
> +		parent = kernfs_get_parent(kn);
> +		if (parent) {
> +			iattr.ia_mode |= parent->mode;
> +			kernfs_put(parent);
> +		}
> +		iattr.ia_mode |= S_IFDIR;
> +		break;
> +	case KERNFS_FILE:
> +		iattr.ia_mode |= S_IFREG;
> +		break;
> +	case KERNFS_LINK:
> +		iattr.ia_mode |= S_IFLNK;
> +		break;
> +	}
> +
> +	ret = kernfs_setattr(kn, &iattr);
> +	kernfs_put(kn);
> +	return ret;
> +}
> +
> +static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
> +				      unsigned long fflags)
> +{
> +	struct kernfs_node *kn_subdir;
> +	int ret;
> +
> +	kn_subdir = kernfs_create_dir(kn_info, name,
> +				      kn_info->mode, priv);
> +	if (IS_ERR(kn_subdir))
> +		return PTR_ERR(kn_subdir);
> +
> +	ret = rdtgroup_kn_set_ugid(kn_subdir);
> +	if (ret)
> +		return ret;
> +
> +	ret = rdtgroup_add_files(kn_subdir, fflags);
> +	if (!ret)
> +		kernfs_activate(kn_subdir);
> +
> +	return ret;
> +}
> +
> +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
> +{
> +	enum resctrl_res_level i;
> +	struct resctrl_schema *s;
> +	struct rdt_resource *r;
> +	unsigned long fflags;
> +	char name[32];
> +	int ret;
> +
> +	/* create the directory */
> +	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
> +	if (IS_ERR(kn_info))
> +		return PTR_ERR(kn_info);
> +
> +	ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
> +	if (ret)
> +		goto out_destroy;
> +
> +	/* loop over enabled controls, these are all alloc_capable */
> +	list_for_each_entry(s, &resctrl_schema_all, list) {
> +		r = s->res;
> +		fflags = r->fflags | RFTYPE_CTRL_INFO;
> +		ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
> +		if (ret)
> +			goto out_destroy;
> +	}
> +
> +	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> +		r = resctrl_arch_get_resource(i);
> +		if (!r->mon_capable)
> +			continue;
> +
> +		fflags =  r->fflags | RFTYPE_MON_INFO;
> +		sprintf(name, "%s_MON", r->name);
> +		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
> +		if (ret)
> +			goto out_destroy;
> +	}
> +
> +	ret = rdtgroup_kn_set_ugid(kn_info);
> +	if (ret)
> +		goto out_destroy;
> +
> +	kernfs_activate(kn_info);
> +
> +	return 0;
> +
> +out_destroy:
> +	kernfs_remove(kn_info);
> +	return ret;
> +}
> +
> +static int
> +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
> +		    char *name, struct kernfs_node **dest_kn)
> +{
> +	struct kernfs_node *kn;
> +	int ret;
> +
> +	/* create the directory */
> +	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
> +	if (IS_ERR(kn))
> +		return PTR_ERR(kn);
> +
> +	if (dest_kn)
> +		*dest_kn = kn;
> +
> +	ret = rdtgroup_kn_set_ugid(kn);
> +	if (ret)
> +		goto out_destroy;
> +
> +	kernfs_activate(kn);
> +
> +	return 0;
> +
> +out_destroy:
> +	kernfs_remove(kn);
> +	return ret;
> +}
> +
> +static inline bool is_mba_linear(void)
> +{
> +	return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
> +}
> +
> +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
> +{
> +	u32 num_closid = resctrl_arch_get_num_closid(r);
> +	int cpu = cpumask_any(&d->cpu_mask);
> +	int i;
> +
> +	d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
> +				   GFP_KERNEL, cpu_to_node(cpu));
> +	if (!d->mbps_val)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < num_closid; i++)
> +		d->mbps_val[i] = MBA_MAX_MBPS;
> +
> +	return 0;
> +}
> +
> +static void mba_sc_domain_destroy(struct rdt_resource *r,
> +				  struct rdt_domain *d)
> +{
> +	kfree(d->mbps_val);
> +	d->mbps_val = NULL;
> +}
> +
> +/*
> + * MBA software controller is supported only if
> + * MBM is supported and MBA is in linear scale.
> + */
> +static bool supports_mba_mbps(void)
> +{
> +	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> +
> +	return (resctrl_arch_is_mbm_local_enabled() &&
> +		r->alloc_capable && is_mba_linear());
> +}
> +
> +/*
> + * Enable or disable the MBA software controller
> + * which helps user specify bandwidth in MBps.
> + */
> +static int set_mba_sc(bool mba_sc)
> +{
> +	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
> +	u32 num_closid = resctrl_arch_get_num_closid(r);
> +	struct rdt_domain *d;
> +	int i;
> +
> +	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
> +		return -EINVAL;
> +
> +	r->membw.mba_sc = mba_sc;
> +
> +	list_for_each_entry(d, &r->domains, list) {
> +		for (i = 0; i < num_closid; i++)
> +			d->mbps_val[i] = MBA_MAX_MBPS;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * We don't allow rdtgroup directories to be created anywhere
> + * except the root directory. Thus, when looking for the rdtgroup
> + * structure for a kernfs node we are either looking at a directory,
> + * in which case the rdtgroup structure is pointed at by the "priv"
> + * field, or at a file, in which case we need only look to the parent
> + * directory to find the rdtgroup.
> + */
> +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
> +{
> +	if (kernfs_type(kn) == KERNFS_DIR) {
> +		/*
> +		 * All the resource directories use "kn->priv"
> +		 * to point to the "struct rdtgroup" for the
> +		 * resource. "info" and its subdirectories don't
> +		 * have rdtgroup structures, so return NULL here.
> +		 */
> +		if (kn == kn_info || kn->parent == kn_info)
> +			return NULL;
> +		else
> +			return kn->priv;
> +	} else {
> +		return kn->parent->priv;
> +	}
> +}
> +
> +static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
> +{
> +	atomic_inc(&rdtgrp->waitcount);
> +	kernfs_break_active_protection(kn);
> +}
> +
> +static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
> +{
> +	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
> +	    (rdtgrp->flags & RDT_DELETED)) {
> +		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> +		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
> +			rdtgroup_pseudo_lock_remove(rdtgrp);
> +		kernfs_unbreak_active_protection(kn);
> +		rdtgroup_remove(rdtgrp);
> +	} else {
> +		kernfs_unbreak_active_protection(kn);
> +	}
> +}
> +
> +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
> +{
> +	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
> +
> +	if (!rdtgrp)
> +		return NULL;
> +
> +	rdtgroup_kn_get(rdtgrp, kn);
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	/* Was this group deleted while we waited? */
> +	if (rdtgrp->flags & RDT_DELETED)
> +		return NULL;
> +
> +	return rdtgrp;
> +}
> +
> +void rdtgroup_kn_unlock(struct kernfs_node *kn)
> +{
> +	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
> +
> +	if (!rdtgrp)
> +		return;
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +
> +	rdtgroup_kn_put(rdtgrp, kn);
> +}
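
The lock_live()/unlock() pair above is the idiom every resctrl file and directory handler is expected to follow (rdtgroup_rmdir() further down is one user). A minimal sketch of the expected shape; example_handler() is a hypothetical caller:

static int example_handler(struct kernfs_node *kn)
{
	struct rdtgroup *rdtgrp;
	int ret = 0;

	/* Takes cpus_read_lock() and rdtgroup_mutex, pins the group. */
	rdtgrp = rdtgroup_kn_lock_live(kn);
	if (!rdtgrp) {
		ret = -ENOENT;
		goto out;
	}

	/* ... operate on rdtgrp while it cannot disappear ... */

out:
	/* Always unlock; it copes with the "group was deleted" case too. */
	rdtgroup_kn_unlock(kn);
	return ret;
}
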
> +
> +static int mkdir_mondata_all(struct kernfs_node *parent_kn,
> +			     struct rdtgroup *prgrp,
> +			     struct kernfs_node **mon_data_kn);
> +
> +static void rdt_disable_ctx(void)
> +{
> +	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
> +	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
> +	set_mba_sc(false);
> +
> +	resctrl_debug = false;
> +}
> +
> +static int rdt_enable_ctx(struct rdt_fs_context *ctx)
> +{
> +	int ret = 0;
> +
> +	if (ctx->enable_cdpl2) {
> +		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
> +		if (ret)
> +			goto out_done;
> +	}
> +
> +	if (ctx->enable_cdpl3) {
> +		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
> +		if (ret)
> +			goto out_cdpl2;
> +	}
> +
> +	if (ctx->enable_mba_mbps) {
> +		ret = set_mba_sc(true);
> +		if (ret)
> +			goto out_cdpl3;
> +	}
> +
> +	if (ctx->enable_debug)
> +		resctrl_debug = true;
> +
> +	return 0;
> +
> +out_cdpl3:
> +	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
> +out_cdpl2:
> +	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
> +out_done:
> +	return ret;
> +}
> +
> +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
> +{
> +	struct resctrl_schema *s;
> +	const char *suffix = "";
> +	int ret, cl;
> +
> +	s = kzalloc(sizeof(*s), GFP_KERNEL);
> +	if (!s)
> +		return -ENOMEM;
> +
> +	s->res = r;
> +	s->num_closid = resctrl_arch_get_num_closid(r);
> +	if (resctrl_arch_get_cdp_enabled(r->rid))
> +		s->num_closid /= 2;
> +
> +	s->conf_type = type;
> +	switch (type) {
> +	case CDP_CODE:
> +		suffix = "CODE";
> +		break;
> +	case CDP_DATA:
> +		suffix = "DATA";
> +		break;
> +	case CDP_NONE:
> +		suffix = "";
> +		break;
> +	}
> +
> +	ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
> +	if (ret >= sizeof(s->name)) {
> +		kfree(s);
> +		return -EINVAL;
> +	}
> +
> +	cl = strlen(s->name);
> +
> +	/*
> +	 * If CDP is supported by this resource, but not enabled,
> +	 * include the suffix. This ensures the tabular format of the
> +	 * schemata file does not change between mounts of the filesystem.
> +	 */
> +	if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
> +		cl += 4;
> +
> +	if (cl > max_name_width)
> +		max_name_width = cl;
> +
> +	/*
> +	 * Choose a width for the resource data based on the resource that has
> +	 * the widest name and cbm.
> +	 */
> +	max_data_width = max(max_data_width, r->data_width);
> +
> +	INIT_LIST_HEAD(&s->list);
> +	list_add(&s->list, &resctrl_schema_all);
> +
> +	return 0;
> +}
> +
> +static int schemata_list_create(void)
> +{
> +	enum resctrl_res_level i;
> +	struct rdt_resource *r;
> +	int ret = 0;
> +
> +	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> +		r = resctrl_arch_get_resource(i);
> +		if (!r->alloc_capable)
> +			continue;
> +
> +		if (resctrl_arch_get_cdp_enabled(r->rid)) {
> +			ret = schemata_list_add(r, CDP_CODE);
> +			if (ret)
> +				break;
> +
> +			ret = schemata_list_add(r, CDP_DATA);
> +		} else {
> +			ret = schemata_list_add(r, CDP_NONE);
> +		}
> +
> +		if (ret)
> +			break;
> +	}
> +
> +	return ret;
> +}
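
One consequence of the suffix handling above: with CDP enabled on L3 the schemata entries are named L3CODE and L3DATA, and the extra column width is reserved even while CDP is off so the table shape stays stable across mounts. A user-space sketch that just dumps the default group's schemata; the path and the output shape are illustrative:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/resctrl/schemata", "r");
	char line[256];

	if (!f) {
		perror("schemata");
		return 1;
	}
	/* With CDP on, expect lines such as "L3CODE:0=fff" and "L3DATA:0=fff". */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
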
> +
> +static void schemata_list_destroy(void)
> +{
> +	struct resctrl_schema *s, *tmp;
> +
> +	list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
> +		list_del(&s->list);
> +		kfree(s);
> +	}
> +}
> +
> +static int rdt_get_tree(struct fs_context *fc)
> +{
> +	struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> +	struct rdt_fs_context *ctx = rdt_fc2context(fc);
> +	unsigned long flags = RFTYPE_CTRL_BASE;
> +	struct rdt_domain *dom;
> +	int ret;
> +
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +	/*
> +	 * The resctrl filesystem can only be mounted once.
> +	 */
> +	if (resctrl_mounted) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +
> +	ret = rdtgroup_setup_root(ctx);
> +	if (ret)
> +		goto out;
> +
> +	ret = rdt_enable_ctx(ctx);
> +	if (ret)
> +		goto out_root;
> +
> +	ret = schemata_list_create();
> +	if (ret) {
> +		schemata_list_destroy();
> +		goto out_ctx;
> +	}
> +
> +	closid_init();
> +
> +	if (resctrl_arch_mon_capable())
> +		flags |= RFTYPE_MON;
> +
> +	ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
> +	if (ret)
> +		goto out_schemata_free;
> +
> +	kernfs_activate(rdtgroup_default.kn);
> +
> +	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
> +	if (ret < 0)
> +		goto out_schemata_free;
> +
> +	if (resctrl_arch_mon_capable()) {
> +		ret = mongroup_create_dir(rdtgroup_default.kn,
> +					  &rdtgroup_default, "mon_groups",
> +					  &kn_mongrp);
> +		if (ret < 0)
> +			goto out_info;
> +
> +		ret = mkdir_mondata_all(rdtgroup_default.kn,
> +					&rdtgroup_default, &kn_mondata);
> +		if (ret < 0)
> +			goto out_mongrp;
> +		rdtgroup_default.mon.mon_data_kn = kn_mondata;
> +	}
> +
> +	ret = rdt_pseudo_lock_init();
> +	if (ret)
> +		goto out_mondata;
> +
> +	ret = kernfs_get_tree(fc);
> +	if (ret < 0)
> +		goto out_psl;
> +
> +	if (resctrl_arch_alloc_capable())
> +		resctrl_arch_enable_alloc();
> +	if (resctrl_arch_mon_capable())
> +		resctrl_arch_enable_mon();
> +
> +	if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
> +		resctrl_mounted = true;
> +
> +	if (resctrl_is_mbm_enabled()) {
> +		list_for_each_entry(dom, &l3->domains, list)
> +			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
> +						   RESCTRL_PICK_ANY_CPU);
> +	}
> +
> +	goto out;
> +
> +out_psl:
> +	rdt_pseudo_lock_release();
> +out_mondata:
> +	if (resctrl_arch_mon_capable())
> +		kernfs_remove(kn_mondata);
> +out_mongrp:
> +	if (resctrl_arch_mon_capable())
> +		kernfs_remove(kn_mongrp);
> +out_info:
> +	kernfs_remove(kn_info);
> +out_schemata_free:
> +	schemata_list_destroy();
> +out_ctx:
> +	rdt_disable_ctx();
> +out_root:
> +	rdtgroup_destroy_root();
> +out:
> +	rdt_last_cmd_clear();
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +	return ret;
> +}
> +
> +enum rdt_param {
> +	Opt_cdp,
> +	Opt_cdpl2,
> +	Opt_mba_mbps,
> +	Opt_debug,
> +	nr__rdt_params
> +};
> +
> +static const struct fs_parameter_spec rdt_fs_parameters[] = {
> +	fsparam_flag("cdp",		Opt_cdp),
> +	fsparam_flag("cdpl2",		Opt_cdpl2),
> +	fsparam_flag("mba_MBps",	Opt_mba_mbps),
> +	fsparam_flag("debug",		Opt_debug),
> +	{}
> +};
> +
> +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
> +{
> +	struct rdt_fs_context *ctx = rdt_fc2context(fc);
> +	struct fs_parse_result result;
> +	int opt;
> +
> +	opt = fs_parse(fc, rdt_fs_parameters, param, &result);
> +	if (opt < 0)
> +		return opt;
> +
> +	switch (opt) {
> +	case Opt_cdp:
> +		ctx->enable_cdpl3 = true;
> +		return 0;
> +	case Opt_cdpl2:
> +		ctx->enable_cdpl2 = true;
> +		return 0;
> +	case Opt_mba_mbps:
> +		if (!supports_mba_mbps())
> +			return -EINVAL;
> +		ctx->enable_mba_mbps = true;
> +		return 0;
> +	case Opt_debug:
> +		ctx->enable_debug = true;
> +		return 0;
> +	}
> +
> +	return -EINVAL;
> +}
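
The four parameters above map directly onto mount options, so the whole context setup can be exercised with a plain mount(2) call; mba_MBps is rejected up front when supports_mba_mbps() fails. A user-space sketch (needs CAP_SYS_ADMIN; the option combination is just an example):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* The option string is parsed into the parameters handled above. */
	if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0,
		  "cdp,mba_MBps"))
		perror("mount");
	return 0;
}
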
> +
> +static void rdt_fs_context_free(struct fs_context *fc)
> +{
> +	struct rdt_fs_context *ctx = rdt_fc2context(fc);
> +
> +	kernfs_free_fs_context(fc);
> +	kfree(ctx);
> +}
> +
> +static const struct fs_context_operations rdt_fs_context_ops = {
> +	.free		= rdt_fs_context_free,
> +	.parse_param	= rdt_parse_param,
> +	.get_tree	= rdt_get_tree,
> +};
> +
> +static int rdt_init_fs_context(struct fs_context *fc)
> +{
> +	struct rdt_fs_context *ctx;
> +
> +	ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
> +	fc->fs_private = &ctx->kfc;
> +	fc->ops = &rdt_fs_context_ops;
> +	put_user_ns(fc->user_ns);
> +	fc->user_ns = get_user_ns(&init_user_ns);
> +	fc->global = true;
> +	return 0;
> +}
> +
> +/*
> + * Move tasks from one group to the other. If @from is NULL, then all tasks
> + * in the system are moved unconditionally (used for teardown).
> + *
> + * If @mask is not NULL, the CPUs on which moved tasks are running are set
> + * in that mask so that the update SMP function call is restricted to the
> + * affected CPUs.
> + */
> +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
> +				 struct cpumask *mask)
> +{
> +	struct task_struct *p, *t;
> +
> +	read_lock(&tasklist_lock);
> +	for_each_process_thread(p, t) {
> +		if (!from || is_closid_match(t, from) ||
> +		    is_rmid_match(t, from)) {
> +			resctrl_arch_set_closid_rmid(t, to->closid,
> +						     to->mon.rmid);
> +
> +			/*
> +			 * Order the closid/rmid stores above before the loads
> +			 * in task_curr(). This pairs with the full barrier
> +			 * between the rq->curr update and
> +			 * resctrl_arch_sched_in() during context switch.
> +			 */
> +			smp_mb();
> +
> +			/*
> +			 * If the task is on a CPU, set the CPU in the mask.
> +			 * The detection is inaccurate as tasks might move or
> +			 * schedule before the smp function call takes place.
> +			 * In such a case the function call is pointless, but
> +			 * there is no other side effect.
> +			 */
> +			if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
> +				cpumask_set_cpu(task_cpu(t), mask);
> +		}
> +	}
> +	read_unlock(&tasklist_lock);
> +}
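
rdt_move_group_tasks() is the bulk counterpart of the per-task moves done through each group's "tasks" file. For completeness, a user-space sketch of the single-task path; the group name and PID are examples:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/resctrl/grp1/tasks";
	char buf[32];
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return 1;
	snprintf(buf, sizeof(buf), "%d\n", 1234);	/* example PID */
	if (write(fd, buf, strlen(buf)) < 0)
		perror(path);	/* info/last_cmd_status explains failures */
	close(fd);
	return 0;
}
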
> +
> +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
> +{
> +	struct rdtgroup *sentry, *stmp;
> +	struct list_head *head;
> +
> +	head = &rdtgrp->mon.crdtgrp_list;
> +	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
> +		free_rmid(sentry->closid, sentry->mon.rmid);
> +		list_del(&sentry->mon.crdtgrp_list);
> +
> +		if (atomic_read(&sentry->waitcount) != 0)
> +			sentry->flags = RDT_DELETED;
> +		else
> +			rdtgroup_remove(sentry);
> +	}
> +}
> +
> +/*
> + * Forcibly remove all subdirectories under the root directory.
> + */
> +static void rmdir_all_sub(void)
> +{
> +	struct rdtgroup *rdtgrp, *tmp;
> +
> +	/* Move all tasks to the default resource group */
> +	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
> +
> +	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
> +		/* Free any child rmids */
> +		free_all_child_rdtgrp(rdtgrp);
> +
> +		/* Remove each rdtgroup other than root */
> +		if (rdtgrp == &rdtgroup_default)
> +			continue;
> +
> +		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> +		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
> +			rdtgroup_pseudo_lock_remove(rdtgrp);
> +
> +		/*
> +		 * Give any CPUs back to the default group. We cannot copy
> +		 * cpu_online_mask because a CPU might have executed the
> +		 * offline callback already, but is still marked online.
> +		 */
> +		cpumask_or(&rdtgroup_default.cpu_mask,
> +			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
> +
> +		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> +
> +		kernfs_remove(rdtgrp->kn);
> +		list_del(&rdtgrp->rdtgroup_list);
> +
> +		if (atomic_read(&rdtgrp->waitcount) != 0)
> +			rdtgrp->flags = RDT_DELETED;
> +		else
> +			rdtgroup_remove(rdtgrp);
> +	}
> +	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
> +	update_closid_rmid(cpu_online_mask, &rdtgroup_default);
> +
> +	kernfs_remove(kn_info);
> +	kernfs_remove(kn_mongrp);
> +	kernfs_remove(kn_mondata);
> +}
> +
> +static void rdt_kill_sb(struct super_block *sb)
> +{
> +	cpus_read_lock();
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdt_disable_ctx();
> +
> +	/* Put everything back to default values. */
> +	resctrl_arch_reset_resources();
> +
> +	rmdir_all_sub();
> +	rdt_pseudo_lock_release();
> +	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
> +	schemata_list_destroy();
> +	rdtgroup_destroy_root();
> +	if (resctrl_arch_alloc_capable())
> +		resctrl_arch_disable_alloc();
> +	if (resctrl_arch_mon_capable())
> +		resctrl_arch_disable_mon();
> +	resctrl_mounted = false;
> +	kernfs_kill_sb(sb);
> +	mutex_unlock(&rdtgroup_mutex);
> +	cpus_read_unlock();
> +}
> +
> +static struct file_system_type rdt_fs_type = {
> +	.name			= "resctrl",
> +	.init_fs_context	= rdt_init_fs_context,
> +	.parameters		= rdt_fs_parameters,
> +	.kill_sb		= rdt_kill_sb,
> +};
> +
> +static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
> +		       void *priv)
> +{
> +	struct kernfs_node *kn;
> +	int ret = 0;
> +
> +	kn = __kernfs_create_file(parent_kn, name, 0444,
> +				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
> +				  &kf_mondata_ops, priv, NULL, NULL);
> +	if (IS_ERR(kn))
> +		return PTR_ERR(kn);
> +
> +	ret = rdtgroup_kn_set_ugid(kn);
> +	if (ret) {
> +		kernfs_remove(kn);
> +		return ret;
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Remove all subdirectories of mon_data of ctrl_mon groups
> + * and monitor groups with the given domain id.
> + */
> +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
> +					   unsigned int dom_id)
> +{
> +	struct rdtgroup *prgrp, *crgrp;
> +	char name[32];
> +
> +	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
> +		sprintf(name, "mon_%s_%02d", r->name, dom_id);
> +		kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
> +
> +		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
> +			kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
> +	}
> +}
> +
> +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
> +				struct rdt_domain *d,
> +				struct rdt_resource *r, struct rdtgroup *prgrp)
> +{
> +	union mon_data_bits priv;
> +	struct kernfs_node *kn;
> +	struct mon_evt *mevt;
> +	struct rmid_read rr;
> +	char name[32];
> +	int ret;
> +
> +	sprintf(name, "mon_%s_%02d", r->name, d->id);
> +	/* create the directory */
> +	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
> +	if (IS_ERR(kn))
> +		return PTR_ERR(kn);
> +
> +	ret = rdtgroup_kn_set_ugid(kn);
> +	if (ret)
> +		goto out_destroy;
> +
> +	if (WARN_ON(list_empty(&r->evt_list))) {
> +		ret = -EPERM;
> +		goto out_destroy;
> +	}
> +
> +	priv.u.rid = r->rid;
> +	priv.u.domid = d->id;
> +	list_for_each_entry(mevt, &r->evt_list, list) {
> +		priv.u.evtid = mevt->evtid;
> +		ret = mon_addfile(kn, mevt->name, priv.priv);
> +		if (ret)
> +			goto out_destroy;
> +
> +		if (resctrl_is_mbm_event(mevt->evtid))
> +			mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
> +	}
> +	kernfs_activate(kn);
> +	return 0;
> +
> +out_destroy:
> +	kernfs_remove(kn);
> +	return ret;
> +}
> +
> +/*
> + * Add all subdirectories of mon_data for "ctrl_mon" groups
> + * and "monitor" groups with given domain id.
> + */
> +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
> +					   struct rdt_domain *d)
> +{
> +	struct kernfs_node *parent_kn;
> +	struct rdtgroup *prgrp, *crgrp;
> +	struct list_head *head;
> +
> +	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
> +		parent_kn = prgrp->mon.mon_data_kn;
> +		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
> +
> +		head = &prgrp->mon.crdtgrp_list;
> +		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
> +			parent_kn = crgrp->mon.mon_data_kn;
> +			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
> +		}
> +	}
> +}
> +
> +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
> +				       struct rdt_resource *r,
> +				       struct rdtgroup *prgrp)
> +{
> +	struct rdt_domain *dom;
> +	int ret;
> +
> +	/* Walking r->domains, ensure it can't race with cpuhp */
> +	lockdep_assert_cpus_held();
> +
> +	list_for_each_entry(dom, &r->domains, list) {
> +		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * This creates a directory mon_data which contains the monitored data.
> + *
> + * mon_data has one directory for each domain, named in the format
> + * mon_<domain_name>_<domain_id>. For example, mon_data for an L3
> + * domain looks as below:
> + * ./mon_data:
> + * mon_L3_00
> + * mon_L3_01
> + * mon_L3_02
> + * ...
> + *
> + * Each domain directory has one file per event:
> + * ./mon_L3_00/:
> + * llc_occupancy
> + *
> + */
> +static int mkdir_mondata_all(struct kernfs_node *parent_kn,
> +			     struct rdtgroup *prgrp,
> +			     struct kernfs_node **dest_kn)
> +{
> +	enum resctrl_res_level i;
> +	struct rdt_resource *r;
> +	struct kernfs_node *kn;
> +	int ret;
> +
> +	/*
> +	 * Create the mon_data directory first.
> +	 */
> +	ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
> +	if (ret)
> +		return ret;
> +
> +	if (dest_kn)
> +		*dest_kn = kn;
> +
> +	/*
> +	 * Create the subdirectories for each domain. Note that all events
> +	 * in a domain like L3 are grouped into a resource whose domain is L3
> +	 */
> +	for (i = 0; i < RDT_NUM_RESOURCES; i++) {
> +		r = resctrl_arch_get_resource(i);
> +		if (!r->mon_capable)
> +			continue;
> +
> +		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
> +		if (ret)
> +			goto out_destroy;
> +	}
> +
> +	return 0;
> +
> +out_destroy:
> +	kernfs_remove(kn);
> +	return ret;
> +}
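
Given the mon_<resource>_<domid> layout created above, reading a counter from user space is just a file read. A sketch for the LLC occupancy of L3 domain 0 in the default group; the paths assume the default mount point:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/resctrl/mon_data/mon_L3_00/llc_occupancy", "r");
	unsigned long long bytes;

	if (!f)
		return 1;
	if (fscanf(f, "%llu", &bytes) == 1)
		printf("llc_occupancy: %llu bytes\n", bytes);
	fclose(f);
	return 0;
}
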
> +
> +/**
> + * cbm_ensure_valid - Enforce validity on provided CBM
> + * @_val:	Candidate CBM
> + * @r:		RDT resource to which the CBM belongs
> + *
> + * The provided CBM represents all cache portions available for use. This
> + * may be represented by a bitmap that does not consist of contiguous ones
> + * and thus be an invalid CBM.
> + * Here the provided CBM is forced to be a valid CBM by only considering
> + * the first set of contiguous bits as valid and clearing all other bits.
> + * The intention here is to provide a valid default CBM with which a new
> + * resource group is initialized. The user can follow this with a
> + * modification to the CBM if the default does not satisfy the
> + * requirements.
> + */
> +static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
> +{
> +	unsigned int cbm_len = r->cache.cbm_len;
> +	unsigned long first_bit, zero_bit;
> +	unsigned long val = _val;
> +
> +	if (!val)
> +		return 0;
> +
> +	first_bit = find_first_bit(&val, cbm_len);
> +	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
> +
> +	/* Clear any remaining bits to ensure contiguous region */
> +	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
> +	return (u32)val;
> +}
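
The effect of cbm_ensure_valid() is easier to see with a worked example: only the lowest contiguous run of set bits survives, e.g. 0b0111010 becomes 0b0000010. A stand-alone user-space sketch of the same rule; cbm_first_run() is a hypothetical helper and cbm_len < 32 is assumed for simplicity:

#include <assert.h>
#include <stdint.h>

static uint32_t cbm_first_run(uint32_t val, unsigned int cbm_len)
{
	unsigned int first_bit, zero_bit;

	if (!val)
		return 0;

	first_bit = __builtin_ctz(val);		/* lowest set bit */
	for (zero_bit = first_bit; zero_bit < cbm_len; zero_bit++)
		if (!(val & (1u << zero_bit)))	/* first gap after the run */
			break;

	/* Clear everything from the gap upwards, keeping the first run. */
	return val & ((1u << zero_bit) - 1);
}

int main(void)
{
	assert(cbm_first_run(0x3a, 7) == 0x2);	/* 0b0111010 -> 0b0000010 */
	assert(cbm_first_run(0x0f, 7) == 0x0f);	/* already contiguous */
	return 0;
}
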
> +
> +/*
> + * Initialize cache resources per RDT domain
> + *
> + * Set the RDT domain up to start off with all usable allocations. That is,
> + * all shareable and unused bits. All-zero CBM is invalid.
> + */
> +static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
> +				 u32 closid)
> +{
> +	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
> +	enum resctrl_conf_type t = s->conf_type;
> +	struct resctrl_staged_config *cfg;
> +	struct rdt_resource *r = s->res;
> +	u32 used_b = 0, unused_b = 0;
> +	unsigned long tmp_cbm;
> +	enum rdtgrp_mode mode;
> +	u32 peer_ctl, ctrl_val;
> +	int i;
> +
> +	cfg = &d->staged_config[t];
> +	cfg->have_new_ctrl = false;
> +	cfg->new_ctrl = r->cache.shareable_bits;
> +	used_b = r->cache.shareable_bits;
> +	for (i = 0; i < closids_supported(); i++) {
> +		if (closid_allocated(i) && i != closid) {
> +			mode = rdtgroup_mode_by_closid(i);
> +			if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
> +				/*
> +				 * ctrl values for locksetup aren't relevant
> +				 * until the schemata is written, and the mode
> +				 * becomes RDT_MODE_PSEUDO_LOCKED.
> +				 */
> +				continue;
> +			/*
> +			 * If CDP is active include peer domain's
> +			 * usage to ensure there is no overlap
> +			 * with an exclusive group.
> +			 */
> +			if (resctrl_arch_get_cdp_enabled(r->rid))
> +				peer_ctl = resctrl_arch_get_config(r, d, i,
> +								   peer_type);
> +			else
> +				peer_ctl = 0;
> +			ctrl_val = resctrl_arch_get_config(r, d, i,
> +							   s->conf_type);
> +			used_b |= ctrl_val | peer_ctl;
> +			if (mode == RDT_MODE_SHAREABLE)
> +				cfg->new_ctrl |= ctrl_val | peer_ctl;
> +		}
> +	}
> +	if (d->plr && d->plr->cbm > 0)
> +		used_b |= d->plr->cbm;
> +	unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
> +	unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
> +	cfg->new_ctrl |= unused_b;
> +	/*
> +	 * Force the initial CBM to be valid, user can
> +	 * modify the CBM based on system availability.
> +	 */
> +	cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
> +	/*
> +	 * Assign the u32 CBM to an unsigned long to ensure that
> +	 * bitmap_weight() does not access out-of-bound memory.
> +	 */
> +	tmp_cbm = cfg->new_ctrl;
> +	if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
> +		rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
> +		return -ENOSPC;
> +	}
> +	cfg->have_new_ctrl = true;
> +
> +	return 0;
> +}
> +
> +/*
> + * Initialize cache resources with default values.
> + *
> + * A new RDT group is being created on an allocation capable (CAT)
> + * supporting system. Set this group up to start off with all usable
> + * allocations.
> + *
> + * If there are no more shareable bits available on any domain then
> + * the entire allocation will fail.
> + */
> +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
> +{
> +	struct rdt_domain *d;
> +	int ret;
> +
> +	list_for_each_entry(d, &s->res->domains, list) {
> +		ret = __init_one_rdt_domain(d, s, closid);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +/* Initialize MBA resource with default values. */
> +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
> +{
> +	struct resctrl_staged_config *cfg;
> +	struct rdt_domain *d;
> +
> +	list_for_each_entry(d, &r->domains, list) {
> +		if (is_mba_sc(r)) {
> +			d->mbps_val[closid] = MBA_MAX_MBPS;
> +			continue;
> +		}
> +
> +		cfg = &d->staged_config[CDP_NONE];
> +		cfg->new_ctrl = r->default_ctrl;
> +		cfg->have_new_ctrl = true;
> +	}
> +}
> +
> +/* Initialize the RDT group's allocations. */
> +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
> +{
> +	struct resctrl_schema *s;
> +	struct rdt_resource *r;
> +	int ret = 0;
> +
> +	rdt_staged_configs_clear();
> +
> +	list_for_each_entry(s, &resctrl_schema_all, list) {
> +		r = s->res;
> +		if (r->rid == RDT_RESOURCE_MBA ||
> +		    r->rid == RDT_RESOURCE_SMBA) {
> +			rdtgroup_init_mba(r, rdtgrp->closid);
> +			if (is_mba_sc(r))
> +				continue;
> +		} else {
> +			ret = rdtgroup_init_cat(s, rdtgrp->closid);
> +			if (ret < 0)
> +				goto out;
> +		}
> +
> +		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
> +		if (ret < 0) {
> +			rdt_last_cmd_puts("Failed to initialize allocations\n");
> +			goto out;
> +		}
> +
> +	}
> +
> +	rdtgrp->mode = RDT_MODE_SHAREABLE;
> +
> +out:
> +	rdt_staged_configs_clear();
> +	return ret;
> +}
> +
> +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
> +{
> +	int ret;
> +
> +	if (!resctrl_arch_mon_capable())
> +		return 0;
> +
> +	ret = alloc_rmid(rdtgrp->closid);
> +	if (ret < 0) {
> +		rdt_last_cmd_puts("Out of RMIDs\n");
> +		return ret;
> +	}
> +	rdtgrp->mon.rmid = ret;
> +
> +	ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
> +	if (ret) {
> +		rdt_last_cmd_puts("kernfs subdir error\n");
> +		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
> +{
> +	if (resctrl_arch_mon_capable())
> +		free_rmid(rgrp->closid, rgrp->mon.rmid);
> +}
> +
> +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
> +			     const char *name, umode_t mode,
> +			     enum rdt_group_type rtype, struct rdtgroup **r)
> +{
> +	struct rdtgroup *prdtgrp, *rdtgrp;
> +	unsigned long files = 0;
> +	struct kernfs_node *kn;
> +	int ret;
> +
> +	prdtgrp = rdtgroup_kn_lock_live(parent_kn);
> +	if (!prdtgrp) {
> +		ret = -ENODEV;
> +		goto out_unlock;
> +	}
> +
> +	if (rtype == RDTMON_GROUP &&
> +	    (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> +	     prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
> +		ret = -EINVAL;
> +		rdt_last_cmd_puts("Pseudo-locking in progress\n");
> +		goto out_unlock;
> +	}
> +
> +	/* allocate the rdtgroup. */
> +	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
> +	if (!rdtgrp) {
> +		ret = -ENOSPC;
> +		rdt_last_cmd_puts("Kernel out of memory\n");
> +		goto out_unlock;
> +	}
> +	*r = rdtgrp;
> +	rdtgrp->mon.parent = prdtgrp;
> +	rdtgrp->type = rtype;
> +	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
> +
> +	/* kernfs creates the directory for rdtgrp */
> +	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
> +	if (IS_ERR(kn)) {
> +		ret = PTR_ERR(kn);
> +		rdt_last_cmd_puts("kernfs create error\n");
> +		goto out_free_rgrp;
> +	}
> +	rdtgrp->kn = kn;
> +
> +	/*
> +	 * kernfs_remove() will drop the reference count on "kn" which
> +	 * will free it. But we still need it to stick around for the
> +	 * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
> +	 * which will be dropped by kernfs_put() in rdtgroup_remove().
> +	 */
> +	kernfs_get(kn);
> +
> +	ret = rdtgroup_kn_set_ugid(kn);
> +	if (ret) {
> +		rdt_last_cmd_puts("kernfs perm error\n");
> +		goto out_destroy;
> +	}
> +
> +	if (rtype == RDTCTRL_GROUP) {
> +		files = RFTYPE_BASE | RFTYPE_CTRL;
> +		if (resctrl_arch_mon_capable())
> +			files |= RFTYPE_MON;
> +	} else {
> +		files = RFTYPE_BASE | RFTYPE_MON;
> +	}
> +
> +	ret = rdtgroup_add_files(kn, files);
> +	if (ret) {
> +		rdt_last_cmd_puts("kernfs fill error\n");
> +		goto out_destroy;
> +	}
> +
> +	/*
> +	 * The caller unlocks the parent_kn upon success.
> +	 */
> +	return 0;
> +
> +out_destroy:
> +	kernfs_put(rdtgrp->kn);
> +	kernfs_remove(rdtgrp->kn);
> +out_free_rgrp:
> +	kfree(rdtgrp);
> +out_unlock:
> +	rdtgroup_kn_unlock(parent_kn);
> +	return ret;
> +}
> +
> +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
> +{
> +	kernfs_remove(rgrp->kn);
> +	rdtgroup_remove(rgrp);
> +}
> +
> +/*
> + * Create a monitor group under the "mon_groups" directory of a control
> + * and monitor group (ctrl_mon). This is a resource group used to
> + * monitor a subset of tasks and CPUs in its parent ctrl_mon group.
> + */
> +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
> +			      const char *name, umode_t mode)
> +{
> +	struct rdtgroup *rdtgrp, *prgrp;
> +	int ret;
> +
> +	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
> +	if (ret)
> +		return ret;
> +
> +	prgrp = rdtgrp->mon.parent;
> +	rdtgrp->closid = prgrp->closid;
> +
> +	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
> +	if (ret) {
> +		mkdir_rdt_prepare_clean(rdtgrp);
> +		goto out_unlock;
> +	}
> +
> +	kernfs_activate(rdtgrp->kn);
> +
> +	/*
> +	 * Add the rdtgrp to the list of rdtgrps the parent
> +	 * ctrl_mon group has to track.
> +	 */
> +	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
> +
> +out_unlock:
> +	rdtgroup_kn_unlock(parent_kn);
> +	return ret;
> +}
> +
> +/*
> + * These are rdtgroups created under the root directory. They can be
> + * used to allocate and monitor resources.
> + */
> +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
> +				   const char *name, umode_t mode)
> +{
> +	struct rdtgroup *rdtgrp;
> +	struct kernfs_node *kn;
> +	u32 closid;
> +	int ret;
> +
> +	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
> +	if (ret)
> +		return ret;
> +
> +	kn = rdtgrp->kn;
> +	ret = closid_alloc();
> +	if (ret < 0) {
> +		rdt_last_cmd_puts("Out of CLOSIDs\n");
> +		goto out_common_fail;
> +	}
> +	closid = ret;
> +	ret = 0;
> +
> +	rdtgrp->closid = closid;
> +
> +	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
> +	if (ret)
> +		goto out_closid_free;
> +
> +	kernfs_activate(rdtgrp->kn);
> +
> +	ret = rdtgroup_init_alloc(rdtgrp);
> +	if (ret < 0)
> +		goto out_rmid_free;
> +
> +	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
> +
> +	if (resctrl_arch_mon_capable()) {
> +		/*
> +		 * Create an empty mon_groups directory to hold the subset
> +		 * of tasks and cpus to monitor.
> +		 */
> +		ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
> +		if (ret) {
> +			rdt_last_cmd_puts("kernfs subdir error\n");
> +			goto out_del_list;
> +		}
> +	}
> +
> +	goto out_unlock;
> +
> +out_del_list:
> +	list_del(&rdtgrp->rdtgroup_list);
> +out_rmid_free:
> +	mkdir_rdt_prepare_rmid_free(rdtgrp);
> +out_closid_free:
> +	closid_free(closid);
> +out_common_fail:
> +	mkdir_rdt_prepare_clean(rdtgrp);
> +out_unlock:
> +	rdtgroup_kn_unlock(parent_kn);
> +	return ret;
> +}
> +
> +/*
> + * We allow creating mon groups only within a directory called "mon_groups"
> + * which is present in every ctrl_mon group. Check if this is a valid
> + * "mon_groups" directory.
> + *
> + * 1. The directory should be named "mon_groups".
> + * 2. The mon group itself should "not" be named "mon_groups".
> + *   This makes sure "mon_groups" directory always has a ctrl_mon group
> + *   as parent.
> + */
> +static bool is_mon_groups(struct kernfs_node *kn, const char *name)
> +{
> +	return (!strcmp(kn->name, "mon_groups") &&
> +		strcmp(name, "mon_groups"));
> +}
> +
> +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
> +			  umode_t mode)
> +{
> +	/* Do not accept '\n' as it would make the name unparsable. */
> +	if (strchr(name, '\n'))
> +		return -EINVAL;
> +
> +	/*
> +	 * If the parent directory is the root directory and RDT
> +	 * allocation is supported, add a control and monitoring
> +	 * subdirectory
> +	 */
> +	if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
> +		return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
> +
> +	/*
> +	 * If RDT monitoring is supported and the parent directory is a valid
> +	 * "mon_groups" directory, add a monitoring subdirectory.
> +	 */
> +	if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
> +		return rdtgroup_mkdir_mon(parent_kn, name, mode);
> +
> +	return -EPERM;
> +}
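
So from user space, creating groups is just mkdir(2) at the right level: CTRL_MON groups at the resctrl root, MON groups under an existing group's mon_groups directory. A sketch; group names are examples and the default mount point is assumed:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* CTRL_MON group at the root. */
	if (mkdir("/sys/fs/resctrl/grp1", 0755))
		perror("mkdir grp1");
	/* MON group under the new group's mon_groups directory. */
	if (mkdir("/sys/fs/resctrl/grp1/mon_groups/mon1", 0755))
		perror("mkdir mon1");
	return 0;
}
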
> +
> +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> +{
> +	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
> +	u32 closid, rmid;
> +	int cpu;
> +
> +	/* Give any tasks back to the parent group */
> +	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
> +
> +	/* Update per cpu rmid of the moved CPUs first */
> +	closid = rdtgrp->closid;
> +	rmid = prdtgrp->mon.rmid;
> +	for_each_cpu(cpu, &rdtgrp->cpu_mask)
> +		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
> +
> +	/*
> +	 * Update the MSR on moved CPUs and CPUs which have moved
> +	 * Update the MSR on moved CPUs and CPUs which have a moved
> +	 * task running on them.
> +	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
> +	update_closid_rmid(tmpmask, NULL);
> +
> +	rdtgrp->flags = RDT_DELETED;
> +	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> +
> +	/*
> +	 * Remove the rdtgrp from the parent ctrl_mon group's list
> +	 */
> +	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
> +	list_del(&rdtgrp->mon.crdtgrp_list);
> +
> +	kernfs_remove(rdtgrp->kn);
> +
> +	return 0;
> +}
> +
> +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
> +{
> +	rdtgrp->flags = RDT_DELETED;
> +	list_del(&rdtgrp->rdtgroup_list);
> +
> +	kernfs_remove(rdtgrp->kn);
> +	return 0;
> +}
> +
> +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
> +{
> +	u32 closid, rmid;
> +	int cpu;
> +
> +	/* Give any tasks back to the default group */
> +	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
> +
> +	/* Give any CPUs back to the default group */
> +	cpumask_or(&rdtgroup_default.cpu_mask,
> +		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
> +
> +	/* Update per cpu closid and rmid of the moved CPUs first */
> +	closid = rdtgroup_default.closid;
> +	rmid = rdtgroup_default.mon.rmid;
> +	for_each_cpu(cpu, &rdtgrp->cpu_mask)
> +		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
> +
> +	/*
> +	 * Update the MSR on moved CPUs and CPUs which have a moved
> +	 * task running on them.
> +	 */
> +	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
> +	update_closid_rmid(tmpmask, NULL);
> +
> +	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
> +	closid_free(rdtgrp->closid);
> +
> +	rdtgroup_ctrl_remove(rdtgrp);
> +
> +	/*
> +	 * Free all the child monitor group rmids.
> +	 */
> +	free_all_child_rdtgrp(rdtgrp);
> +
> +	return 0;
> +}
> +
> +static int rdtgroup_rmdir(struct kernfs_node *kn)
> +{
> +	struct kernfs_node *parent_kn = kn->parent;
> +	struct rdtgroup *rdtgrp;
> +	cpumask_var_t tmpmask;
> +	int ret = 0;
> +
> +	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	rdtgrp = rdtgroup_kn_lock_live(kn);
> +	if (!rdtgrp) {
> +		ret = -EPERM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * If the rdtgroup is a ctrl_mon group and parent directory
> +	 * is the root directory, remove the ctrl_mon group.
> +	 *
> +	 * If the rdtgroup is a mon group and parent directory
> +	 * is a valid "mon_groups" directory, remove the mon group.
> +	 */
> +	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
> +	    rdtgrp != &rdtgroup_default) {
> +		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
> +		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> +			ret = rdtgroup_ctrl_remove(rdtgrp);
> +		} else {
> +			ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
> +		}
> +	} else if (rdtgrp->type == RDTMON_GROUP &&
> +		 is_mon_groups(parent_kn, kn->name)) {
> +		ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
> +	} else {
> +		ret = -EPERM;
> +	}
> +
> +out:
> +	rdtgroup_kn_unlock(kn);
> +	free_cpumask_var(tmpmask);
> +	return ret;
> +}
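
The teardown path from user space is plain rmdir(2); removing a CTRL_MON group also releases its remaining child MON groups via free_all_child_rdtgrp() above. Sketch (the group name is an example):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Removing the parent also cleans up any remaining MON children. */
	if (rmdir("/sys/fs/resctrl/grp1"))
		perror("rmdir grp1");
	return 0;
}
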
> +
> +/**
> + * mongrp_reparent() - replace parent CTRL_MON group of a MON group
> + * @rdtgrp:		the MON group whose parent should be replaced
> + * @new_prdtgrp:	replacement parent CTRL_MON group for @rdtgrp
> + * @cpus:		cpumask provided by the caller for use during this call
> + *
> + * Replaces the parent CTRL_MON group for a MON group, resulting in all member
> + * tasks' CLOSID immediately changing to that of the new parent group.
> + * Monitoring data for the group is unaffected by this operation.
> + */
> +static void mongrp_reparent(struct rdtgroup *rdtgrp,
> +			    struct rdtgroup *new_prdtgrp,
> +			    cpumask_var_t cpus)
> +{
> +	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
> +
> +	WARN_ON(rdtgrp->type != RDTMON_GROUP);
> +	WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
> +
> +	/* Nothing to do when simply renaming a MON group. */
> +	if (prdtgrp == new_prdtgrp)
> +		return;
> +
> +	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
> +	list_move_tail(&rdtgrp->mon.crdtgrp_list,
> +		       &new_prdtgrp->mon.crdtgrp_list);
> +
> +	rdtgrp->mon.parent = new_prdtgrp;
> +	rdtgrp->closid = new_prdtgrp->closid;
> +
> +	/* Propagate updated closid to all tasks in this group. */
> +	rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
> +
> +	update_closid_rmid(cpus, NULL);
> +}
> +
> +static int rdtgroup_rename(struct kernfs_node *kn,
> +			   struct kernfs_node *new_parent, const char *new_name)
> +{
> +	struct rdtgroup *new_prdtgrp;
> +	struct rdtgroup *rdtgrp;
> +	cpumask_var_t tmpmask;
> +	int ret;
> +
> +	rdtgrp = kernfs_to_rdtgroup(kn);
> +	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
> +	if (!rdtgrp || !new_prdtgrp)
> +		return -ENOENT;
> +
> +	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
> +	rdtgroup_kn_get(rdtgrp, kn);
> +	rdtgroup_kn_get(new_prdtgrp, new_parent);
> +
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdt_last_cmd_clear();
> +
> +	/*
> +	 * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
> +	 * either kernfs_node is a file.
> +	 */
> +	if (kernfs_type(kn) != KERNFS_DIR ||
> +	    kernfs_type(new_parent) != KERNFS_DIR) {
> +		rdt_last_cmd_puts("Source and destination must be directories\n");
> +		ret = -EPERM;
> +		goto out;
> +	}
> +
> +	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
> +	    !is_mon_groups(kn->parent, kn->name)) {
> +		rdt_last_cmd_puts("Source must be a MON group\n");
> +		ret = -EPERM;
> +		goto out;
> +	}
> +
> +	if (!is_mon_groups(new_parent, new_name)) {
> +		rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
> +		ret = -EPERM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * If the MON group is monitoring CPUs, the CPUs must be assigned to the
> +	 * current parent CTRL_MON group and therefore cannot be assigned to
> +	 * the new parent, making the move illegal.
> +	 */
> +	if (!cpumask_empty(&rdtgrp->cpu_mask) &&
> +	    rdtgrp->mon.parent != new_prdtgrp) {
> +		rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
> +		ret = -EPERM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Allocate the cpumask for use in mongrp_reparent() to avoid the
> +	 * possibility of failing to allocate it after kernfs_rename() has
> +	 * succeeded.
> +	 */
> +	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Perform all input validation and allocations needed to ensure
> +	 * mongrp_reparent() will succeed before calling kernfs_rename(),
> +	 * otherwise it would be necessary to revert this call if
> +	 * mongrp_reparent() failed.
> +	 */
> +	ret = kernfs_rename(kn, new_parent, new_name);
> +	if (!ret)
> +		mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
> +
> +	free_cpumask_var(tmpmask);
> +
> +out:
> +	mutex_unlock(&rdtgroup_mutex);
> +	rdtgroup_kn_put(rdtgrp, kn);
> +	rdtgroup_kn_put(new_prdtgrp, new_parent);
> +	return ret;
> +}
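
In user-space terms the re-parenting above is driven by rename(2)/mv between two mon_groups directories, and is refused if the MON group has CPUs assigned. Sketch (group names are examples):

#include <stdio.h>

int main(void)
{
	if (rename("/sys/fs/resctrl/grp1/mon_groups/mon1",
		   "/sys/fs/resctrl/grp2/mon_groups/mon1"))
		perror("rename");	/* info/last_cmd_status explains why */
	return 0;
}
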
> +
> +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
> +{
> +	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
> +		seq_puts(seq, ",cdp");
> +
> +	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
> +		seq_puts(seq, ",cdpl2");
> +
> +	if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
> +		seq_puts(seq, ",mba_MBps");
> +
> +	if (resctrl_debug)
> +		seq_puts(seq, ",debug");
> +
> +	return 0;
> +}
> +
> +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
> +	.mkdir		= rdtgroup_mkdir,
> +	.rmdir		= rdtgroup_rmdir,
> +	.rename		= rdtgroup_rename,
> +	.show_options	= rdtgroup_show_options,
> +};
> +
> +static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
> +{
> +	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
> +				      KERNFS_ROOT_CREATE_DEACTIVATED |
> +				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
> +				      &rdtgroup_default);
> +	if (IS_ERR(rdt_root))
> +		return PTR_ERR(rdt_root);
> +
> +	ctx->kfc.root = rdt_root;
> +	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
> +
> +	return 0;
> +}
> +
> +static void rdtgroup_destroy_root(void)
> +{
> +	kernfs_destroy_root(rdt_root);
> +	rdtgroup_default.kn = NULL;
> +}
> +
> +static void rdtgroup_setup_default(void)
> +{
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
> +	rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
> +	rdtgroup_default.type = RDTCTRL_GROUP;
> +	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
> +
> +	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
> +
> +	mutex_unlock(&rdtgroup_mutex);
> +}
> +
> +static void domain_destroy_mon_state(struct rdt_domain *d)
> +{
> +	bitmap_free(d->rmid_busy_llc);
> +	kfree(d->mbm_total);
> +	kfree(d->mbm_local);
> +}
> +
> +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
> +{
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
> +		mba_sc_domain_destroy(r, d);
> +
> +	if (!r->mon_capable)
> +		goto out_unlock;
> +
> +	/*
> +	 * If resctrl is mounted, remove all the
> +	 * per domain monitor data directories.
> +	 */
> +	if (resctrl_mounted && resctrl_arch_mon_capable())
> +		rmdir_mondata_subdir_allrdtgrp(r, d->id);
> +
> +	if (resctrl_is_mbm_enabled())
> +		cancel_delayed_work(&d->mbm_over);
> +	if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
> +		/*
> +		 * When a package is going down, forcefully
> +		 * decrement rmid->ebusy. There is no way to know
> +		 * that the L3 was flushed and hence may lead to
> +		 * incorrect counts in rare scenarios, but leaving
> +		 * the RMID as busy creates RMID leaks if the
> +		 * package never comes back.
> +		 */
> +		__check_limbo(d, true);
> +		cancel_delayed_work(&d->cqm_limbo);
> +	}
> +
> +	domain_destroy_mon_state(d);
> +
> +out_unlock:
> +	mutex_unlock(&rdtgroup_mutex);
> +}
> +
> +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
> +{
> +	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> +	size_t tsize;
> +
> +	if (resctrl_arch_is_llc_occupancy_enabled()) {
> +		d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
> +		if (!d->rmid_busy_llc)
> +			return -ENOMEM;
> +	}
> +	if (resctrl_arch_is_mbm_total_enabled()) {
> +		tsize = sizeof(*d->mbm_total);
> +		d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
> +		if (!d->mbm_total) {
> +			bitmap_free(d->rmid_busy_llc);
> +			return -ENOMEM;
> +		}
> +	}
> +	if (resctrl_arch_is_mbm_local_enabled()) {
> +		tsize = sizeof(*d->mbm_local);
> +		d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
> +		if (!d->mbm_local) {
> +			bitmap_free(d->rmid_busy_llc);
> +			kfree(d->mbm_total);
> +			return -ENOMEM;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
> +{
> +	int err = 0;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
> +		/* RDT_RESOURCE_MBA is never mon_capable */
> +		err = mba_sc_domain_allocate(r, d);
> +		goto out_unlock;
> +	}
> +
> +	if (!r->mon_capable)
> +		goto out_unlock;
> +
> +	err = domain_setup_mon_state(r, d);
> +	if (err)
> +		goto out_unlock;
> +
> +	if (resctrl_is_mbm_enabled()) {
> +		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
> +		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
> +					   RESCTRL_PICK_ANY_CPU);
> +	}
> +
> +	if (resctrl_arch_is_llc_occupancy_enabled())
> +		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
> +
> +	/*
> +	 * If the filesystem is not mounted then only the default resource group
> +	 * exists. Creation of its directories is deferred until mount time
> +	 * by rdt_get_tree() calling mkdir_mondata_all().
> +	 * If resctrl is mounted, add per domain monitor data directories.
> +	 */
> +	if (resctrl_mounted && resctrl_arch_mon_capable())
> +		mkdir_mondata_subdir_allrdtgrp(r, d);
> +
> +out_unlock:
> +	mutex_unlock(&rdtgroup_mutex);
> +
> +	return err;
> +}
> +
> +void resctrl_online_cpu(unsigned int cpu)
> +{
> +	mutex_lock(&rdtgroup_mutex);
> +	/* The CPU is added to the default rdtgroup when it comes online. */
> +	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
> +	mutex_unlock(&rdtgroup_mutex);
> +}
> +
> +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
> +{
> +	struct rdtgroup *cr;
> +
> +	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
> +		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
> +			break;
> +	}
> +}
> +
> +void resctrl_offline_cpu(unsigned int cpu)
> +{
> +	struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
> +	struct rdtgroup *rdtgrp;
> +	struct rdt_domain *d;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
> +		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
> +			clear_childcpus(rdtgrp, cpu);
> +			break;
> +		}
> +	}
> +
> +	if (!l3->mon_capable)
> +		goto out_unlock;
> +
> +	d = resctrl_get_domain_from_cpu(cpu, l3);
> +	if (d) {
> +		if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
> +			cancel_delayed_work(&d->mbm_over);
> +			mbm_setup_overflow_handler(d, 0, cpu);
> +		}
> +		if (resctrl_arch_is_llc_occupancy_enabled() &&
> +		    cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
> +			cancel_delayed_work(&d->cqm_limbo);
> +			cqm_setup_limbo_handler(d, 0, cpu);
> +		}
> +	}
> +
> +out_unlock:
> +	mutex_unlock(&rdtgroup_mutex);
> +}
> +
> +/*
> + * resctrl_init - resctrl filesystem initialization
> + *
> + * Setup resctrl file system including set up root, create mount point,
> + * register resctrl filesystem, and initialize files under root directory.
> + *
> + * Return: 0 on success or -errno
> + */
> +int resctrl_init(void)
> +{
> +	int ret = 0;
> +
> +	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
> +		     sizeof(last_cmd_status_buf));
> +
> +	rdtgroup_setup_default();
> +
> +	thread_throttle_mode_init();
> +
> +	ret = resctrl_mon_resource_init();
> +	if (ret)
> +		return ret;
> +
> +	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
> +	if (ret)
> +		return ret;
> +
> +	ret = register_filesystem(&rdt_fs_type);
> +	if (ret)
> +		goto cleanup_mountpoint;
> +
> +	/*
> +	 * Adding the resctrl debugfs directory here may not be ideal since
> +	 * it would let the resctrl debugfs directory appear on the debugfs
> +	 * filesystem before the resctrl filesystem is mounted.
> +	 * It may also be ok since that would enable debugging of RDT before
> +	 * resctrl is mounted.
> +	 * The reason why the debugfs directory is created here and not in
> +	 * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
> +	 * during the debugfs directory creation also &sb->s_type->i_mutex_key
> +	 * (the lockdep class of inode->i_rwsem). Other filesystem
> +	 * interactions (eg. SyS_getdents) have the lock ordering:
> +	 * &sb->s_type->i_mutex_key --> &mm->mmap_lock
> +	 * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
> +	 * is taken, thus creating dependency:
> +	 * &mm->mmap_lock --> rdtgroup_mutex, which together with the other
> +	 * two lock dependencies could lead to a deadlock.
> +	 * Creating the debugfs directory here avoids that dependency
> +	 * (file operations cannot occur until the filesystem is mounted,
> +	 * but there is no way to tell lockdep that).
> +	 */
> +	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
> +
> +	return 0;
> +
> +cleanup_mountpoint:
> +	sysfs_remove_mount_point(fs_kobj, "resctrl");
> +
> +	return ret;
> +}
> +
> +void resctrl_exit(void)
> +{
> +	debugfs_remove_recursive(debugfs_resctrl);
> +	unregister_filesystem(&rdt_fs_type);
> +	sysfs_remove_mount_point(fs_kobj, "resctrl");
> +
> +	resctrl_mon_resource_exit();
> +}

-- 
Thanks
Babu Moger
