[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240103224209.2541-2-gregory.price@memverge.com>
Date: Wed, 3 Jan 2024 17:41:58 -0500
From: Gregory Price <gourry.memverge@...il.com>
To: linux-mm@...ck.org
Cc: linux-doc@...r.kernel.org,
linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org,
linux-api@...r.kernel.org,
linux-arch@...r.kernel.org,
akpm@...ux-foundation.org,
arnd@...db.de,
tglx@...utronix.de,
luto@...nel.org,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
hpa@...or.com,
mhocko@...nel.org,
tj@...nel.org,
ying.huang@...el.com,
gregory.price@...verge.com,
corbet@....net,
rakie.kim@...com,
hyeongtak.ji@...com,
honggyu.kim@...com,
vtavarespetr@...ron.com,
peterz@...radead.org,
jgroves@...ron.com,
ravis.opensrc@...ron.com,
sthanneeru@...ron.com,
emirakhur@...ron.com,
Hasan.Maruf@....com,
seungjun.ha@...sung.com
Subject: [PATCH v6 01/12] mm/mempolicy: implement the sysfs-based weighted_interleave interface
From: Rakie Kim <rakie.kim@...com>
This patch provides a way to set interleave weight information under
sysfs at /sys/kernel/mm/mempolicy/weighted_interleave/nodeN
The sysfs structure is designed as follows.
$ tree /sys/kernel/mm/mempolicy/
/sys/kernel/mm/mempolicy/ [1]
└── weighted_interleave [2]
├── node0 [3]
└── node1
Each file above can be explained as follows.
[1] mm/mempolicy: configuration interface for mempolicy subsystem
[2] weighted_interleave/: config interface for weighted interleave policy
[3] weighted_interleave/nodeN: weight for nodeN
Internally, there is a secondary table `default_iw_table`, which holds
kernel-internal default interleave weights for each possible node.
If the value for a node is set to `0`, the default value will be used.
If sysfs is disabled in the config, interleave weights will default
to use `default_iw_table`.
Suggested-by: Huang Ying <ying.huang@...el.com>
Signed-off-by: Rakie Kim <rakie.kim@...com>
Signed-off-by: Honggyu Kim <honggyu.kim@...com>
Co-developed-by: Gregory Price <gregory.price@...verge.com>
Signed-off-by: Gregory Price <gregory.price@...verge.com>
Co-developed-by: Hyeongtak Ji <hyeongtak.ji@...com>
Signed-off-by: Hyeongtak Ji <hyeongtak.ji@...com>
---
.../ABI/testing/sysfs-kernel-mm-mempolicy | 4 +
...fs-kernel-mm-mempolicy-weighted-interleave | 26 +++
mm/mempolicy.c | 178 ++++++++++++++++++
3 files changed, 208 insertions(+)
create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
new file mode 100644
index 000000000000..2dcf24f4384a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy
@@ -0,0 +1,4 @@
+What: /sys/kernel/mm/mempolicy/
+Date: December 2023
+Contact: Linux memory management mailing list <linux-mm@...ck.org>
+Description: Interface for Mempolicy
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
new file mode 100644
index 000000000000..e6a38139bf0f
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-mempolicy-weighted-interleave
@@ -0,0 +1,26 @@
+What: /sys/kernel/mm/mempolicy/weighted_interleave/
+Date: December 2023
+Contact: Linux memory management mailing list <linux-mm@...ck.org>
+Description: Configuration Interface for the Weighted Interleave policy
+
+What: /sys/kernel/mm/mempolicy/weighted_interleave/nodeN
+Date: December 2023
+Contact: Linux memory management mailing list <linux-mm@...ck.org>
+Description: Weight configuration interface for nodeN
+
+ The interleave weight for a memory node (N). These weights are
+ utilized by processes which have set their mempolicy to
+ MPOL_WEIGHTED_INTERLEAVE and have opted into global weights by
+ omitting a task-local weight array.
+
+ These weights only affect new allocations, and changes at runtime
+ will not cause migrations on already allocated pages.
+
+ The minimum weight for a node is always 1.
+
+ Minimum weight: 1
+ Maximum weight: 255
+
+ Writing an empty string or `0` will reset the weight to the
+ system default. The system default may be set by the kernel
+ or drivers at boot or during hotplug events.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 10a590ee1c89..30da1a1be707 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -131,6 +131,23 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+/*
+ * default_iw_table is the kernel-internal default value interleave
+ * weight table. It is to be set by driver code capable of reading
+ * HMAT/CDAT information, and to provide mempolicy a sane set of
+ * default weight values for WEIGHTED_INTERLEAVE mode.
+ *
+ * By default, prior to HMAT/CDAT information being consumed, the
+ * default weight of all nodes is 1. The default weight of any
+ * node can only be in the range 1-255. A 0-weight is not allowed.
+ */
+static u8 default_iw_table[MAX_NUMNODES];
+/*
+ * iw_table is the sysfs-set interleave weight table, a value of 0
+ * denotes that the default_iw_table value should be used.
+ */
+static u8 iw_table[MAX_NUMNODES];
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -3067,3 +3084,164 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
+
+#ifdef CONFIG_SYSFS
+struct iw_node_attr {
+ struct kobj_attribute kobj_attr;
+ int nid;
+};
+
+static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct iw_node_attr *node_attr;
+ u8 weight;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ weight = iw_table[node_attr->nid];
+ if (!weight)
+ weight = default_iw_table[node_attr->nid];
+ return sysfs_emit(buf, "%d\n", weight);
+}
+
+static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct iw_node_attr *node_attr;
+ u8 weight = 0;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+ /* If no input, revert to default weight */
+ if (count == 0 || sysfs_streq(buf, ""))
+ weight = 0;
+ else if (kstrtou8(buf, 0, &weight))
+ return -EINVAL;
+
+ iw_table[node_attr->nid] = weight;
+ return count;
+}
+
+static struct iw_node_attr *node_attrs[MAX_NUMNODES];
+
+static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
+ struct kobject *parent)
+{
+ if (!node_attr)
+ return;
+ sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+}
+
+static void sysfs_mempolicy_release(struct kobject *mempolicy_kobj)
+{
+ int i;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ sysfs_wi_node_release(node_attrs[i], mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
+}
+
+static const struct kobj_type mempolicy_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = sysfs_mempolicy_release,
+};
+
+static int add_weight_node(int nid, struct kobject *wi_kobj)
+{
+ struct iw_node_attr *node_attr;
+ char *name;
+
+ node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
+ if (!node_attr)
+ return -ENOMEM;
+
+ name = kasprintf(GFP_KERNEL, "node%d", nid);
+ if (!name) {
+ kfree(node_attr);
+ return -ENOMEM;
+ }
+
+ sysfs_attr_init(&node_attr->kobj_attr.attr);
+ node_attr->kobj_attr.attr.name = name;
+ node_attr->kobj_attr.attr.mode = 0644;
+ node_attr->kobj_attr.show = node_show;
+ node_attr->kobj_attr.store = node_store;
+ node_attr->nid = nid;
+
+ if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
+ kfree(node_attr->kobj_attr.attr.name);
+ kfree(node_attr);
+ pr_err("failed to add attribute to weighted_interleave\n");
+ return -ENOMEM;
+ }
+
+ node_attrs[nid] = node_attr;
+ return 0;
+}
+
+static int add_weighted_interleave_group(struct kobject *root_kobj)
+{
+ struct kobject *wi_kobj;
+ int nid, err;
+
+ wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (!wi_kobj)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(wi_kobj, &mempolicy_ktype, root_kobj,
+ "weighted_interleave");
+ if (err) {
+ kfree(wi_kobj);
+ return err;
+ }
+
+ memset(node_attrs, 0, sizeof(node_attrs));
+ for_each_node_state(nid, N_POSSIBLE) {
+ err = add_weight_node(nid, wi_kobj);
+ if (err) {
+ pr_err("failed to add sysfs [node%d]\n", nid);
+ break;
+ }
+ }
+ if (err)
+ kobject_put(wi_kobj);
+ return 0;
+}
+
+static int __init mempolicy_sysfs_init(void)
+{
+ int err;
+ struct kobject *root_kobj;
+
+ memset(&default_iw_table, 1, sizeof(default_iw_table));
+ memset(&iw_table, 0, sizeof(iw_table));
+
+ root_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!root_kobj) {
+ pr_err("failed to add mempolicy kobject to the system\n");
+ return -ENOMEM;
+ }
+
+ err = add_weighted_interleave_group(root_kobj);
+
+ if (err)
+ kobject_put(root_kobj);
+ return err;
+
+}
+#else
+static int __init mempolicy_sysfs_init(void)
+{
+ /*
+ * if sysfs is not enabled MPOL_WEIGHTED_INTERLEAVE defaults to
+ * MPOL_INTERLEAVE behavior, but is still defined separately to
+ * allow task-local weighted interleave and system-defaults to
+ * operate as intended.
+ */
+ memset(&default_iw_table, 1, sizeof(default_iw_table));
+ memset(&iw_table, 0, sizeof(iw_table));
+ return 0;
+}
+#endif /* CONFIG_SYSFS */
+late_initcall(mempolicy_sysfs_init);
--
2.39.1
Powered by blists - more mailing lists