Message-ID: <d1c30b1f-1628-4092-acee-2b0980a3d888@micron.com>
Date: Tue, 31 Oct 2023 23:53:15 +0530
From: Srinivasulu Thanneeru <sthanneeru.opensrc@...ron.com>
To: Gregory Price <gourry.memverge@...il.com>,
<linux-kernel@...r.kernel.org>
CC: <linux-cxl@...r.kernel.org>, <linux-mm@...ck.org>,
<ying.huang@...el.com>, <akpm@...ux-foundation.org>,
<aneesh.kumar@...ux.ibm.com>, <weixugc@...gle.com>,
<apopple@...dia.com>, <hannes@...xchg.org>, <tim.c.chen@...el.com>,
<dave.hansen@...el.com>, <mhocko@...nel.org>,
<shy828301@...il.com>, <gregkh@...uxfoundation.org>,
<rafael@...nel.org>, Gregory Price <gregory.price@...verge.com>
Subject: Re: [EXT] [RFC PATCH v3 4/4] mm/mempolicy: modify interleave
mempolicy to use node weights
On 10/31/2023 6:08 AM, Gregory Price wrote:
>
>
> The node subsystem implements interleave weighting for the purpose
> of bandwidth optimization. Each node may have different weights in
> relation to each compute node ("access node").
>
> The mempolicy MPOL_INTERLEAVE utilizes the node weights to implement
> weighted interleave. By default, since all nodes default to a weight
> of 1, the original interleave behavior is retained.
>
> Examples
>
> Weight settings:
> echo 4 > node0/access0/il_weight
> echo 1 > node0/access1/il_weight
>
> echo 3 > node1/access0/il_weight
> echo 2 > node1/access1/il_weight
>
> Results:
>
> Task A:
> cpunode: 0
> nodemask: [0,1]
> weights: [4,3]
> allocation result: [0,0,0,0,1,1,1 repeat]
>
> Task B:
> cpunode: 1
> nodemask: [0,1]
> weights: [1,2]
> allocation result: [0,1,1 repeat]
> Weights are relative to the access node.
>
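
To make the example above concrete, here is a minimal userspace sketch
(not the kernel code; the weights are assumed from the Task A example in
the changelog) that expands the per-node weights into the allocation order:

/*
 * Minimal userspace sketch (not kernel code): expand the assumed
 * weights {4, 3} for nodes {0, 1} into the round-robin order the
 * Task A example above shows.
 */
#include <stdio.h>

int main(void)
{
    const unsigned char weights[] = { 4, 3 };   /* node 0, node 1 */
    const int nnodes = 2;

    for (int round = 0; round < 2; round++)
        for (int node = 0; node < nnodes; node++)
            for (int i = 0; i < weights[node]; i++)
                printf("%d ", node);
    printf("\n");   /* 0 0 0 0 1 1 1 0 0 0 0 1 1 1 */
    return 0;
}
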
> Signed-off-by: Gregory Price <gregory.price@...verge.com>
Thank you, Gregory, for the collaboration.
Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@...ron.com>
> ---
> include/linux/mempolicy.h | 4 ++
> mm/mempolicy.c | 138 +++++++++++++++++++++++++++++---------
> 2 files changed, 112 insertions(+), 30 deletions(-)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index d232de7cdc56..240468b669fd 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -48,6 +48,10 @@ struct mempolicy {
> nodemask_t nodes; /* interleave/bind/perfer */
> int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
>
> + /* weighted interleave settings */
> + unsigned char cur_weight;
> + unsigned char il_weights[MAX_NUMNODES];
> +
> union {
> nodemask_t cpuset_mems_allowed; /* relative to these nodes */
> nodemask_t user_nodemask; /* nodemask passed by user */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 29ebf1e7898c..d62e942a13bd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -102,6 +102,7 @@
> #include <linux/mmu_notifier.h>
> #include <linux/printk.h>
> #include <linux/swapops.h>
> +#include <linux/memory-tiers.h>
>
> #include <asm/tlbflush.h>
> #include <asm/tlb.h>
> @@ -300,6 +301,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
> policy->mode = mode;
> policy->flags = flags;
> policy->home_node = NUMA_NO_NODE;
> + policy->cur_weight = 0;
>
> return policy;
> }
> @@ -334,6 +336,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
> tmp = *nodes;
>
> pol->nodes = tmp;
> + pol->cur_weight = 0;
> }
>
> static void mpol_rebind_preferred(struct mempolicy *pol,
> @@ -881,8 +884,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>
> old = current->mempolicy;
> current->mempolicy = new;
> - if (new && new->mode == MPOL_INTERLEAVE)
> + if (new && new->mode == MPOL_INTERLEAVE) {
> current->il_prev = MAX_NUMNODES-1;
> + new->cur_weight = 0;
> + }
> +
> task_unlock(current);
> mpol_put(old);
> ret = 0;
> @@ -1903,12 +1909,21 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
> /* Do dynamic interleaving for a process */
> static unsigned interleave_nodes(struct mempolicy *policy)
> {
> - unsigned next;
> + unsigned int next;
> + unsigned char next_weight;
> struct task_struct *me = current;
>
> next = next_node_in(me->il_prev, policy->nodes);
> - if (next < MAX_NUMNODES)
> + if (!policy->cur_weight) {
> + /* If the node is set, at least 1 allocation is required */
> + next_weight = node_get_il_weight(next, numa_node_id());
> + policy->cur_weight = next_weight ? next_weight : 1;
> + }
> +
> + policy->cur_weight--;
> + if (next < MAX_NUMNODES && !policy->cur_weight)
> me->il_prev = next;
> +
> return next;
> }
>
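
For anyone reading along, the countdown above reduces to roughly the
following userspace sketch. get_weight() is only a stand-in for
node_get_il_weight(), and the weights are assumed from the changelog
example:

/*
 * Userspace sketch of the cur_weight countdown in interleave_nodes():
 * cur_weight holds the allocations still owed to the upcoming node,
 * and il_prev only advances once that weight is exhausted.
 */
#include <stdio.h>

static const unsigned char weights[] = { 4, 3 };   /* nodes 0 and 1 */
static int il_prev = 1;                            /* "before" node 0 */
static unsigned char cur_weight;

static unsigned char get_weight(int node)
{
    return weights[node];            /* stand-in for node_get_il_weight() */
}

static int next_interleave_node(int nnodes)
{
    int next = (il_prev + 1) % nnodes;      /* next_node_in() equivalent */

    if (!cur_weight) {
        unsigned char w = get_weight(next);
        cur_weight = w ? w : 1;             /* at least one allocation */
    }
    if (!--cur_weight)
        il_prev = next;                     /* weight spent, move on */
    return next;
}

int main(void)
{
    for (int i = 0; i < 14; i++)
        printf("%d ", next_interleave_node(2));
    printf("\n");                           /* 0 0 0 0 1 1 1 ... */
    return 0;
}
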
> @@ -1967,25 +1982,37 @@ unsigned int mempolicy_slab_node(void)
> static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
> {
> nodemask_t nodemask = pol->nodes;
> - unsigned int target, nnodes;
> - int i;
> + unsigned int target, nnodes, il_weight;
> + unsigned char weight;
> int nid;
> + int cur_node = numa_node_id();
> +
> /*
> * The barrier will stabilize the nodemask in a register or on
> * the stack so that it will stop changing under the code.
> *
> * Between first_node() and next_node(), pol->nodes could be changed
> * by other threads. So we put pol->nodes in a local stack.
> + *
> + * Additionally, place the cur_node on the stack in case of a migration
> */
> barrier();
>
> nnodes = nodes_weight(nodemask);
> if (!nnodes)
> - return numa_node_id();
> - target = (unsigned int)n % nnodes;
> + return cur_node;
> +
> + il_weight = nodes_get_il_weights(cur_node, &nodemask, pol->il_weights);
> + target = (unsigned int)n % il_weight;
> nid = first_node(nodemask);
> - for (i = 0; i < target; i++)
> - nid = next_node(nid, nodemask);
> + while (target) {
> + weight = pol->il_weights[nid];
> + if (target < weight)
> + break;
> + target -= weight;
> + nid = next_node_in(nid, nodemask);
> + }
> +
> return nid;
> }
>
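
The weighted offset walk above can be pictured with a small userspace
sketch. The weights are assumed, standing in for pol->il_weights[], and
the modulo plus walk mirrors the loop above:

/*
 * Userspace sketch of the lookup in offset_il_node(): the interleave
 * index n is reduced modulo the total weight and then walked across the
 * per-node weights to pick the target node.
 */
#include <stdio.h>

static int offset_to_node(unsigned long n)
{
    const unsigned char weights[] = { 4, 3 };   /* nodes 0 and 1 */
    const int nnodes = 2;
    unsigned int target = n % (4 + 3);          /* n % total weight */
    int nid = 0;

    while (target >= weights[nid]) {
        target -= weights[nid];
        nid = (nid + 1) % nnodes;               /* next_node_in() equivalent */
    }
    return nid;
}

int main(void)
{
    for (unsigned long n = 0; n < 14; n++)
        printf("%d ", offset_to_node(n));
    printf("\n");   /* 0 0 0 0 1 1 1 0 0 0 0 1 1 1 */
    return 0;
}
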
> @@ -2319,32 +2346,83 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
> struct mempolicy *pol, unsigned long nr_pages,
> struct page **page_array)
> {
> - int nodes;
> - unsigned long nr_pages_per_node;
> - int delta;
> - int i;
> - unsigned long nr_allocated;
> + struct task_struct *me = current;
> unsigned long total_allocated = 0;
> + unsigned long nr_allocated;
> + unsigned long rounds;
> + unsigned long node_pages, delta;
> + unsigned char weight;
> + unsigned long il_weight;
> + unsigned long req_pages = nr_pages;
> + int nnodes, node, prev_node;
> + int cur_node = numa_node_id();
> + int i;
>
> - nodes = nodes_weight(pol->nodes);
> - nr_pages_per_node = nr_pages / nodes;
> - delta = nr_pages - nodes * nr_pages_per_node;
> -
> - for (i = 0; i < nodes; i++) {
> - if (delta) {
> - nr_allocated = __alloc_pages_bulk(gfp,
> - interleave_nodes(pol), NULL,
> - nr_pages_per_node + 1, NULL,
> - page_array);
> - delta--;
> - } else {
> - nr_allocated = __alloc_pages_bulk(gfp,
> - interleave_nodes(pol), NULL,
> - nr_pages_per_node, NULL, page_array);
> + prev_node = me->il_prev;
> + nnodes = nodes_weight(pol->nodes);
> + /* Continue allocating from most recent node */
> + if (pol->cur_weight) {
> + node = next_node_in(prev_node, pol->nodes);
> + node_pages = pol->cur_weight;
> + if (node_pages > nr_pages)
> + node_pages = nr_pages;
> + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> + NULL, page_array);
> + page_array += nr_allocated;
> + total_allocated += nr_allocated;
> + /* if that's all the pages, no need to interleave */
> + if (req_pages <= pol->cur_weight) {
> + pol->cur_weight -= req_pages;
> + return total_allocated;
> }
> -
> + /* Otherwise we adjust req_pages down, and continue from there */
> + req_pages -= pol->cur_weight;
> + pol->cur_weight = 0;
> + prev_node = node;
> + }
> +
> + il_weight = nodes_get_il_weights(cur_node, &pol->nodes,
> + pol->il_weights);
> + rounds = req_pages / il_weight;
> + delta = req_pages % il_weight;
> + for (i = 0; i < nnodes; i++) {
> + node = next_node_in(prev_node, pol->nodes);
> + weight = pol->il_weights[node];
> + node_pages = weight * rounds;
> + if (delta > weight) {
> + node_pages += weight;
> + delta -= weight;
> + } else if (delta) {
> + node_pages += delta;
> + delta = 0;
> + }
> + /* The number of requested pages may not hit every node */
> + if (!node_pages)
> + break;
> + /* If an over-allocation would occur, floor it */
> + if (node_pages + total_allocated > nr_pages) {
> + node_pages = nr_pages - total_allocated;
> + delta = 0;
> + }
> + nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
> + NULL, page_array);
> page_array += nr_allocated;
> total_allocated += nr_allocated;
> + prev_node = node;
> + }
> +
> + /*
> + * Finally, we need to update me->il_prev and pol->cur_weight
> + * If the last node allocated on has un-used weight, apply
> + * the remainder as the cur_weight, otherwise proceed to next node
> + */
> + if (node_pages) {
> + me->il_prev = prev_node;
> + node_pages %= weight;
> + pol->cur_weight = weight - node_pages;
> + } else {
> + me->il_prev = node;
> + pol->cur_weight = 0;
> }
>
> return total_allocated;
> --
> 2.39.1
>
>
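
For completeness, the per-node split the bulk path computes (whole rounds
plus a remainder) can be sketched in userspace like this, with an assumed
10-page request and the same example weights:

/*
 * Userspace sketch of the bulk split: each node gets weight * rounds
 * pages plus its share of the remainder, walked in node order.
 */
#include <stdio.h>

int main(void)
{
    const unsigned char weights[] = { 4, 3 };       /* nodes 0 and 1 */
    const int nnodes = 2;
    unsigned long req_pages = 10;                   /* assumed request */
    unsigned long il_weight = 4 + 3;
    unsigned long rounds = req_pages / il_weight;   /* 1 */
    unsigned long delta = req_pages % il_weight;    /* 3 */

    for (int node = 0; node < nnodes; node++) {
        unsigned long node_pages = weights[node] * rounds;

        if (delta > weights[node]) {
            node_pages += weights[node];
            delta -= weights[node];
        } else if (delta) {
            node_pages += delta;
            delta = 0;
        }
        printf("node %d: %lu pages\n", node, node_pages);
    }
    /* node 0: 7 pages, node 1: 3 pages */
    return 0;
}
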